diff --git a/.gitignore b/.gitignore
index e1fa12ea6ad..bdcb067fc26 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,15 +1,17 @@
 .DS_Store
 .ipynb_checkpoints
 node_modules
+/.bazelrc
+/.tf_configure.bazelrc
 /bazel-*
-/third_party/py/numpy/numpy_include
-/tools/bazel.rc
+/bazel_pip
+/third_party/eigen3/mkl_include
+/third_party/mkl/*
 /tools/python_bin_path.sh
 /tools/git/gen
-/util/python/python_include
-/util/python/python_lib
 /pip_test
 /_python_build
 *.pyc
 __pycache__
 *.swp
+.vscode/
diff --git a/.mention-bot b/.mention-bot
deleted file mode 100644
index 9e4858977f5..00000000000
--- a/.mention-bot
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-  "maxReviewers": 2,
-  "numFilesToCheck": 10,
-  "userBlacklist": ["tensorflower-gardener"],
-  "requiredOrgs": ["tensorflow"],
-  "skipAlreadyAssignedPR": true,
-  "skipAlreadyMentionedPR": true,
-  "skipTitle": "Branch",
-  "delayed": true,
-  "delayedUntil": "10m"
-}
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 36f2f9808e6..43abdaafbf4 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -21,9 +21,151 @@ If you have improvements to TensorFlow, send us your pull requests! For those
 just getting started, Github has a
 [howto](https://help.github.com/articles/using-pull-requests/).
 
 If you want to contribute but you're not sure where to start, take a look at the
-[issues with the "contributions welcome" label](https://github.com/tensorflow/tensorflow/labels/contributions%20welcome).
+[issues with the "contributions welcome" label](https://github.com/tensorflow/tensorflow/labels/stat%3Acontributions%20welcome).
 These are issues that we believe are particularly well suited for outside
 contributions, often because we probably won't get to them right now. If you
 decide to start on an issue, leave a comment so that other people know that
 you're working on it. If you want to help out, but not alone, use the issue
 comment thread to coordinate.
+
+### Contribution guidelines and standards
+
+Before sending your pull request for
+[review](https://github.com/tensorflow/tensorflow/pulls),
+make sure your changes are consistent with the guidelines and follow the
+TensorFlow coding style.
+
+#### General guidelines and philosophy for contribution
+
+* Include unit tests when you contribute new features, as they help to
+  a) prove that your code works correctly, and b) guard against future breaking
+  changes to lower the maintenance cost (a minimal example appears below, after
+  this list).
+* Bug fixes also generally require unit tests, because the presence of bugs
+  usually indicates insufficient test coverage.
+* Keep API compatibility in mind when you change code in core TensorFlow,
+  e.g., code in [tensorflow/core](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core) and [tensorflow/python](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/python).
+  TensorFlow has reached version 1 and hence cannot make
+  non-backward-compatible API changes without a major release. Reviewers of your
+  pull request will comment on any API compatibility issues.
+* When you contribute a new feature to TensorFlow, the maintenance burden is (by
+  default) transferred to the TensorFlow team. This means that the benefit of
+  the contribution must be compared against the cost of maintaining the
+  feature.
+* Full new features (e.g., a new op implementing a cutting-edge algorithm)
+  typically will live in
+  [tensorflow/contrib](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib)
+  to get some airtime before a decision is made regarding whether they are to
+  be migrated to the core.
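+
+One possible shape for such a unit test is sketched below, using the
+`tf.test.TestCase` helper; the `SquareTest` name and the op under test are
+illustrative placeholders rather than a project convention:
+
+```python
+import tensorflow as tf
+
+
+class SquareTest(tf.test.TestCase):
+  """Checks a single op against a couple of hand-computed values."""
+
+  def testSquare(self):
+    with self.test_session():
+      x = tf.square([2, 3])  # op under test
+      self.assertAllEqual(x.eval(), [4, 9])
+
+
+if __name__ == "__main__":
+  tf.test.main()
+```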
+
+#### License
+
+Include a license at the top of new files.
+
+* [C/C++ license example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/op.cc#L1)
+* [Python license example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/nn.py#L1)
+* [Java license example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/java/src/main/java/org/tensorflow/Graph.java#L1)
+* [Go license example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/go/operation.go#L1)
+* [Bash license example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/ci_build/ci_sanity.sh#L2)
+* [HTML license example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tensorboard/dist/index.html#L2)
+* [JavaScript/TypeScript license example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tensorboard/components/tf_backend/backend.ts#L1)
+
+Bazel BUILD files also need to include a license section, e.g.,
+[BUILD example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/BUILD#L61).
+
+#### C++ coding style
+
+Changes to TensorFlow C++ code should conform to the
+[Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html).
+
+Use `clang-tidy` to check your C/C++ changes. To install `clang-tidy` on
+ubuntu:16.04, do:
+
+```bash
+apt-get install -y clang-tidy
+```
+
+You can check the formatting of a C/C++ file with `clang-format`:
+
+```bash
+clang-format <my_cc_file> --style=google > /tmp/my_cc_file.cc
+diff <my_cc_file> /tmp/my_cc_file.cc
+```
+
+#### Python coding style
+
+Changes to TensorFlow Python code should conform to the
+[Google Python Style Guide](https://google.github.io/styleguide/pyguide.html).
+
+Use `pylint` to check your Python changes. To install `pylint` and
+retrieve TensorFlow's custom style definition:
+
+```bash
+pip install pylint
+wget -O /tmp/pylintrc https://raw.githubusercontent.com/tensorflow/tensorflow/master/tensorflow/tools/ci_build/pylintrc
+```
+
+To check a file with `pylint`:
+
+```bash
+pylint --rcfile=/tmp/pylintrc myfile.py
+```
+
+#### Coding style for other languages
+
+* [Google Java Style Guide](https://google.github.io/styleguide/javaguide.html)
+* [Google JavaScript Style Guide](https://google.github.io/styleguide/jsguide.html)
+* [Google Shell Style Guide](https://google.github.io/styleguide/shell.xml)
+
+#### Running sanity check
+
+If you have Docker installed on your system, you can perform a sanity check on
+your changes by running the command:
+
+```bash
+tensorflow/tools/ci_build/ci_build.sh CPU tensorflow/tools/ci_build/ci_sanity.sh
+```
+
+This will catch most license, Python coding style, and BUILD file issues that
+may exist in your changes.
+
+#### Running unit tests
+
+There are two ways to run TensorFlow unit tests.
+
+1. Using tools and libraries installed directly on your system.
+
+   Refer to the
+   [CPU-only developer Dockerfile](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/docker/Dockerfile.devel) and
+   [GPU developer Dockerfile](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/docker/Dockerfile.devel-gpu)
+   for the required packages. Alternatively, use those
+   [Docker images](https://hub.docker.com/r/tensorflow/tensorflow/tags/), e.g.,
+   `tensorflow/tensorflow:nightly-devel` and `tensorflow/tensorflow:nightly-devel-gpu`,
+   for development to avoid installing the packages directly on your system.
+
+   Once you have the packages installed, you can run a specific unit test
+   with Bazel as follows.
+
+   If the tests are to be run on GPU, add the CUDA paths to `LD_LIBRARY_PATH`
+   and add the `cuda` config option:
+
+   ```bash
+   export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
+
+   export flags="--config=opt --config=cuda -k"
+   ```
+
+   For example, to run all tests under tensorflow/python, do:
+
+   ```bash
+   bazel test ${flags} //tensorflow/python/...
+   ```
+
+2. Using [Docker](https://www.docker.com) and TensorFlow's CI scripts.
+
+   ```bash
+   # Install Docker first, then this will build and run CPU tests
+   tensorflow/tools/ci_build/ci_build.sh CPU bazel test //tensorflow/...
+   ```
+
+   See
+   [TensorFlow Builds](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/tools/ci_build) for details.
+
diff --git a/ISSUE_TEMPLATE.md b/ISSUE_TEMPLATE.md
index af76188c2f4..5b37028c509 100644
--- a/ISSUE_TEMPLATE.md
+++ b/ISSUE_TEMPLATE.md
@@ -1,36 +1,37 @@
-NOTE: Only file GitHub issues for bugs and feature requests. All other topics will be closed.
+Please go to Stack Overflow for help and support:
-For general support from the community, see [StackOverflow](https://stackoverflow.com/questions/tagged/tensorflow).
-To make bugs and feature requests more easy to find and organize, we close issues that are deemed
-out of scope for GitHub Issues and point people to StackOverflow.
+http://stackoverflow.com/questions/tagged/tensorflow
-For bugs or installation issues, please provide the following information.
-The more information you provide, the more easily we will be able to offer
-help and advice.
+If you open a GitHub issue, here is our policy:
-### What related GitHub issues or StackOverflow threads have you found by searching the web for your problem?
+1. It must be a bug or a feature request.
+2. The form below must be filled out.
+3. It shouldn't be a TensorBoard issue. Those go [here](https://github.com/tensorflow/tensorboard/issues).
-### Environment info
-Operating System:
+**Here's why we have that policy**: TensorFlow developers respond to issues. We want to focus on work that benefits the whole community, e.g., fixing bugs and adding features. Support only helps individuals. GitHub also notifies thousands of people when issues are filed. We want them to see you communicating an interesting problem, rather than being redirected to Stack Overflow.
-Installed version of CUDA and cuDNN:
-(please attach the output of `ls -l /path/to/cuda/lib/libcud*`):
+------------------------
-If installed from binary pip package, provide:
+### System information
+- **Have I written custom code (as opposed to using a stock example script provided in TensorFlow)**:
+- **OS Platform and Distribution (e.g., Linux Ubuntu 16.04)**:
+- **TensorFlow installed from (source or binary)**:
+- **TensorFlow version (use command below)**:
+- **Bazel version (if compiling from source)**:
+- **CUDA/cuDNN version**:
+- **GPU model and memory**:
+- **Exact command to reproduce**:
-1. A link to the pip package you installed:
-2. The output from `python -c "import tensorflow; print(tensorflow.__version__)"`.
+You can collect some of this information using our environment capture script:
-If installed from source, provide
+https://github.com/tensorflow/tensorflow/tree/master/tools/tf_env_collect.sh
-1. The commit hash (`git rev-parse HEAD`)
-2.
The output of `bazel version` +You can obtain the TensorFlow version with -### If possible, provide a minimal reproducible example (We usually don't have time to read hundreds of lines of your code) +python -c "import tensorflow as tf; print(tf.GIT_VERSION, tf.VERSION)" +### Describe the problem +Describe the problem clearly here. Be sure to convey here why it's a bug in TensorFlow or a feature request. -### What other attempted solutions have you tried? - - -### Logs or other output that would be helpful -(If logs are large, please upload as attachment or provide link). +### Source code / logs +Include any logs or source code that would be helpful to diagnose the problem. If including tracebacks, please include the full traceback. Large logs and files should be attached. Try to provide a reproducible test case that is the bare minimum necessary to generate the problem. diff --git a/README.md b/README.md index 40e8a4b190c..cbc94c1ab2b 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@


+ ----------------- | **`Linux CPU`** | **`Linux GPU`** | **`Mac OS CPU`** | **`Windows CPU`** | **`Android`** | @@ -25,19 +26,20 @@ guidelines](CONTRIBUTING.md).** **We use [GitHub issues](https://github.com/tensorflow/tensorflow/issues) for tracking requests and bugs, but please see -[Community](tensorflow/g3doc/resources/index.md#community) for general questions +[Community](https://www.tensorflow.org/community/) for general questions and discussion.** ## Installation -*See [Download and Setup](tensorflow/g3doc/get_started/os_setup.md) for instructions on how to install our release binaries or how to build from source.* +*See [Installing TensorFlow](https://www.tensorflow.org/install/) for instructions on how to install our release binaries or how to build from source.* People who are a little more adventurous can also try our nightly binaries: -* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.0rc1-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.0rc1-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.0rc1-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/)) -* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.0rc1-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.0rc1-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.0rc1-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/)) -* Mac CPU-only: [Python 
2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.0rc1-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.0rc1-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/)) -* Mac GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.0rc1-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.0rc1-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/)) -* [Android](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-android/TF_BUILD_CONTAINER_TYPE=ANDROID,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=NO_PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=android-slave/lastSuccessfulBuild/artifact/bazel-out/local_linux/bin/tensorflow/examples/android/tensorflow_demo.apk) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-android/TF_BUILD_CONTAINER_TYPE=ANDROID,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=NO_PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=android-slave/)) +* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/)) +* Linux GPU: [Python 
2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/)) +* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/)) +* Mac GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/)) +* Windows CPU-only: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.2.0-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.2.0-cp36-cp36m-win_amd64.whl) ([build 
history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=36/)) +* Windows GPU: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.2.0-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.2.0-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=36/)) * Android: [demo APK](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/tensorflow_demo.apk), [native libs](http://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/native/) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-android/)) @@ -50,7 +52,7 @@ $ python >>> hello = tf.constant('Hello, TensorFlow!') >>> sess = tf.Session() >>> sess.run(hello) -Hello, TensorFlow! +'Hello, TensorFlow!' >>> a = tf.constant(10) >>> b = tf.constant(32) >>> sess.run(a+b) @@ -58,11 +60,11 @@ Hello, TensorFlow! >>> ``` -##For more information +## For more information -* [TensorFlow website](http://tensorflow.org) +* [TensorFlow website](https://tensorflow.org) * [TensorFlow whitepaper](http://download.tensorflow.org/paper/whitepaper2015.pdf) * [TensorFlow Model Zoo](https://github.com/tensorflow/models) * [TensorFlow MOOC on Udacity](https://www.udacity.com/course/deep-learning--ud730) -The TensorFlow community has created amazing things with TensorFlow, please see the [resources section of tensorflow.org](https://www.tensorflow.org/versions/master/resources#community) for an incomplete list. +The TensorFlow community has created amazing things with TensorFlow, please see the [resources section of tensorflow.org](https://www.tensorflow.org/about/#community) for an incomplete list. diff --git a/RELEASE.md b/RELEASE.md index ab3ecbd7746..9875838d7e1 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,3 +1,312 @@ +# Release 1.2.0 + +## Major Features and Improvements +* Python 3.6 support on Windows. +* Added `tf.layers.conv3d_transpose` layer for spatio temporal deconvolution. +* Added `tf.Session.make_callable()`, which provides a lower overhead means of running a similar step multiple times. +* Added libverbs-based RDMA support to contrib (courtesy @junshi15 from Yahoo). +* Bring `tf.feature_column.*` into the API. Non-deprecated functionality from `tf.contrib.layers.*` is moved to `tf.feature_column.*` with cosmetic changes. +* `RNNCell` objects now subclass `tf.layers.Layer`. The strictness described + in the TensorFlow 1.1 release is gone: The first time an RNNCell is used, + it caches its scope. All future uses of the RNNCell will reuse variables from + that same scope. This is a breaking change from the behavior of RNNCells + in TensorFlow versions <= 1.0.1. TensorFlow 1.1 had checks in place to + ensure old code works correctly with the new semantics; this version + allows more flexible uses of RNNCell but can lead to subtle errors if + using code meant for TensorFlow <= 1.0.1. For example, writing: + `MultiRNNCell([lstm] * 5)` will now build a 5-layer LSTM stack where each + layer shares the **same** parameters. To get 5 layers each with their own + parameters, write: `MultiRNNCell([LSTMCell(...) for _ in range(5)])`. 
+  If at all unsure, first test your code with TF 1.1; ensure it raises no
+  errors, and then upgrade to TF 1.2. A short sketch of both patterns follows
+  this list.
+* RNNCells' variable names have been renamed for consistency with Keras layers.
+  Specifically, the previous variable names "weights" and "biases" have
+  been changed to "kernel" and "bias", respectively.
+  This may cause backward incompatibility with regard to your old
+  checkpoints containing such RNN cells, in which case you can use the
+  [checkpoint_convert script](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/rnn/python/tools/checkpoint_convert.py)
+  to convert the variable names in your old checkpoints.
+* Many of the RNN functions and classes that were in the `tf.nn` namespace
+  before the 1.0 release and which were moved to `tf.contrib.rnn` have now
+  been moved back to the core namespace. This includes
+  `RNNCell`, `LSTMCell`, `GRUCell`, and a number of other cells. These
+  now reside in `tf.nn.rnn_cell` (with aliases in `tf.contrib.rnn` for backwards
+  compatibility). The original `tf.nn.rnn` function is now `tf.nn.static_rnn`,
+  and the bidirectional static and state saving static rnn functions are also
+  now back in the `tf.nn` namespace.
+
+  Notable exceptions are the `EmbeddingWrapper`, `InputProjectionWrapper` and
+  `OutputProjectionWrapper`, which will slowly be moved to deprecation
+  in `tf.contrib.rnn`. These are inefficient wrappers that should often
+  be replaced by calling `embedding_lookup` or `layers.dense` as pre- or post-
+  processing of the rnn. For RNN decoding, this functionality has been replaced
+  with an alternative API in `tf.contrib.seq2seq`.
+* Intel MKL Integration (https://software.intel.com/en-us/articles/tensorflow-optimizations-on-modern-intel-architecture). Intel developed a number of
+  optimized deep learning primitives. In addition to matrix multiplication and
+  convolution, these building blocks include:
+  * Direct batched convolution
+  * Pooling: maximum, minimum, average
+  * Normalization: LRN, batch normalization
+  * Activation: rectified linear unit (ReLU)
+  * Data manipulation: multi-dimensional transposition (conversion), split,
+    concat, sum, and scale.
+* TensorForest Estimator now supports SavedModel export for serving.
+* Support client-provided ClusterSpec's and propagate them to all workers to enable the creation of dynamic TensorFlow clusters.
+* TensorFlow C library now available for Windows.
+* We released a new open-source version of TensorBoard.
+* [`SavedModel CLI`](https://www.tensorflow.org/versions/master/programmers_guide/saved_model_cli) tool available to inspect and execute a MetaGraph in a SavedModel.
+* Android releases of TensorFlow are now pushed to jcenter for easier
+  integration into apps. See
+  https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/android/README.md
+  for more details.
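+
+As a quick illustration of the two `MultiRNNCell` patterns above, here is a
+minimal sketch; the cell size is a placeholder chosen for illustration, and
+the snippet is not part of the tested release artifacts:
+
+```python
+import tensorflow as tf
+
+num_units = 128  # placeholder size, for illustration only
+
+# One LSTMCell instance repeated 5 times: under TF 1.2 semantics all five
+# layers reuse the SAME kernel and bias variables.
+cell = tf.nn.rnn_cell.LSTMCell(num_units)
+shared_stack = tf.nn.rnn_cell.MultiRNNCell([cell] * 5)
+
+# Five distinct LSTMCell instances: each layer gets its own parameters.
+stack = tf.nn.rnn_cell.MultiRNNCell(
+    [tf.nn.rnn_cell.LSTMCell(num_units) for _ in range(5)])
+```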
+
+## Deprecations
+
+* TensorFlow 1.2 may be the last time we build with cuDNN 5.1. Starting with
+  TensorFlow 1.3, we will try to build all our prebuilt binaries with cuDNN 6.0.
+  While we will try to keep our source code compatible with cuDNN 5.1, it will
+  be best effort.
+
+## Breaking Changes to the API
+* `org.tensorflow.contrib.android.TensorFlowInferenceInterface` now throws exceptions where possible and has simplified method signatures.
+
+## Changes to contrib APIs
+* Added `tf.contrib.util.create_example`.
+* Added bilinear interpolation to `tf.contrib.image`.
+* Added `tf.contrib.stateless` for random ops with custom seed control.
+* Added `MultivariateNormalFullCovariance` to contrib/distributions/.
+* tensorflow/contrib/rnn undergoes RNN cell variable renaming for
+  consistency with Keras layers. Specifically, the previous variable names
+  "weights" and "biases" are changed to "kernel" and "bias", respectively.
+  This may cause backward incompatibility with regard to your old
+  checkpoints containing such RNN cells, in which case you can use the
+  [checkpoint_convert script](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/rnn/python/tools/checkpoint_convert.py)
+  to convert the variable names in your old checkpoints.
+* Added `tf.contrib.kernel_methods` module with Ops and estimators for primal
+  (explicit) kernel methods in TensorFlow.
+
+## Bug Fixes and Other Changes
+* In Python, `Operation.get_attr` on type attributes returns the Python DType
+  version of the type to match expected get_attr documentation rather than the
+  protobuf enum.
+* Changed MIN_SDK version to 8.0 when building iOS libraries.
+* Fixed LIBXSMM integration.
+* Make decode_jpeg/decode_png/decode_gif handle all formats, since users frequently try to decode an image as the wrong type.
+* Improve implicit broadcasting lowering.
+* Improved stability of GCS/BigQuery clients by faster retrying of stale transmissions.
+* Remove OpKernelConstruction::op_def() as part of minimizing proto dependencies.
+* VectorLaplaceDiag distribution added.
+* Android demo no longer requires libtensorflow_demo.so to run (libtensorflow_inference.so still required).
+* Added `categorical_column_with_vocabulary_file`.
+* Introduce ops for batching/unbatching tensors across Session::Run() calls.
+* Add tf.log_sigmoid(x) = tf.log(tf.sigmoid(x)) = -tf.nn.softplus(-x).
+* Changed hooks lists to immutable tuples, and now allow any iterable for the associated arguments.
+* Introduce TFDecorator.
+* Added an Mfcc op for speech feature generation.
+* Improved DirectSession::Run() overhead and error checking. Feeding a value of the wrong type will now synchronously raise an INVALID_ARGUMENT error instead of asynchronously raising an INTERNAL error. Code that depends on the (undefined) behavior when feeding a tensor of the wrong type may need to be updated.
+* Added unreduced NONE, and reduced MEAN options for losses. Removed "WEIGHTED_" prefix from other Reduction constants.
+* assertAllClose now handles dicts.
+* Added Gmock matcher for HloInstructions.
+* Add var name to errors on variable restore.
+* Added an AudioSpectrogram op for audio feature generation.
+* Added `reduction` arg to losses.
+* `tf.placeholder` can represent scalar and partially known shapes.
+* Remove estimator_spec(mode) argument.
+* TensorBoard disables all runs by default if there are more than 40 runs.
+* Removed old doc generator code.
+* GCS file system integration now supports domain buckets, e.g. gs://bucket.domain.com/path.
+* Add `tf.summary.text` for outputting text to TensorBoard.
+* The "run" command of tfdbg's command-line interface now supports filtering of tensors by node name, op type, and tensor dtype.
+* `tf.string_to_number` now supports int64 and float64 outputs.
+
+## Thanks to our Contributors
+
+This release contains contributions from many people at Google, as well as:
+
+4F2E4A2E, Aaron Schumacher, Abhi Agg, admcrae, Adriano Carmezim, Adrià Arrufat,
+agramesh1, Akimitsu Seo, Alan Mosca, Alex Egg, Alex Rothberg, Alexander Heinecke,
+Alexander Matyasko, Alexandr Baranezky, Alexandre Caulier, Ali Siddiqui, Anand Venkat,
+Andrew Hundt, Androbin, Anmol Sharma, Arie, Arno Leist, Arron Cao, Aurélien Geron, Bairen Yi,
+Beomsu Kim, Carl Thomé, cfperez, Changming Sun, Corey Wharton, critiqjo, Dalei Li, Daniel
+Rasmussen, Daniel Trebbien, Darío Hereñú, David Eng, David Norman, David Y.
Zhang, Davy Song, ddurham2,
+Deepak Subburam, Dmytro Kyrychuk, Dominic Rossi, Dominik Schlösser, Dustin Tran,
+Eduardo Pinho, Egil Martinsson, Elliot Saba, Eric Bigelow, Erik Smistad, Evan Klitzke,
+Fabrizio Milo, Falcon Dai, Fei Gao, FloopCZ, Fung Lam, Gautam, GBLin5566, Greg Peatfield,
+Gu Wang, Guenther Schmuelling, Hans Pabst, Harun Gunaydin, Huaizheng, Ido Shamay, Ikaro
+Silva, Ilya Edrenkin, Immexxx, James Mishra, Jamie Cooke, Jay Young, Jayaram Bobba,
+Jianfei Wang, jinghua2, Joey Meyer, John Maidens, Jonghoon Jin, Julian Villella,
+Jun Kim, Jun Shi, Junwei Pan, jyegerlehner, Karan Desai, Karel Van De Plassche,
+Kb Sriram, KhabarlakKonstantin, Koan-Sin Tan, krivard, Kwotsin, Leandro Gracia Gil,
+Li Chen, Liangliang He, Louie Helm, lspvic, Luiz Henrique Soares, László Csomor,
+Mark Wong, Mathew Wicks, Matthew Rahtz, Maxwell Paul Brickner, Michael Hofmann, Miguel
+Flores Ruiz De Eguino, MikeTam1021, Mortada Mehyar, Mycosynth, Namnamseo,
+Nate Harada, Neven Miculinic, Nghia Tran, Nick Lyu, Niranjan Hasabnis, Nishidha, Oleksii
+Kuchaiev, Oyesh Mann Singh, Panmari, Patrick, Paul Van Eck, Piyush Chaudhary, Quim Llimona,
+Raingo, Richard Davies, Ruben Vereecken, Sahit Chintalapudi, Sam Abrahams, Santiago Castro,
+Scott Sievert, Sean O'Keefe, Sebastian Schlecht, Shane, Shubhankar Deshpande, Spencer Schaber,
+Sunyeop Lee, t13m, td2014, Thomas H. P. Andersen, Toby Petty, Umang Mehta,
+Vadim Markovtsev, Valentin Iovene, Vincent Zhao, Vit Stepanovs, Vivek Rane, Vu Pham, wannabesrevenge,
+weipingpku, wuhaixutab, wydwww, Xiang Gao, Xiaolin Lin, xiaoyaozhuzi, Yaroslav Bulatov, Yi Liu,
+Yoshihiro Sugi, Yuan (Terry) Tang, Yuming Wang, Yuxin Wu, Zader Zheng, Zhaojun Zhang, zhengjiajin,
+ZhipengShen, Ziming Dong, zjj2wry
+
+We are also grateful to all who filed issues or helped resolve them, asked and
+answered questions, and were part of inspiring discussions.
+
+# Release 1.1.0
+
+## Major Features and Improvements
+* Added Java API support for Windows.
+* Added `tf.spectral` module. Moved existing FFT ops to `tf.spectral` while
+  keeping an alias in the old location (`tf.*`).
+* Added 1D, 2D and 3D Fourier transform ops for real signals to `tf.spectral`.
+* Added a `tf.bincount` function (a small usage sketch appears after the
+  Deprecations section below).
+* Added Keras 2 API to contrib.
+* Added a new lightweight queue-like object - `RecordInput`.
+* Added `tf.contrib.image.compose_transforms` function.
+* Bring `tf.estimator.*` into the API. Non-deprecated functionality from `tf.contrib.learn.Estimator` is moved to `tf.estimator.Estimator` with cosmetic changes.
+* Docker images: TF images on gcr.io and Docker Hub are upgraded to ubuntu:16.04.
+* Added the following features to TensorFlow Debugger (tfdbg):
+  * Ability to inspect Python source files against TF ops and tensors (command `print_source` / `ps`)
+  * New navigation bar in Curses-based UI
+  * NodeStepper (command `invoke_stepper`) now uses intermediate tensor dumps. It also uses `TensorHandles` as direct feeds during successive `cont` calls for improved performance and reduced memory consumption.
+* Initial release of installation guides for Java, C, and Go.
+* Added Text Dashboard to TensorBoard.
+
+## Deprecations
+
+* TensorFlow 1.1.0 will be the last time we release a binary with Mac GPU support. Going forward, we will stop testing on Mac GPU systems. We continue to welcome patches that maintain Mac GPU support, and we will try to keep the Mac GPU build working.
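+
+As a hedged aside on the `tf.bincount` addition noted above, a minimal usage
+sketch (the input values are made up for illustration):
+
+```python
+import tensorflow as tf
+
+# Counts occurrences of each non-negative integer in the input.
+counts = tf.bincount(tf.constant([1, 1, 2, 5]))
+
+with tf.Session() as sess:
+  print(sess.run(counts))  # -> [0 2 1 0 0 1]
+```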
+
+## Changes to contrib APIs
+* The behavior of RNNCells is now stricter due to the transition towards making RNNCells act more like Keras layers.
+  * If an RNNCell is used twice in two different variable scopes, an error is raised describing how to avoid this behavior.
+  * If an RNNCell is used in a variable scope with existing conflicting variables, an error is raised showing that the RNNCell must be constructed with argument `reuse=True`.
+* Deprecated contrib/distributions `pmf`, `pdf`, `log_pmf`, `log_pdf`.
+* Moved `bayesflow.special_math` to distributions.
+* `tf.contrib.tensor_forest.python.tensor_forest.RandomForestDeviceAssigner` removed.
+* Changed some MVN classes and parameters:
+  * `tf.contrib.distributions.MultivariateNormalFull` replaced by `tf.contrib.distributions.MultivariateNormalTriL`.
+  * `tf.contrib.distributions.MultivariateNormalCholesky` replaced by `tf.contrib.distributions.MultivariateNormalTriL`.
+  * `tf.contrib.distributions.MultivariateNormalDiagWithSoftplusStDev` replaced
+    by `tf.contrib.distributions.MultivariateNormalDiagWithSoftplusScale`.
+  * `tf.contrib.distributions.MultivariateNormalDiag` arguments changed from `mu`, `diag_stddev` to `loc`, `scale_diag`.
+  * `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT` removed.
+  * `tf.contrib.distributions.MultivariateNormalDiagPlusLowRank` added.
+
+## Bug Fixes and Other Changes
+* Java: Support for loading models exported using the SavedModel API (courtesy @EronWright).
+* Go: Added support for incremental graph execution.
+* Fix a bug in the WALS solver when single-threaded.
+* Added support for integer sparse feature values in `tf.contrib.layers.sparse_column_with_keys`.
+* Fixed `tf.set_random_seed(0)` to be deterministic for all ops.
+* Stability improvements for the GCS file system support.
+* Improved TensorForest performance.
+* Added support for multiple filename globs in `tf.matching_files`.
+* `LogMessage` now includes a timestamp at the beginning of each message.
+* Added MultiBox person detector example standalone binary.
+* Android demo: Makefile build functionality added to build.gradle to fully support building TensorFlow demo in Android on Windows.
+* Android demo: reads MultiBox priors from a txt file rather than a protobuf.
+* Added colocation constraints to `StagingArea`.
+* `sparse_matmul_op` reenabled for Android builds.
+* Restrict weights rank to be the same as the broadcast target, to avoid ambiguity on broadcast rules.
+* Upgraded libxsmm to 1.7.1 and applied other changes for performance and memory usage.
+* Fixed bfloat16 integration of LIBXSMM sparse mat-mul.
+* Improved performance and reduced memory usage by allowing ops to forward input buffers to output buffers and perform computations in-place.
+* Improved the performance of CPU assignment for strings.
+* Speed up matrix * vector multiplication and matrix * matrix with unknown shapes.
+* C API: Graph imports now support input remapping, control dependencies, and returning imported nodes (see `TF_GraphImportGraphDefWithReturnOutputs()`).
+* Multiple C++ API updates.
+* Multiple TensorBoard updates including:
+  * Users can now view image summaries at various sampled steps (instead of just the last step).
+  * Bugs involving switching runs as well as the image dashboard are fixed.
+  * Removed data download links from TensorBoard.
+  * TensorBoard uses a relative data directory, for easier embedding.
+  * TensorBoard automatically ignores outliers for domain calculation, and formats proportional values consistently.
+* Multiple tfdbg bug fixes:
+  * Fixed Windows compatibility issues.
+  * Command history now persists across runs.
+  * Bug fix in graph validation related to `tf.while_loops`.
+* Java Maven fixes for bugs with Windows installation.
+* Backport fixes and improvements from external Keras.
+* Keras config file handling fix.
+
+## Thanks to our Contributors
+
+This release contains contributions from many people at Google, as well as:
+
+A. Besir Kurtulmus, Adal Chiriliuc, @akash, Alec-Desouza, Alex Rothberg, Alex
+Sergeev, Alexander Heinecke, Allen Guo, Andreas Madsen, Ankesh Anand, Anton
+Loss, @Aravind, @Arie, Ashutosh Das, Aurélien Geron, Bairen Yi, @bakunyo, Ben
+Visser, Brady Zhou, Calpa Liu, Changming Sun, Chih Cheng Liang, Christopher
+Berner, Clark Zinzow, @Conchylicultor, Dan Ellis, Dan J, Dan Jarvis, Daniel
+Ylitalo, Darren Garvey, David Norman, David Truong, @DavidNorman, Dimitar
+Pavlov, Dmitry Persiyanov, @Eddie, @elirex, Erfan Noury, Eron Wright, Evgeny
+Mazovetskiy, Fabrizio (Misto) Milo, @fanlu, Fisher Coder, Florian Courtial,
+Franck Dernoncourt, Gagan Goel, Gao, Xiang, @Gautam, Gefu Tang, @guilherme,
+@guschmue, Hannah Provenza, Hans Pabst, @hartb, Hsiao Yi, Huazuo Gao, Igor
+Chorążewicz, Ivan Smirnov, Jakub Kolodziejczyk, Jason Gavris, Jason Morton, Jay
+Young, Jayaram Bobba, Jeremy Sawruk, Jiaming Liu, Jihun Choi, @jiqiu, Joan Thibault,
+John C F, Jojy George Varghese, Jon Malmaud, Julian Berman, Julian Niedermeier,
+Junpeng Lao, Kai Sasaki, @Kankroc, Karl Lessard, Kyle Bostelmann, @Lezcano, Li
+Yi, Luo Yun, @lurker, Mahmoud-Abuzaina, Mandeep Singh, Marek Kolodziej, Mark
+Szepieniec, Martial Hue, Medhat Omr, Memo Akten, Michael Gharbi, Michaël Defferrard,
+Milan Straka, @MircoT, @mlucool, Muammar Ibn Faisal, Nayana Thorat, @nghiattran,
+Nicholas Connor, Nikolaas Steenbergen, Niraj Patel, Niranjan Hasabnis, @Panmari,
+Pavel Bulanov, Philip Pries Henningsen, Philipp Jund, @polonez, Prayag Verma, Rahul
+Kavi, Raphael Gontijo Lopes, @rasbt, Raven Iqqe, Reid Pryzant, Richard Shin, Rizwan
+Asif, Russell Kaplan, Ryo Asakura, Rüdiger Busche, Saisai Shao, Sam Abrahams, @sanosay,
+Sean Papay, @seaotterman, @selay01, Shaurya Sharma, Sriram Narayanamoorthy, Stefano
+Probst, @taknevski, @tbonza, @teldridge11, Tim Anglade, Tomas Reimers, Tomer Gafner,
+Valentin Iovene, Vamsi Sripathi, Viktor Malyi, Vit Stepanovs, Vivek Rane, Vlad Firoiu,
+@wangg12, @will, Xiaoyu Tao, Yaroslav Bulatov, Yi Liu, Yuan (Terry) Tang, @Yufeng,
+Yuming Wang, Yuxin Wu, Zafar Takhirov, Ziming Dong
+
+We are also grateful to all who filed issues or helped resolve them, asked and
+answered questions, and were part of inspiring discussions.
+
+
+# Release 1.0.1
+
+## Bug Fixes and Other Changes
+* Change GraphConstructor to not increase the version when importing, but instead take the min of all versions.
+* Google Cloud Storage fixes.
+* Removed `tf.core` and `tf.python` modules from the API. These were never intended to be exposed. Please use the same objects through the top-level `tf` module instead.
+
 # Release 1.0.0
 
 ## Major Features and Improvements
@@ -87,8 +396,12 @@ To help you upgrade your existing TensorFlow Python code to match the API change
 * In the C++ API (in tensorflow/cc), Input, Output, etc. have moved
   from the tensorflow::ops namespace to tensorflow.
 * Change arg order for `{softmax,sparse_softmax,sigmoid}_cross_entropy_with_logits` to be (labels, predictions), and force use of named args.
+* tf.nn.rnn_cell.* and most functions in tf.nn.rnn.* (with the exception of dynamic_rnn and raw_rnn) are temporarily in tf.contrib.rnn. They will be moved back into core for TF 1.2.
+* `tf.nn.sampled_softmax_loss` and `tf.nn.nce_loss` have both changed their API such that you need to switch the `inputs, labels` to `labels, inputs` parameters.
+* The shape keyword argument of the `SparseTensor` constructor changes its name to `dense_shape` between TensorFlow 0.12 and TensorFlow 1.0.
 
 ## Bug Fixes and Other Changes
+* Numerous C++ API updates.
 * New op: `parallel_stack`.
 * Introducing common tf io compression options constants for
   RecordReader/RecordWriter.
@@ -127,6 +440,7 @@ To help you upgrade your existing TensorFlow Python code to match the API change
 * `tf.divide` now honors the name field.
 * Make metrics weight broadcasting more strict.
 * Add new queue-like `StagingArea` and new ops: `stage` and `unstage`.
+* Enable inplace update ops for strings on CPU. Speed up string concat.
 
 ## Thanks to our Contributors
@@ -193,7 +507,7 @@ answered questions, and were part of inspiring discussions.
   indexing now starts from 1 instead of 0, and `bus_id==0` is used where
   previously `BUS_ANY` was used.
 * `Env::FileExists` and `FileSystem::FileExists` now return a tensorflow::Status
-  intead of a bool. Any callers to this function can be converted to a bool
+  instead of a bool. Any callers to this function can be converted to a bool
   by adding .ok() to the call.
 * The C API type `TF_SessionWithGraph` has been renamed to `TF_Session`,
   indicating its preferred use in language bindings for TensorFlow.
@@ -212,7 +526,7 @@ answered questions, and were part of inspiring discussions.
 * `SparseTensor.shape` has been renamed to `SparseTensor.dense_shape`. Same for
   `SparseTensorValue.shape`.
 * `Env::FileExists` and `FileSystem::FileExists` now return a
-  `tensorflow::Status` intead of a bool. Any callers to this function can be
+  `tensorflow::Status` instead of a bool. Any callers to this function can be
   converted to a bool by adding `.ok()` to the call.
 * C API: Type `TF_SessionWithGraph` has been renamed to `TF_Session`,
   indicating its preferred use in language bindings for TensorFlow.
What was previously diff --git a/WORKSPACE b/WORKSPACE index 958a53c30ed..74ce13f4e88 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -2,11 +2,11 @@ workspace(name = "org_tensorflow") http_archive( name = "io_bazel_rules_closure", - sha256 = "60fc6977908f999b23ca65698c2bb70213403824a84f7904310b6000d78be9ce", - strip_prefix = "rules_closure-5ca1dab6df9ad02050f7ba4e816407f88690cf7d", + sha256 = "bc41b80486413aaa551860fc37471dbc0666e1dbb5236fb6177cb83b0c105846", + strip_prefix = "rules_closure-dec425a4ff3faf09a56c85d082e4eed05d8ce38f", urls = [ - "http://bazel-mirror.storage.googleapis.com/github.com/bazelbuild/rules_closure/archive/5ca1dab6df9ad02050f7ba4e816407f88690cf7d.tar.gz", # 2017-02-03 - "https://github.com/bazelbuild/rules_closure/archive/5ca1dab6df9ad02050f7ba4e816407f88690cf7d.tar.gz", + "http://mirror.bazel.build/github.com/bazelbuild/rules_closure/archive/dec425a4ff3faf09a56c85d082e4eed05d8ce38f.tar.gz", # 2017-06-02 + "https://github.com/bazelbuild/rules_closure/archive/dec425a4ff3faf09a56c85d082e4eed05d8ce38f.tar.gz", ], ) @@ -14,510 +14,56 @@ load("@io_bazel_rules_closure//closure:defs.bzl", "closure_repositories") closure_repositories() -load("//tensorflow:workspace.bzl", "check_version", "tf_workspace") - -# We must check the bazel version before trying to parse any other BUILD files, -# in case the parsing of those build files depends on the bazel version we -# require here. -check_version("0.4.2") +load("//tensorflow:workspace.bzl", "tf_workspace") # Uncomment and update the paths in these entries to build the Android demo. #android_sdk_repository( # name = "androidsdk", # api_level = 23, -# build_tools_version = "23.0.1", +# # Ensure that you have the build_tools_version below installed in the +# # SDK manager as it updates periodically. +# build_tools_version = "25.0.2", # # Replace with path to Android SDK on your system # path = "", #) # +# Android NDK r12b is recommended (higher may cause issues with Bazel) #android_ndk_repository( # name="androidndk", # path="", -# api_level=21) +# # This needs to be 14 or higher to compile TensorFlow. +# # Note that the NDK version is not the API level. +# api_level=14) # Please add all new TensorFlow dependencies in workspace.bzl. 
tf_workspace() new_http_archive( - name = "inception5h", - build_file = "models.BUILD", - url = "https://storage.googleapis.com/download.tensorflow.org/models/inception5h.zip", - sha256 = "d13569f6a98159de37e92e9c8ec4dae8f674fbf475f69fe6199b514f756d4364" + name = "inception5h", + build_file = "models.BUILD", + sha256 = "d13569f6a98159de37e92e9c8ec4dae8f674fbf475f69fe6199b514f756d4364", + urls = [ + "http://storage.googleapis.com/download.tensorflow.org/models/inception5h.zip", + "http://download.tensorflow.org/models/inception5h.zip", + ], ) new_http_archive( - name = "mobile_multibox", - build_file = "models.BUILD", - url = "https://storage.googleapis.com/download.tensorflow.org/models/mobile_multibox_v1a.zip", - sha256 = "859edcddf84dddb974c36c36cfc1f74555148e9c9213dedacf1d6b613ad52b96" + name = "mobile_multibox", + build_file = "models.BUILD", + sha256 = "859edcddf84dddb974c36c36cfc1f74555148e9c9213dedacf1d6b613ad52b96", + urls = [ + "http://storage.googleapis.com/download.tensorflow.org/models/mobile_multibox_v1a.zip", + "http://download.tensorflow.org/models/mobile_multibox_v1a.zip", + ], ) new_http_archive( - name = "stylize", - build_file = "models.BUILD", - url = "https://storage.googleapis.com/download.tensorflow.org/models/stylize_v1.zip", - sha256 = "3d374a730aef330424a356a8d4f04d8a54277c425e274ecb7d9c83aa912c6bfa" -) - -# TENSORBOARD_BOWER_AUTOGENERATED_BELOW_THIS_LINE_DO_NOT_EDIT - -new_http_archive( - name = "d3", - build_file = "bower.BUILD", - url = "https://github.com/mbostock-bower/d3-bower/archive/v3.5.15.tar.gz", - strip_prefix = "d3-bower-3.5.15", -) - -new_http_archive( - name = "dagre", - build_file = "bower.BUILD", - url = "https://github.com/cpettitt/dagre/archive/v0.7.4.tar.gz", - strip_prefix = "dagre-0.7.4", -) - -new_http_archive( - name = "es6_promise", - build_file = "bower.BUILD", - url = "https://github.com/components/es6-promise/archive/v2.1.0.tar.gz", - strip_prefix = "es6-promise-2.1.0", -) - -new_http_archive( - name = "font_roboto", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/font-roboto/archive/v1.0.1.tar.gz", - strip_prefix = "font-roboto-1.0.1", -) - -new_http_archive( - name = "graphlib", - build_file = "bower.BUILD", - url = "https://github.com/cpettitt/graphlib/archive/v1.0.7.tar.gz", - strip_prefix = "graphlib-1.0.7", -) - -new_http_archive( - name = "iron_a11y_announcer", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/iron-a11y-announcer/archive/v1.0.5.tar.gz", - strip_prefix = "iron-a11y-announcer-1.0.5", -) - -new_http_archive( - name = "iron_a11y_keys_behavior", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/iron-a11y-keys-behavior/archive/v1.1.8.tar.gz", - strip_prefix = "iron-a11y-keys-behavior-1.1.8", -) - -new_http_archive( - name = "iron_ajax", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/iron-ajax/archive/v1.2.0.tar.gz", - strip_prefix = "iron-ajax-1.2.0", -) - -new_http_archive( - name = "iron_autogrow_textarea", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/iron-autogrow-textarea/archive/v1.0.12.tar.gz", - strip_prefix = "iron-autogrow-textarea-1.0.12", -) - -new_http_archive( - name = "iron_behaviors", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/iron-behaviors/archive/v1.0.17.tar.gz", - strip_prefix = "iron-behaviors-1.0.17", -) - -new_http_archive( - name = "iron_checked_element_behavior", - build_file = "bower.BUILD", - url = 
"https://github.com/polymerelements/iron-checked-element-behavior/archive/v1.0.4.tar.gz", - strip_prefix = "iron-checked-element-behavior-1.0.4", -) - -new_http_archive( - name = "iron_collapse", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/iron-collapse/archive/v1.0.8.tar.gz", - strip_prefix = "iron-collapse-1.0.8", -) - -new_http_archive( - name = "iron_dropdown", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/iron-dropdown/archive/v1.4.0.tar.gz", - strip_prefix = "iron-dropdown-1.4.0", -) - -new_http_archive( - name = "iron_fit_behavior", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/iron-fit-behavior/archive/v1.2.5.tar.gz", - strip_prefix = "iron-fit-behavior-1.2.5", -) - -new_http_archive( - name = "iron_flex_layout", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/iron-flex-layout/archive/v1.3.0.tar.gz", - strip_prefix = "iron-flex-layout-1.3.0", -) - -new_http_archive( - name = "iron_form_element_behavior", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/iron-form-element-behavior/archive/v1.0.6.tar.gz", - strip_prefix = "iron-form-element-behavior-1.0.6", -) - -new_http_archive( - name = "iron_icon", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/iron-icon/archive/v1.0.11.tar.gz", - strip_prefix = "iron-icon-1.0.11", -) - -new_http_archive( - name = "iron_icons", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/iron-icons/archive/v1.1.3.tar.gz", - strip_prefix = "iron-icons-1.1.3", -) - -new_http_archive( - name = "iron_iconset_svg", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/iron-iconset-svg/archive/v1.1.0.tar.gz", - strip_prefix = "iron-iconset-svg-1.1.0", -) - -new_http_archive( - name = "iron_input", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/iron-input/archive/1.0.10.tar.gz", - strip_prefix = "iron-input-1.0.10", -) - -new_http_archive( - name = "iron_list", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/iron-list/archive/v1.3.9.tar.gz", - strip_prefix = "iron-list-1.3.9", -) - -new_http_archive( - name = "iron_menu_behavior", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/iron-menu-behavior/archive/v1.1.10.tar.gz", - strip_prefix = "iron-menu-behavior-1.1.10", -) - -new_http_archive( - name = "iron_meta", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/iron-meta/archive/v1.1.1.tar.gz", - strip_prefix = "iron-meta-1.1.1", -) - -new_http_archive( - name = "iron_overlay_behavior", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/iron-overlay-behavior/archive/v1.10.1.tar.gz", - strip_prefix = "iron-overlay-behavior-1.10.1", -) - -new_http_archive( - name = "iron_range_behavior", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/iron-range-behavior/archive/v1.0.4.tar.gz", - strip_prefix = "iron-range-behavior-1.0.4", -) - -new_http_archive( - name = "iron_resizable_behavior", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/iron-resizable-behavior/archive/v1.0.3.tar.gz", - strip_prefix = "iron-resizable-behavior-1.0.3", -) - -new_http_archive( - name = "iron_scroll_target_behavior", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/iron-scroll-target-behavior/archive/v1.0.3.tar.gz", - strip_prefix = "iron-scroll-target-behavior-1.0.3", -) - -new_http_archive( - 
name = "iron_selector", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/iron-selector/archive/v1.5.2.tar.gz", - strip_prefix = "iron-selector-1.5.2", -) - -new_http_archive( - name = "iron_validatable_behavior", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/iron-validatable-behavior/archive/v1.1.1.tar.gz", - strip_prefix = "iron-validatable-behavior-1.1.1", -) - -new_http_archive( - name = "lodash", - build_file = "bower.BUILD", - url = "https://github.com/lodash/lodash/archive/3.8.0.tar.gz", - strip_prefix = "lodash-3.8.0", -) - -new_http_archive( - name = "neon_animation", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/neon-animation/archive/v1.2.2.tar.gz", - strip_prefix = "neon-animation-1.2.2", -) - -http_file( - name = "numericjs_numeric_min_js", - url = "https://cdnjs.cloudflare.com/ajax/libs/numeric/1.2.6/numeric.min.js", -) - -new_http_archive( - name = "paper_behaviors", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/paper-behaviors/archive/v1.0.12.tar.gz", - strip_prefix = "paper-behaviors-1.0.12", -) - -new_http_archive( - name = "paper_button", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/paper-button/archive/v1.0.11.tar.gz", - strip_prefix = "paper-button-1.0.11", -) - -new_http_archive( - name = "paper_checkbox", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/paper-checkbox/archive/v1.4.0.tar.gz", - strip_prefix = "paper-checkbox-1.4.0", -) - -new_http_archive( - name = "paper_dialog", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/paper-dialog/archive/v1.0.4.tar.gz", - strip_prefix = "paper-dialog-1.0.4", -) - -new_http_archive( - name = "paper_dialog_behavior", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/paper-dialog-behavior/archive/v1.2.5.tar.gz", - strip_prefix = "paper-dialog-behavior-1.2.5", -) - -new_http_archive( - name = "paper_dialog_scrollable", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/paper-dialog-scrollable/archive/1.1.5.tar.gz", - strip_prefix = "paper-dialog-scrollable-1.1.5", -) - -new_http_archive( - name = "paper_dropdown_menu", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/paper-dropdown-menu/archive/v1.4.0.tar.gz", - strip_prefix = "paper-dropdown-menu-1.4.0", -) - -new_http_archive( - name = "paper_header_panel", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/paper-header-panel/archive/v1.1.4.tar.gz", - strip_prefix = "paper-header-panel-1.1.4", -) - -new_http_archive( - name = "paper_icon_button", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/paper-icon-button/archive/v1.1.3.tar.gz", - strip_prefix = "paper-icon-button-1.1.3", -) - -new_http_archive( - name = "paper_input", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/paper-input/archive/v1.1.18.tar.gz", - strip_prefix = "paper-input-1.1.18", -) - -new_http_archive( - name = "paper_item", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/paper-item/archive/v1.1.4.tar.gz", - strip_prefix = "paper-item-1.1.4", -) - -new_http_archive( - name = "paper_listbox", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/paper-listbox/archive/v1.1.2.tar.gz", - strip_prefix = "paper-listbox-1.1.2", -) - -new_http_archive( - name = "paper_material", - build_file = "bower.BUILD", - url = 
"https://github.com/polymerelements/paper-material/archive/v1.0.6.tar.gz", - strip_prefix = "paper-material-1.0.6", -) - -new_http_archive( - name = "paper_menu", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/paper-menu/archive/v1.2.2.tar.gz", - strip_prefix = "paper-menu-1.2.2", -) - -new_http_archive( - name = "paper_menu_button", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/paper-menu-button/archive/v1.5.1.tar.gz", - strip_prefix = "paper-menu-button-1.5.1", -) - -new_http_archive( - name = "paper_progress", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/paper-progress/archive/v1.0.9.tar.gz", - strip_prefix = "paper-progress-1.0.9", -) - -new_http_archive( - name = "paper_radio_button", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/paper-radio-button/archive/v1.1.2.tar.gz", - strip_prefix = "paper-radio-button-1.1.2", -) - -new_http_archive( - name = "paper_radio_group", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/paper-radio-group/archive/v1.0.9.tar.gz", - strip_prefix = "paper-radio-group-1.0.9", -) - -new_http_archive( - name = "paper_ripple", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/paper-ripple/archive/v1.0.5.tar.gz", - strip_prefix = "paper-ripple-1.0.5", -) - -new_http_archive( - name = "paper_slider", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/paper-slider/archive/v1.0.10.tar.gz", - strip_prefix = "paper-slider-1.0.10", -) - -new_http_archive( - name = "paper_spinner", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/paper-spinner/archive/v1.1.1.tar.gz", - strip_prefix = "paper-spinner-1.1.1", -) - -new_http_archive( - name = "paper_styles", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/paper-styles/archive/v1.1.4.tar.gz", - strip_prefix = "paper-styles-1.1.4", -) - -new_http_archive( - name = "paper_tabs", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/paper-tabs/archive/v1.7.0.tar.gz", - strip_prefix = "paper-tabs-1.7.0", -) - -new_http_archive( - name = "paper_toast", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/paper-toast/archive/v1.3.0.tar.gz", - strip_prefix = "paper-toast-1.3.0", -) - -new_http_archive( - name = "paper_toggle_button", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/paper-toggle-button/archive/v1.2.0.tar.gz", - strip_prefix = "paper-toggle-button-1.2.0", -) - -new_http_archive( - name = "paper_toolbar", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/paper-toolbar/archive/v1.1.4.tar.gz", - strip_prefix = "paper-toolbar-1.1.4", -) - -new_http_archive( - name = "paper_tooltip", - build_file = "bower.BUILD", - url = "https://github.com/polymerelements/paper-tooltip/archive/v1.1.2.tar.gz", - strip_prefix = "paper-tooltip-1.1.2", -) - -new_http_archive( - name = "plottable", - build_file = "bower.BUILD", - url = "https://github.com/palantir/plottable/archive/v1.16.1.tar.gz", - strip_prefix = "plottable-1.16.1", -) - -new_http_archive( - name = "polymer_archive", - build_file = "bower.BUILD", - url = "https://github.com/polymer/polymer/archive/v1.7.0.tar.gz", - strip_prefix = "polymer-1.7.0", -) - -new_http_archive( - name = "promise_polyfill", - build_file = "bower.BUILD", - url = "https://github.com/polymerlabs/promise-polyfill/archive/v1.0.0.tar.gz", - strip_prefix = "promise-polyfill-1.0.0", -) - 
-http_file( - name = "three_js_three_min_js", - url = "https://raw.githubusercontent.com/mrdoob/three.js/r77/build/three.min.js", -) - -http_file( - name = "three_js_orbitcontrols_js", - url = "https://raw.githubusercontent.com/mrdoob/three.js/r77/examples/js/controls/OrbitControls.js", -) - -new_http_archive( - name = "web_animations_js", - build_file = "bower.BUILD", - url = "https://github.com/web-animations/web-animations-js/archive/2.2.1.tar.gz", - strip_prefix = "web-animations-js-2.2.1", -) - -new_http_archive( - name = "webcomponentsjs", - build_file = "bower.BUILD", - url = "https://github.com/webcomponents/webcomponentsjs/archive/v0.7.22.tar.gz", - strip_prefix = "webcomponentsjs-0.7.22", -) - -http_file( - name = "weblas_weblas_js", - url = "https://raw.githubusercontent.com/waylonflinn/weblas/v0.9.0/dist/weblas.js", + name = "stylize", + build_file = "models.BUILD", + sha256 = "3d374a730aef330424a356a8d4f04d8a54277c425e274ecb7d9c83aa912c6bfa", + urls = [ + "http://storage.googleapis.com/download.tensorflow.org/models/stylize_v1.zip", + "http://download.tensorflow.org/models/stylize_v1.zip", + ], ) diff --git a/bower.BUILD b/bower.BUILD deleted file mode 100644 index eabd1d64507..00000000000 --- a/bower.BUILD +++ /dev/null @@ -1,645 +0,0 @@ -# AUTOGENERATED FILE by tensorboard_bower_dependency_sync.py - -package(default_visibility = ["//visibility:public"]) - -filegroup( - name = "d3", - srcs = [ - "d3.js", - "d3.min.js", - "package.js", - ], -) - -filegroup( - name = "dagre", - srcs = [ - "dist/dagre.core.js", - "dist/dagre.core.min.js", - ], -) - -filegroup( - name = "es6_promise", - srcs = [ - "promise.js", - "promise.min.js", - ], -) - -filegroup( - name = "font_roboto", - srcs = ["roboto.html"], -) - -filegroup( - name = "graphlib", - srcs = [ - "dist/graphlib.core.js", - "dist/graphlib.core.min.js", - ], -) - -filegroup( - name = "iron_a11y_announcer", - srcs = [ - "index.html", - "iron-a11y-announcer.html", - ], -) - -filegroup( - name = "iron_a11y_keys_behavior", - srcs = [ - "index.html", - "iron-a11y-keys-behavior.html", - ], -) - -filegroup( - name = "iron_ajax", - srcs = [ - "index.html", - "iron-ajax.html", - "iron-request.html", - ], -) - -filegroup( - name = "iron_autogrow_textarea", - srcs = [ - "index.html", - "iron-autogrow-textarea.html", - ], -) - -filegroup( - name = "iron_behaviors", - srcs = [ - "index.html", - "iron-button-state.html", - "iron-control-state.html", - ], -) - -filegroup( - name = "iron_checked_element_behavior", - srcs = [ - "index.html", - "iron-checked-element-behavior.html", - ], -) - -filegroup( - name = "iron_collapse", - srcs = [ - "index.html", - "iron-collapse.html", - ], -) - -filegroup( - name = "iron_dropdown", - srcs = [ - "index.html", - "iron-dropdown.html", - "iron-dropdown-scroll-manager.html", - ], -) - -filegroup( - name = "iron_fit_behavior", - srcs = [ - "index.html", - "iron-fit-behavior.html", - ], -) - -filegroup( - name = "iron_flex_layout", - srcs = [ - "classes/iron-flex-layout.html", - "classes/iron-shadow-flex-layout.html", - "index.html", - "iron-flex-layout.html", - "iron-flex-layout-classes.html", - ], -) - -filegroup( - name = "iron_form_element_behavior", - srcs = [ - "index.html", - "iron-form-element-behavior.html", - ], -) - -filegroup( - name = "iron_icon", - srcs = [ - "index.html", - "iron-icon.html", - ], -) - -filegroup( - name = "iron_icons", - srcs = [ - "av-icons.html", - "communication-icons.html", - "device-icons.html", - "editor-icons.html", - "hardware-icons.html", - "image-icons.html", - 
"index.html", - "iron-icons.html", - "maps-icons.html", - "notification-icons.html", - "places-icons.html", - "social-icons.html", - ], -) - -filegroup( - name = "iron_iconset_svg", - srcs = [ - "index.html", - "iron-iconset-svg.html", - ], -) - -filegroup( - name = "iron_input", - srcs = [ - "index.html", - "iron-input.html", - ], -) - -filegroup( - name = "iron_list", - srcs = [ - "index.html", - "iron-list.html", - "test/smoke/avg-worst-case.html", - "test/smoke/dummy-data.html", - "test/smoke/index.html", - "test/smoke/physical-count.html", - ], -) - -filegroup( - name = "iron_menu_behavior", - srcs = [ - "index.html", - "iron-menu-behavior.html", - "iron-menubar-behavior.html", - ], -) - -filegroup( - name = "iron_meta", - srcs = [ - "index.html", - "iron-meta.html", - ], -) - -filegroup( - name = "iron_overlay_behavior", - srcs = [ - "index.html", - "iron-focusables-helper.html", - "iron-overlay-backdrop.html", - "iron-overlay-behavior.html", - "iron-overlay-manager.html", - ], -) - -filegroup( - name = "iron_range_behavior", - srcs = [ - "index.html", - "iron-range-behavior.html", - ], -) - -filegroup( - name = "iron_resizable_behavior", - srcs = [ - "demo/src/x-app.html", - "index.html", - "iron-resizable-behavior.html", - ], -) - -filegroup( - name = "iron_scroll_target_behavior", - srcs = [ - "index.html", - "iron-scroll-target-behavior.html", - ], -) - -filegroup( - name = "iron_selector", - srcs = [ - "index.html", - "iron-multi-selectable.html", - "iron-selectable.html", - "iron-selection.html", - "iron-selector.html", - ], -) - -filegroup( - name = "iron_validatable_behavior", - srcs = [ - "index.html", - "iron-validatable-behavior.html", - ], -) - -filegroup( - name = "lodash", - srcs = [ - "lodash.js", - "lodash.min.js", - ], -) - -filegroup( - name = "neon_animation", - srcs = [ - "animations/cascaded-animation.html", - "animations/fade-in-animation.html", - "animations/fade-out-animation.html", - "animations/hero-animation.html", - "animations/opaque-animation.html", - "animations/reverse-ripple-animation.html", - "animations/ripple-animation.html", - "animations/scale-down-animation.html", - "animations/scale-up-animation.html", - "animations/slide-down-animation.html", - "animations/slide-from-bottom-animation.html", - "animations/slide-from-left-animation.html", - "animations/slide-from-right-animation.html", - "animations/slide-from-top-animation.html", - "animations/slide-left-animation.html", - "animations/slide-right-animation.html", - "animations/slide-up-animation.html", - "animations/transform-animation.html", - "demo/card/index.html", - "demo/card/x-card.html", - "demo/card/x-cards-list.html", - "demo/declarative/index.html", - "demo/doc/index.html", - "demo/doc/my-animatable.html", - "demo/doc/my-dialog.html", - "demo/dropdown/animated-dropdown.html", - "demo/dropdown/index.html", - "demo/grid/animated-grid.html", - "demo/grid/fullsize-page-with-card.html", - "demo/grid/index.html", - "demo/list/full-view.html", - "demo/list/index.html", - "demo/list/list-demo.html", - "demo/list/list-view.html", - "demo/load/animated-grid.html", - "demo/load/full-page.html", - "demo/load/index.html", - "demo/reprojection/animated-grid.html", - "demo/reprojection/fullsize-page-with-card.html", - "demo/reprojection/index.html", - "demo/reprojection/reprojected-pages.html", - "demo/tiles/circles-page.html", - "demo/tiles/index.html", - "demo/tiles/squares-page.html", - "index.html", - "neon-animatable.html", - "neon-animatable-behavior.html", - "neon-animated-pages.html", - 
"neon-animation.html", - "neon-animation-behavior.html", - "neon-animation-runner-behavior.html", - "neon-animations.html", - "neon-shared-element-animatable-behavior.html", - "neon-shared-element-animation-behavior.html", - "web-animations.html", - ], -) - -filegroup( - name = "paper_behaviors", - srcs = [ - "index.html", - "paper-button-behavior.html", - "paper-checked-element-behavior.html", - "paper-inky-focus-behavior.html", - "paper-ripple-behavior.html", - ], -) - -filegroup( - name = "paper_button", - srcs = [ - "index.html", - "paper-button.html", - ], -) - -filegroup( - name = "paper_checkbox", - srcs = [ - "index.html", - "paper-checkbox.html", - ], -) - -filegroup( - name = "paper_dialog", - srcs = [ - "index.html", - "paper-dialog.html", - ], -) - -filegroup( - name = "paper_dialog_behavior", - srcs = [ - "index.html", - "paper-dialog-behavior.html", - "paper-dialog-common.css", - "paper-dialog-shared-styles.html", - ], -) - -filegroup( - name = "paper_dialog_scrollable", - srcs = [ - "index.html", - "paper-dialog-scrollable.html", - ], -) - -filegroup( - name = "paper_dropdown_menu", - srcs = [ - "index.html", - "paper-dropdown-menu.html", - "paper-dropdown-menu-icons.html", - "paper-dropdown-menu-light.html", - "paper-dropdown-menu-shared-styles.html", - ], -) - -filegroup( - name = "paper_header_panel", - srcs = [ - "index.html", - "paper-header-panel.html", - ], -) - -filegroup( - name = "paper_icon_button", - srcs = [ - "index.html", - "paper-icon-button.html", - "paper-icon-button-light.html", - ], -) - -filegroup( - name = "paper_input", - srcs = [ - "all-imports.html", - "index.html", - "paper-input.html", - "paper-input-addon-behavior.html", - "paper-input-behavior.html", - "paper-input-char-counter.html", - "paper-input-container.html", - "paper-input-error.html", - "paper-textarea.html", - ], -) - -filegroup( - name = "paper_item", - srcs = [ - "all-imports.html", - "index.html", - "paper-icon-item.html", - "paper-item.html", - "paper-item-behavior.html", - "paper-item-body.html", - "paper-item-shared-styles.html", - ], -) - -filegroup( - name = "paper_listbox", - srcs = [ - "index.html", - "paper-listbox.html", - ], -) - -filegroup( - name = "paper_material", - srcs = [ - "index.html", - "paper-material.html", - "paper-material-shared-styles.html", - ], -) - -filegroup( - name = "paper_menu", - srcs = [ - "index.html", - "paper-menu.html", - "paper-menu-shared-styles.html", - "paper-submenu.html", - ], -) - -filegroup( - name = "paper_menu_button", - srcs = [ - "index.html", - "paper-menu-button.html", - "paper-menu-button-animations.html", - ], -) - -filegroup( - name = "paper_progress", - srcs = [ - "index.html", - "paper-progress.html", - ], -) - -filegroup( - name = "paper_radio_button", - srcs = [ - "index.html", - "paper-radio-button.html", - ], -) - -filegroup( - name = "paper_radio_group", - srcs = [ - "index.html", - "paper-radio-group.html", - ], -) - -filegroup( - name = "paper_ripple", - srcs = [ - "index.html", - "paper-ripple.html", - ], -) - -filegroup( - name = "paper_slider", - srcs = [ - "index.html", - "paper-slider.html", - ], -) - -filegroup( - name = "paper_spinner", - srcs = [ - "index.html", - "paper-spinner.html", - "paper-spinner-behavior.html", - "paper-spinner-lite.html", - "paper-spinner-styles.html", - ], -) - -filegroup( - name = "paper_styles", - srcs = [ - "classes/global.html", - "classes/shadow.html", - "classes/shadow-layout.html", - "classes/typography.html", - "color.html", - "default-theme.html", - "demo.css", - 
"demo-pages.html", - "index.html", - "paper-styles.html", - "paper-styles-classes.html", - "shadow.html", - "typography.html", - ], -) - -filegroup( - name = "paper_tabs", - srcs = [ - "index.html", - "paper-tab.html", - "paper-tabs.html", - "paper-tabs-icons.html", - ], -) - -filegroup( - name = "paper_toast", - srcs = [ - "index.html", - "paper-toast.html", - ], -) - -filegroup( - name = "paper_toggle_button", - srcs = [ - "index.html", - "paper-toggle-button.html", - ], -) - -filegroup( - name = "paper_toolbar", - srcs = [ - "index.html", - "paper-toolbar.html", - ], -) - -filegroup( - name = "paper_tooltip", - srcs = [ - "index.html", - "paper-tooltip.html", - ], -) - -filegroup( - name = "plottable", - srcs = [ - "plottable.css", - "plottable.js", - "plottable.min.js", - ], -) - -filegroup( - name = "polymer", - srcs = [ - "polymer.html", - "polymer-micro.html", - "polymer-mini.html", - ], -) - -filegroup( - name = "promise_polyfill", - srcs = [ - "Gruntfile.js", - "Promise.js", - "Promise.min.js", - "Promise-Statics.js", - "promise-polyfill.html", - "promise-polyfill-lite.html", - ], -) - -filegroup( - name = "web_animations_js", - srcs = [ - "web-animations.html", - "web-animations.min.js", - "web-animations-next.min.js", - "web-animations-next-lite.min.js", - ], -) - -filegroup( - name = "webcomponentsjs", - srcs = [ - "CustomElements.js", - "CustomElements.min.js", - "HTMLImports.js", - "HTMLImports.min.js", - "MutationObserver.js", - "MutationObserver.min.js", - "ShadowDOM.js", - "ShadowDOM.min.js", - "webcomponents.js", - "webcomponents.min.js", - "webcomponents-lite.js", - "webcomponents-lite.min.js", - ], -) diff --git a/configure b/configure index 4f1dc2a9102..602124225fe 100755 --- a/configure +++ b/configure @@ -3,6 +3,8 @@ set -e set -o pipefail +MIN_BAZEL_VERSION=0.4.5 + # Find out the absolute path to where ./configure resides pushd `dirname $0` > /dev/null SOURCE_BASE_DIR=`pwd -P` @@ -11,40 +13,179 @@ popd > /dev/null PLATFORM="$(uname -s | tr 'A-Z' 'a-z')" function is_linux() { - if [[ "${PLATFORM}" == "linux" ]]; then - true - else - false - fi + [[ "${PLATFORM}" == "linux" ]] } function is_macos() { - if [[ "${PLATFORM}" == "darwin" ]]; then - true - else - false - fi + [[ "${PLATFORM}" == "darwin" ]] } function is_windows() { # On windows, the shell script is actually running in msys - if [[ "${PLATFORM}" =~ msys_nt*|mingw*|cygwin*|uwin* ]]; then - true - else - false - fi + [[ "${PLATFORM}" =~ msys_nt*|mingw*|cygwin*|uwin* ]] } -function bazel_clean_and_fetch() { - # bazel clean --expunge currently doesn't work on Windows - # TODO(pcloudy): Re-enable it after bazel clean --expunge is fixed. - if ! is_windows; then - bazel clean --expunge - fi - bazel fetch "//tensorflow/... -//tensorflow/contrib/nccl/... \ - -//tensorflow/examples/android/..." 
+function sed_in_place() { + sed -e $1 $2 > "$2.bak" + mv "$2.bak" $2 } +function write_to_bazelrc() { + echo "$1" >> .tf_configure.bazelrc +} + +function write_action_env_to_bazelrc() { + write_to_bazelrc "build --action_env $1=\"$2\"" +} + +function python_path { + "$PYTHON_BIN_PATH" - <&2 + if [ -z "$fromuser" ]; then + exit 1 + fi + PYTHON_BIN_PATH="" + # Retry + done + + if [ -z "$PYTHON_LIB_PATH" ]; then + # Split python_path into an array of paths, this allows path containing spaces + IFS=',' read -r -a python_lib_path <<< "$(python_path)" + + if [ 1 = "$USE_DEFAULT_PYTHON_LIB_PATH" ]; then + PYTHON_LIB_PATH=${python_lib_path[0]} + echo "Using python library path: $PYTHON_LIB_PATH" + + else + echo "Found possible Python library paths:" + for x in "${python_lib_path[@]}"; do + echo " $x" + done + set -- "${python_lib_path[@]}" + echo "Please input the desired Python library path to use. Default is [$1]" + read b || true + if [ "$b" == "" ]; then + PYTHON_LIB_PATH=${python_lib_path[0]} + echo "Using python library path: $PYTHON_LIB_PATH" + else + PYTHON_LIB_PATH="$b" + fi + fi + fi + + if [ ! -x "$PYTHON_BIN_PATH" ] || [ -d "$PYTHON_BIN_PATH" ]; then + echo "PYTHON_BIN_PATH is not executable. Is it the python binary?" + exit 1 + fi + + local python_major_version + python_major_version=$("${PYTHON_BIN_PATH}" -c 'from __future__ import print_function; import sys; print(sys.version_info[0]);' | head -c1) + if [ -z "$python_major_version" ]; then + echo -e "\n\nERROR: Problem getting python version. Is $PYTHON_BIN_PATH the correct python binary?" + exit 1 + fi + + # Convert python path to Windows style before writing into bazel.rc + if is_windows; then + PYTHON_BIN_PATH="$(cygpath -m "$PYTHON_BIN_PATH")" + PYTHON_LIB_PATH="$(cygpath -m "$PYTHON_LIB_PATH")" + fi + + # Set-up env variables used by python_configure.bzl + write_action_env_to_bazelrc "PYTHON_BIN_PATH" "$PYTHON_BIN_PATH" + write_action_env_to_bazelrc "PYTHON_LIB_PATH" "$PYTHON_LIB_PATH" + write_to_bazelrc "build --define PYTHON_BIN_PATH=\"$PYTHON_BIN_PATH\"" + write_to_bazelrc "build --define PYTHON_LIB_PATH=\"$PYTHON_LIB_PATH\"" + write_to_bazelrc "build --force_python=py$python_major_version" + write_to_bazelrc "build --host_force_python=py$python_major_version" + write_to_bazelrc "build --python${python_major_version}_path=\"$PYTHON_BIN_PATH\"" + write_to_bazelrc "test --force_python=py$python_major_version" + write_to_bazelrc "test --host_force_python=py$python_major_version" + write_to_bazelrc "test --define PYTHON_BIN_PATH=\"$PYTHON_BIN_PATH\"" + write_to_bazelrc "test --define PYTHON_LIB_PATH=\"$PYTHON_LIB_PATH\"" + write_to_bazelrc "run --define PYTHON_BIN_PATH=\"$PYTHON_BIN_PATH\"" + write_to_bazelrc "run --define PYTHON_LIB_PATH=\"$PYTHON_LIB_PATH\"" + + # Write tools/python_bin_path.sh + echo "export PYTHON_BIN_PATH=\"$PYTHON_BIN_PATH\"" > tools/python_bin_path.sh +} + +function version { + echo "$@" | awk -F. '{ printf("%03d%03d%03d\n", $1,$2,$3); }'; +} + + +bazel version > bazel.version +curr_bazel_version=$(head -n 1 bazel.version | cut -d ' ' -f3) +rm -f bazel.version + + +echo "You have bazel $curr_bazel_version installed." +if [ -z "$curr_bazel_version" ]; then + echo "WARNING: current bazel installation is not a release version." + echo "Make sure you are running at least bazel $MIN_BAZEL_VERSION." +elif [ "$(version "$MIN_BAZEL_VERSION")" -gt "$(version "$curr_bazel_version")" ]; then + echo "Please upgrade your bazel installation to version $MIN_BAZEL_VERSION or higher to build TensorFlow!" 
+ echo "Exiting..." + exit 1 +fi + +# This file contains customized config settings. +rm -f .tf_configure.bazelrc +touch .tf_configure.bazelrc +if [[ ! -e .bazelrc ]]; then + if [[ -e "${HOME}/.bazelrc" ]]; then + echo "import ${HOME}/.bazelrc" >.bazelrc + else + touch .bazelrc + fi +fi +sed_in_place "/tf_configure/d" .bazelrc +echo "import %workspace%/.tf_configure.bazelrc" >> .bazelrc + # Delete any leftover BUILD files from the Makefile build, which would interfere # with Bazel parsing. MAKEFILE_DOWNLOAD_DIR=tensorflow/contrib/makefile/downloads @@ -52,58 +193,65 @@ if [ -d "${MAKEFILE_DOWNLOAD_DIR}" ]; then find ${MAKEFILE_DOWNLOAD_DIR} -type f -name '*BUILD' -delete fi -## Set up python-related environment settings -while true; do - fromuser="" - if [ -z "$PYTHON_BIN_PATH" ]; then - default_python_bin_path=$(which python || which python3 || true) - read -p "Please specify the location of python. [Default is $default_python_bin_path]: " PYTHON_BIN_PATH - fromuser="1" - if [ -z "$PYTHON_BIN_PATH" ]; then - PYTHON_BIN_PATH=$default_python_bin_path - fi - fi - if [ -e "$PYTHON_BIN_PATH" ]; then - break - fi - echo "Invalid python path. ${PYTHON_BIN_PATH} cannot be found" 1>&2 - if [ -z "$fromuser" ]; then - exit 1 - fi - PYTHON_BIN_PATH="" - # Retry -done +setup_python ## Set up MKL related environment settings -if false; then # Disable building with MKL for now - while [ "$TF_NEED_MKL" == "" ]; do +while [ "$TF_NEED_MKL" == "" ]; do + fromuser="" + read -p "Do you wish to build TensorFlow with MKL support? [y/N] " INPUT + fromuser="1" + case $INPUT in + [Yy]* ) echo "MKL support will be enabled for TensorFlow"; TF_NEED_MKL=1;; + [Nn]* ) echo "No MKL support will be enabled for TensorFlow"; TF_NEED_MKL=0;; + "" ) echo "No MKL support will be enabled for TensorFlow"; TF_NEED_MKL=0;; + * ) echo "Invalid selection: " $INPUT;; + esac +done + +OSNAME=`uname -s` + +if [ "$TF_NEED_MKL" == "1" ]; then # TF_NEED_MKL + while [ "$TF_DOWNLOAD_MKL" == "" ]; do fromuser="" - read -p "Do you wish to build TensorFlow with MKL support? [y/N] " INPUT + read -p "Do you wish to download MKL LIB from the web? [Y/n] " INPUT fromuser="1" case $INPUT in - [Yy]* ) echo "MKL support will be enabled for TensorFlow"; TF_NEED_MKL=1;; - [Nn]* ) echo "No MKL support will be enabled for TensorFlow"; TF_NEED_MKL=0;; - "" ) echo "No MKL support will be enabled for TensorFlow"; TF_NEED_MKL=0;; - * ) echo "Invalid selection: " $INPUT;; + [Yy]* ) TF_DOWNLOAD_MKL=1;; + [Nn]* ) TF_DOWNLOAD_MKL=0;; + "" ) TF_DOWNLOAD_MKL=1;; + * ) echo "Invalid selection: " $INPUT; exit 1;; esac done - OSNAME=`uname -s` - - if [ "$TF_NEED_MKL" == "1" ]; then # TF_NEED_MKL + if [[ "$TF_DOWNLOAD_MKL" == "1" ]]; then DST=`dirname $0` - ARCHIVE_BASENAME=mklml_lnx_2017.0.2.20170110.tgz - GITHUB_RELEASE_TAG=v0.3 + ARCHIVE_BASENAME=mklml_lnx_2018.0.20170425.tgz + GITHUB_RELEASE_TAG=v0.7 MKLURL="https://github.com/01org/mkl-dnn/releases/download/$GITHUB_RELEASE_TAG/$ARCHIVE_BASENAME" - if ! [ -e "$DST/third_party/mkl/$ARCHIVE_BASENAME" ]; then - wget --no-check-certificate -P $DST/third_party/mkl/ $MKLURL + if ! 
[ -e "${DST}/third_party/mkl/${ARCHIVE_BASENAME}" ]; then + curl -fSsL -o "${DST}/third_party/mkl/${ARCHIVE_BASENAME}" "${MKLURL}" fi tar -xzf $DST/third_party/mkl/$ARCHIVE_BASENAME -C $DST/third_party/mkl/ extracted_dir_name="${ARCHIVE_BASENAME%.*}" MKL_INSTALL_PATH=$DST/third_party/mkl/$extracted_dir_name MKL_INSTALL_PATH=`${PYTHON_BIN_PATH} -c "import os; print(os.path.realpath(os.path.expanduser('${MKL_INSTALL_PATH}')))"` - if [ "$OSNAME" == "Linux" ]; then + else + default_mkl_path=/opt/intel/mklml + fromuser="" + if [ -z "$MKL_INSTALL_PATH" ]; then + read -p "Please specify the location where MKL is installed. [Default is $default_mkl_path]: " MKL_INSTALL_PATH + fromuser="1" + fi + if [ -z "$MKL_INSTALL_PATH" ]; then + MKL_INSTALL_PATH=$default_mkl_path + fi + # Result returned from "read" will be used unexpanded. That make "~" unusable. + # Going through one more level of expansion to handle that. + MKL_INSTALL_PATH=`${PYTHON_BIN_PATH} -c "import os; print(os.path.realpath(os.path.expanduser('${MKL_INSTALL_PATH}')))"` + fi + + if [ "$OSNAME" == "Linux" ]; then # Full MKL configuration MKL_RT_LIB_PATH="lib/intel64/libmkl_rt.so" #${TF_MKL_EXT}#TODO version? MKL_RT_OMP_LIB_PATH="../compiler/lib/intel64/libiomp5.so" #TODO VERSION? @@ -111,24 +259,29 @@ if false; then # Disable building with MKL for now # MKL-ML configuration MKL_ML_LIB_PATH="lib/libmklml_intel.so" #${TF_MKL_EXT}#TODO version? MKL_ML_OMP_LIB_PATH="lib/libiomp5.so" #TODO VERSION? - elif [ "$OSNAME" == "Darwin" ]; then + elif [ "$OSNAME" == "Darwin" ]; then echo "Darwin is unsupported yet"; exit 1 - fi + fi - if [ -e "$MKL_INSTALL_PATH/${MKL_ML_LIB_PATH}" ]; then + if [ -e "$MKL_INSTALL_PATH/${MKL_ML_LIB_PATH}" ]; then ln -sf $MKL_INSTALL_PATH/${MKL_ML_LIB_PATH} third_party/mkl/ ln -sf $MKL_INSTALL_PATH/${MKL_ML_OMP_LIB_PATH} third_party/mkl/ ln -sf $MKL_INSTALL_PATH/include third_party/mkl/ ln -sf $MKL_INSTALL_PATH/include third_party/eigen3/mkl_include - else - echo "ERROR: $MKL_INSTALL_PATH/${MKL_ML_LIB_PATH} does not exist"; + loc=$(locate -e libdl.so.2 | sed -n 1p) + ln -sf $loc third_party/mkl/libdl.so.2 + elif [ -e "$MKL_INSTALL_PATH/${MKL_RT_LIB_PATH}" ]; then + ln -sf $MKL_INSTALL_PATH/${MKL_RT_LIB_PATH} third_party/mkl/ + ln -sf $MKL_INSTALL_PATH/${MKL_RT_OMP_LIB_PATH} third_party/mkl/ + ln -sf $MKL_INSTALL_PATH/include third_party/mkl/ + ln -sf $MKL_INSTALL_PATH/include third_party/eigen3/mkl_include + loc=$(locate -e libdl.so.2 | sed -n 1p) + ln -sf $loc third_party/mkl/libdl.so.2 + else + echo "ERROR: $MKL_INSTALL_PATH/${MKL_ML_LIB_PATH} nor $MKL_INSTALL_PATH/${MKL_RT_LIB_PATH} exists"; exit 1 - fi - - if [ -z "$fromuser" ]; then - exit 1 - fi + fi cat > third_party/mkl/mkl.config < third_party/mkl/mkl.config <> tools/bazel.rc for opt in $CC_OPT_FLAGS; do - echo "build:opt --cxxopt=$opt --copt=$opt" >> tools/bazel.rc + write_to_bazelrc "build:opt --cxxopt=$opt --copt=$opt" done # Run the gen_git_source to create links where bazel can track dependencies for @@ -289,35 +434,46 @@ while [ "$TF_NEED_CUDA" == "" ]; do done export TF_NEED_CUDA +write_action_env_to_bazelrc "TF_NEED_CUDA" "$TF_NEED_CUDA" + export TF_NEED_OPENCL -if [[ "$TF_NEED_CUDA" == "0" ]] && [[ "$TF_NEED_OPENCL" == "0" ]]; then - echo "Configuration finished" - bazel_clean_and_fetch - exit -fi +write_action_env_to_bazelrc "TF_NEED_OPENCL" "$TF_NEED_OPENCL" if [ "$TF_NEED_CUDA" == "1" ]; then -# Set up which gcc nvcc should use as the host compiler -# No need to set this on Windows -while ! 
is_windows && true; do +while [[ "$TF_CUDA_CLANG" == "" ]]; do + read -p "Do you want to use clang as CUDA compiler? [y/N] " INPUT + case $INPUT in + [Yy]* ) echo "Clang will be used as CUDA compiler"; TF_CUDA_CLANG=1;; + [Nn]* ) echo "nvcc will be used as CUDA compiler"; TF_CUDA_CLANG=0;; + "" ) echo "nvcc will be used as CUDA compiler"; TF_CUDA_CLANG=0;; + * ) echo "Invalid selection: " $INPUT;; + esac +done + +export TF_CUDA_CLANG +write_action_env_to_bazelrc "TF_CUDA_CLANG" "$TF_CUDA_CLANG" + +# Set up which clang we should use as the cuda / host compiler. +while [[ "$TF_CUDA_CLANG" == "1" ]] && true; do fromuser="" - if [ -z "$GCC_HOST_COMPILER_PATH" ]; then - default_gcc_host_compiler_path=$(which gcc || true) - read -p "Please specify which gcc should be used by nvcc as the host compiler. [Default is $default_gcc_host_compiler_path]: " GCC_HOST_COMPILER_PATH + if [ -z "$CLANG_CUDA_COMPILER_PATH" ]; then + default_clang_host_compiler_path=$(which clang || true) + read -p "Please specify which clang should be used as device and host compiler. [Default is $default_clang_host_compiler_path]: " CLANG_CUDA_COMPILER_PATH fromuser="1" - if [ -z "$GCC_HOST_COMPILER_PATH" ]; then - GCC_HOST_COMPILER_PATH="$default_gcc_host_compiler_path" + if [ -z "$CLANG_CUDA_COMPILER_PATH" ]; then + CLANG_CUDA_COMPILER_PATH="$default_clang_host_compiler_path" fi fi - if [ -e "$GCC_HOST_COMPILER_PATH" ]; then - export GCC_HOST_COMPILER_PATH + if [ -e "$CLANG_CUDA_COMPILER_PATH" ]; then + export CLANG_CUDA_COMPILER_PATH + write_action_env_to_bazelrc "CLANG_CUDA_COMPILER_PATH" "$CLANG_CUDA_COMPILER_PATH" break fi - echo "Invalid gcc path. ${GCC_HOST_COMPILER_PATH} cannot be found" 1>&2 + echo "Invalid clang path. ${CLANG_CUDA_COMPILER_PATH} cannot be found" 1>&2 if [ -z "$fromuser" ]; then exit 1 fi - GCC_HOST_COMPILER_PATH="" + CLANG_CUDA_COMPILER_PATH="" # Retry done @@ -325,7 +481,7 @@ done while true; do # Configure the Cuda SDK version to use. if [ -z "$TF_CUDA_VERSION" ]; then - read -p "Please specify the CUDA SDK version you want to use, e.g. 7.0. [Leave empty to use system default]: " TF_CUDA_VERSION + read -p "Please specify the CUDA SDK version you want to use, e.g. 7.0. [Leave empty to default to CUDA 8.0]: " TF_CUDA_VERSION fi fromuser="" @@ -337,6 +493,11 @@ while true; do else default_cuda_path="$(cygpath -m "$CUDA_PATH")" fi + elif is_linux; then + # If the default doesn't exist, try an alternative default. + if [ ! -d $default_cuda_path ] && [ -d /opt/cuda ]; then + default_cuda_path=/opt/cuda + fi fi read -p "Please specify the location where CUDA $TF_CUDA_VERSION toolkit is installed. Refer to README.md for more details. [Default is $default_cuda_path]: " CUDA_TOOLKIT_PATH fromuser="1" @@ -361,6 +522,7 @@ while true; do if [ -e "${CUDA_TOOLKIT_PATH}/${CUDA_RT_LIB_PATH}" ]; then export CUDA_TOOLKIT_PATH + write_action_env_to_bazelrc "CUDA_TOOLKIT_PATH" "$CUDA_TOOLKIT_PATH" export TF_CUDA_VERSION break fi @@ -374,11 +536,47 @@ while true; do CUDA_TOOLKIT_PATH="" done +# Set default CUDA version if not set +if [ -z "$TF_CUDA_VERSION" ]; then + TF_CUDA_VERSION="8.0" + export TF_CUDA_VERSION +fi +write_action_env_to_bazelrc "TF_CUDA_VERSION" "$TF_CUDA_VERSION" + +# Set up which gcc nvcc should use as the host compiler +# No need to set this on Windows +while [[ "$TF_CUDA_CLANG" != "1" ]] && ! 
is_windows && true; do + fromuser="" + if [ -z "$GCC_HOST_COMPILER_PATH" ]; then + default_gcc_host_compiler_path=$(which gcc || true) + cuda_bin_symlink="$CUDA_TOOLKIT_PATH/bin/gcc" + if [ -L "$cuda_bin_symlink" ]; then + default_gcc_host_compiler_path=$(readlink $cuda_bin_symlink) + fi + read -p "Please specify which gcc should be used by nvcc as the host compiler. [Default is $default_gcc_host_compiler_path]: " GCC_HOST_COMPILER_PATH + fromuser="1" + if [ -z "$GCC_HOST_COMPILER_PATH" ]; then + GCC_HOST_COMPILER_PATH="$default_gcc_host_compiler_path" + fi + fi + if [ -e "$GCC_HOST_COMPILER_PATH" ]; then + export GCC_HOST_COMPILER_PATH + write_action_env_to_bazelrc "GCC_HOST_COMPILER_PATH" "$GCC_HOST_COMPILER_PATH" + break + fi + echo "Invalid gcc path. ${GCC_HOST_COMPILER_PATH} cannot be found" 1>&2 + if [ -z "$fromuser" ]; then + exit 1 + fi + GCC_HOST_COMPILER_PATH="" + # Retry +done + # Find out where the cuDNN library is installed while true; do - # Configure the Cudnn version to use. + # Configure the cuDNN version to use. if [ -z "$TF_CUDNN_VERSION" ]; then - read -p "Please specify the Cudnn version you want to use. [Leave empty to use system default]: " TF_CUDNN_VERSION + read -p "Please specify the cuDNN version you want to use. [Leave empty to default to cuDNN 6.0]: " TF_CUDNN_VERSION fi fromuser="" @@ -389,7 +587,7 @@ while true; do if [ -z "$CUDNN_INSTALL_PATH" ]; then CUDNN_INSTALL_PATH=$default_cudnn_path fi - # Result returned from "read" will be used unexpanded. That make "~" unuseable. + # Result returned from "read" will be used unexpanded. That make "~" unusable. # Going through one more level of expansion to handle that. CUDNN_INSTALL_PATH=`"${PYTHON_BIN_PATH}" -c "import os; print(os.path.realpath(os.path.expanduser('${CUDNN_INSTALL_PATH}')))"` fi @@ -411,17 +609,26 @@ while true; do CUDA_DNN_LIB_ALT_PATH="libcudnn${TF_CUDNN_EXT}.dylib" fi - if [ -e "$CUDNN_INSTALL_PATH/${CUDA_DNN_LIB_ALT_PATH}" -o -e "$CUDNN_INSTALL_PATH/${CUDA_DNN_LIB_PATH}" ]; then + if [ -e "$CUDNN_INSTALL_PATH/${CUDA_DNN_LIB_ALT_PATH}" ] || [ -e "$CUDNN_INSTALL_PATH/${CUDA_DNN_LIB_PATH}" ]; then export TF_CUDNN_VERSION + write_action_env_to_bazelrc "TF_CUDNN_VERSION" "$TF_CUDNN_VERSION" export CUDNN_INSTALL_PATH + write_action_env_to_bazelrc "CUDNN_INSTALL_PATH" "$CUDNN_INSTALL_PATH" break fi if is_linux; then - CUDNN_PATH_FROM_LDCONFIG="$(ldconfig -p | sed -n 's/.*libcudnn.so .* => \(.*\)/\1/p')" + if ! type ldconfig > /dev/null 2>&1; then + LDCONFIG_BIN=/sbin/ldconfig + else + LDCONFIG_BIN=ldconfig + fi + CUDNN_PATH_FROM_LDCONFIG="$($LDCONFIG_BIN -p | sed -n 's/.*libcudnn.so .* => \(.*\)/\1/p')" if [ -e "${CUDNN_PATH_FROM_LDCONFIG}${TF_CUDNN_EXT}" ]; then export TF_CUDNN_VERSION - export CUDNN_INSTALL_PATH="$(dirname ${CUDNN_PATH_FROM_LDCONFIG})" + export CUDNN_INSTALL_PATH + CUDNN_INSTALL_PATH="$(dirname ${CUDNN_PATH_FROM_LDCONFIG})" + write_action_env_to_bazelrc "CUDNN_INSTALL_PATH" "$CUDNN_INSTALL_PATH" break fi fi @@ -440,6 +647,13 @@ while true; do CUDNN_INSTALL_PATH="" done +# Set default CUDNN version if not set +if [ -z "$TF_CUDNN_VERSION" ]; then + TF_CUDNN_VERSION="6" + export TF_CUDNN_VERSION +fi +write_action_env_to_bazelrc "TF_CUDNN_VERSION" "$TF_CUDNN_VERSION" + # Configure the compute capabilities that TensorFlow builds for. # Since Cuda toolkit is not backward-compatible, this is not guaranteed to work. 
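The compute capability prompt in the loop below collects a comma-separated list of `major.minor` values, and with this change the result is mirrored into `.tf_configure.bazelrc` instead of living only in the current shell's environment. A hedged sketch of the net effect (the `3.5,5.2` value is an example, not a default introduced by this patch):

```bash
# Example value only; the patch does not hard-code "3.5,5.2".
# After the selection loop below succeeds, the script has effectively done:
export TF_CUDA_COMPUTE_CAPABILITIES="3.5,5.2"
echo "build --action_env TF_CUDA_COMPUTE_CAPABILITIES=\"3.5,5.2\"" >> .tf_configure.bazelrc
```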
while true; do @@ -473,6 +687,7 @@ EOF fi else export TF_CUDA_COMPUTE_CAPABILITIES + write_action_env_to_bazelrc "TF_CUDA_COMPUTE_CAPABILITIES" "$TF_CUDA_COMPUTE_CAPABILITIES" break fi TF_CUDA_COMPUTE_CAPABILITIES="" @@ -483,9 +698,15 @@ if is_windows; then export CUDA_PATH="$CUDA_TOOLKIT_PATH" export CUDA_COMPUTE_CAPABILITIES="$TF_CUDA_COMPUTE_CAPABILITIES" export NO_WHOLE_ARCHIVE_OPTION=1 - - # Set GCC_HOST_COMPILER_PATH to keep cuda_configure.bzl happy - export GCC_HOST_COMPILER_PATH="/usr/bin/dummy_compiler" + write_action_env_to_bazelrc "CUDA_PATH" "$CUDA_PATH" + write_action_env_to_bazelrc "CUDA_COMPUTE_CAPABILITIES" "$CUDA_COMPUTE_CAPABILITIES" + write_action_env_to_bazelrc "NO_WHOLE_ARCHIVE_OPTION" "1" + write_to_bazelrc "build --config=win-cuda" + write_to_bazelrc "test --config=win-cuda" +else + # If CUDA is enabled, always use GPU during build and test. + write_to_bazelrc "build --config=cuda" + write_to_bazelrc "test --config=cuda" fi # end of if "$TF_NEED_CUDA" == "1" @@ -499,7 +720,7 @@ if [ "$TF_NEED_OPENCL" == "1" ]; then while true; do fromuser="" if [ -z "$HOST_CXX_COMPILER" ]; then - default_cxx_host_compiler=$(which clang++-3.6 || true) + default_cxx_host_compiler=$(which g++ || true) read -p "Please specify which C++ compiler should be used as the host C++ compiler. [Default is $default_cxx_host_compiler]: " HOST_CXX_COMPILER fromuser="1" if [ -z "$HOST_CXX_COMPILER" ]; then @@ -508,6 +729,7 @@ while true; do fi if [ -e "$HOST_CXX_COMPILER" ]; then export HOST_CXX_COMPILER + write_action_env_to_bazelrc "HOST_CXX_COMPILER" "$HOST_CXX_COMPILER" break fi echo "Invalid C++ compiler path. ${HOST_CXX_COMPILER} cannot be found" 1>&2 @@ -522,7 +744,7 @@ done while true; do fromuser="" if [ -z "$HOST_C_COMPILER" ]; then - default_c_host_compiler=$(which clang-3.6 || true) + default_c_host_compiler=$(which gcc || true) read -p "Please specify which C compiler should be used as the host C compiler. [Default is $default_c_host_compiler]: " HOST_C_COMPILER fromuser="1" if [ -z "$HOST_C_COMPILER" ]; then @@ -531,6 +753,7 @@ while true; do fi if [ -e "$HOST_C_COMPILER" ]; then export HOST_C_COMPILER + write_action_env_to_bazelrc "HOST_C_COMPILER" "$HOST_C_COMPILER" break fi echo "Invalid C compiler path. ${HOST_C_COMPILER} cannot be found" 1>&2 @@ -561,6 +784,7 @@ while true; do if [ -e "${COMPUTECPP_TOOLKIT_PATH}/${SYCL_RT_LIB_PATH}" ]; then export COMPUTECPP_TOOLKIT_PATH + write_action_env_to_bazelrc "COMPUTECPP_TOOLKIT_PATH" "$COMPUTECPP_TOOLKIT_PATH" break fi echo "Invalid SYCL $TF_OPENCL_VERSION library path. ${COMPUTECPP_TOOLKIT_PATH}/${SYCL_RT_LIB_PATH} cannot be found" @@ -576,6 +800,82 @@ done # end of if "$TF_NEED_OPENCL" == "1" fi -bazel_clean_and_fetch + +while [ "$TF_NEED_MPI" == "" ]; do + read -p "Do you wish to build TensorFlow with "\ +"MPI support? [y/N] " INPUT + case $INPUT in + [Yy]* ) echo "MPI support will be enabled for "\ +"TensorFlow"; TF_NEED_MPI=1;; + [Nn]* ) echo "MPI support will not be enabled for "\ +"TensorFlow"; TF_NEED_MPI=0;; + "" ) echo "MPI support will not be enabled for "\ +"TensorFlow"; TF_NEED_MPI=0;; + * ) echo "Invalid selection: " $INPUT;; + esac +done + +# Find out where the MPI toolkit is installed +while true; do + if [ "$TF_NEED_MPI" == "0" ]; then + break; + fi + + fromuser="" + if [ -z "$MPI_HOME" ]; then + #Get the base folder by removing the bin path + default_mpi_path=$(dirname $(dirname $(which mpirun)) || dirname $(dirname $(which mpiexec)) || true) + read -p "Please specify the MPI toolkit folder. 
[Default is $default_mpi_path]: " MPI_HOME + fromuser="1" + if [ -z "$MPI_HOME" ]; then + MPI_HOME=$default_mpi_path + fi + fi + + #Check that the include and library folders are where we expect them to be + if [ -e "$MPI_HOME/include" ] && [ -e "$MPI_HOME/lib" ]; then + break + fi + + echo "Invalid path to the MPI Toolkit. ${MPI_HOME}/include or ${MPI_HOME}/lib cannot be found." + if [ -z "$fromuser" ]; then + exit 1 + fi + + # Retry + MPI_HOME="" +done + + +if [ "$TF_NEED_MPI" == "1" ]; then + write_to_bazelrc 'build --define with_mpi_support=true' + + #Link the MPI header files + ln -sf "${MPI_HOME}/include/mpi.h" third_party/mpi/mpi.h + + + #Determine if we use OpenMPI or MVAPICH, these require different header files + #to be included here to make bazel dependency checker happy + + if [ -e "${MPI_HOME}/include/mpi_portable_platform.h" ]; then + #OpenMPI + ln -sf "${MPI_HOME}/include/mpi_portable_platform.h" third_party/mpi/ + sed -i -e "s/MPI_LIB_IS_OPENMPI=False/MPI_LIB_IS_OPENMPI=True/" third_party/mpi/mpi.bzl + else + #MVAPICH / MPICH + ln -sf "${MPI_HOME}/include/mpio.h" third_party/mpi/ + ln -sf "${MPI_HOME}/include/mpicxx.h" third_party/mpi/ + sed -i -e "s/MPI_LIB_IS_OPENMPI=True/MPI_LIB_IS_OPENMPI=False/" third_party/mpi/mpi.bzl + fi + + + if [ -e "${MPI_HOME}/lib/libmpi.so" ]; then + ln -sf "${MPI_HOME}/lib/libmpi.so" third_party/mpi/ + else + echo "Cannot find the MPI library file in ${MPI_HOME}/lib " + exit 1 + fi +fi + echo "Configuration finished" diff --git a/tensorflow/BUILD b/tensorflow/BUILD index 999e11c0e91..6450b2ad878 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -14,8 +14,33 @@ exports_files([ # Config setting for determining if we are building for Android. config_setting( name = "android", + values = {"crosstool_top": "//external:android/crosstool"}, + visibility = ["//visibility:public"], +) + +config_setting( + name = "android_x86", values = { "crosstool_top": "//external:android/crosstool", + "cpu": "x86", + }, + visibility = ["//visibility:public"], +) + +config_setting( + name = "android_x86_64", + values = { + "crosstool_top": "//external:android/crosstool", + "cpu": "x86_64", + }, + visibility = ["//visibility:public"], +) + +config_setting( + name = "android_armeabi", + values = { + "cc_target_os": "android", + "cpu": "armeabi", }, visibility = ["//visibility:public"], ) @@ -46,6 +71,12 @@ config_setting( config_setting( name = "windows", + values = {"cpu": "x64_windows"}, + visibility = ["//visibility:public"], +) + +config_setting( + name = "windows_msvc", values = {"cpu": "x64_windows_msvc"}, visibility = ["//visibility:public"], ) @@ -58,9 +89,7 @@ config_setting( config_setting( name = "ios", - values = { - "crosstool_top": "//tools/osx/crosstool:crosstool", - }, + values = {"crosstool_top": "//tools/osx/crosstool:crosstool"}, visibility = ["//visibility:public"], ) @@ -70,6 +99,12 @@ config_setting( visibility = ["//visibility:public"], ) +config_setting( + name = "linux_ppc64le", + values = {"cpu": "ppc"}, + visibility = ["//visibility:public"], +) + config_setting( name = "debug", values = { @@ -86,6 +121,61 @@ config_setting( visibility = ["//visibility:public"], ) +config_setting( + name = "freebsd", + values = {"cpu": "freebsd"}, + visibility = ["//visibility:public"], +) + +# TODO(jhseu): Enable on other platforms other than Linux. 
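The `config_setting` stanzas added around here (the jemalloc ones the TODO above refers to follow immediately) expose `--define` flags that BUILD rules can `select()` on. The configure changes above already write one of these when MPI is enabled, namely `build --define with_mpi_support=true`; the same define can also be passed per invocation. An illustrative command, not one taken from this patch:

```bash
# Hypothetical one-off build with the MPI paths enabled, matching the
# "with_mpi_support" config_setting defined below; any buildable target works.
bazel build --define with_mpi_support=true //tensorflow/...
```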
+config_setting( + name = "with_jemalloc_linux_x86_64", + values = { + "cpu": "k8", + "define": "with_jemalloc=true", + }, + visibility = ["//visibility:public"], +) + +config_setting( + name = "with_jemalloc_linux_ppc64le", + values = { + "cpu": "ppc", + "define": "with_jemalloc=true", + }, + visibility = ["//visibility:public"], +) + +config_setting( + name = "with_gcp_support", + values = {"define": "with_gcp_support=true"}, + visibility = ["//visibility:public"], +) + +config_setting( + name = "with_hdfs_support", + values = {"define": "with_hdfs_support=true"}, + visibility = ["//visibility:public"], +) + +config_setting( + name = "with_xla_support", + values = {"define": "with_xla_support=true"}, + visibility = ["//visibility:public"], +) + +config_setting( + name = "with_verbs_support", + values = {"define": "with_verbs_support=true"}, + visibility = ["//visibility:public"], +) + +config_setting( + name = "with_mpi_support", + values = {"define": "with_mpi_support=true"}, + visibility = ["//visibility:public"], +) + package_group( name = "internal", packages = ["//tensorflow/..."], @@ -123,7 +213,9 @@ filegroup( "//tensorflow/compiler/aot/tests:all_files", "//tensorflow/compiler/jit:all_files", "//tensorflow/compiler/jit/graphcycles:all_files", + "//tensorflow/compiler/jit/kernels:all_files", "//tensorflow/compiler/jit/legacy_flags:all_files", + "//tensorflow/compiler/jit/ops:all_files", "//tensorflow/compiler/tests:all_files", "//tensorflow/compiler/tf2xla:all_files", "//tensorflow/compiler/tf2xla/kernels:all_files", @@ -131,7 +223,6 @@ filegroup( "//tensorflow/compiler/xla/client:all_files", "//tensorflow/compiler/xla/client/lib:all_files", "//tensorflow/compiler/xla/legacy_flags:all_files", - "//tensorflow/compiler/xla/port:all_files", "//tensorflow/compiler/xla/service:all_files", "//tensorflow/compiler/xla/service/cpu:all_files", "//tensorflow/compiler/xla/service/gpu:all_files", @@ -141,11 +232,28 @@ filegroup( "//tensorflow/compiler/xla/tools:all_files", "//tensorflow/contrib:all_files", "//tensorflow/contrib/android:all_files", + "//tensorflow/contrib/batching:all_files", + "//tensorflow/contrib/batching/kernels:all_files", + "//tensorflow/contrib/batching/test_util:all_files", + "//tensorflow/contrib/batching/util:all_files", "//tensorflow/contrib/bayesflow:all_files", + "//tensorflow/contrib/boosted_trees:all_files", + "//tensorflow/contrib/boosted_trees/lib:all_files", + "//tensorflow/contrib/boosted_trees/proto:all_files", + "//tensorflow/contrib/boosted_trees/resources:all_files", + "//tensorflow/contrib/cloud:all_files", + "//tensorflow/contrib/cloud/kernels:all_files", + "//tensorflow/contrib/cluster_resolver:all_files", "//tensorflow/contrib/compiler:all_files", "//tensorflow/contrib/copy_graph:all_files", "//tensorflow/contrib/crf:all_files", "//tensorflow/contrib/cudnn_rnn:all_files", + "//tensorflow/contrib/data:all_files", + "//tensorflow/contrib/data/python/framework:all_files", + "//tensorflow/contrib/data/python/kernel_tests:all_files", + "//tensorflow/contrib/data/python/ops:all_files", + "//tensorflow/contrib/data/python/util:all_files", + "//tensorflow/contrib/decision_trees:all_files", "//tensorflow/contrib/distributions:all_files", "//tensorflow/contrib/factorization:all_files", "//tensorflow/contrib/factorization/kernels:all_files", @@ -154,10 +262,15 @@ filegroup( "//tensorflow/contrib/framework:all_files", "//tensorflow/contrib/graph_editor:all_files", "//tensorflow/contrib/grid_rnn:all_files", + "//tensorflow/contrib/hooks:all_files", + 
"//tensorflow/contrib/hvx/hvx_ops_support_checker:all_files", "//tensorflow/contrib/image:all_files", + "//tensorflow/contrib/imperative:all_files", "//tensorflow/contrib/input_pipeline:all_files", "//tensorflow/contrib/input_pipeline/kernels:all_files", "//tensorflow/contrib/integrate:all_files", + "//tensorflow/contrib/keras:all_files", + "//tensorflow/contrib/kernel_methods:all_files", "//tensorflow/contrib/labeled_tensor:all_files", "//tensorflow/contrib/layers:all_files", "//tensorflow/contrib/layers/kernels:all_files", @@ -172,30 +285,44 @@ filegroup( "//tensorflow/contrib/nn:all_files", "//tensorflow/contrib/opt:all_files", "//tensorflow/contrib/rnn:all_files", + "//tensorflow/contrib/saved_model:all_files", + "//tensorflow/contrib/saved_model/cc/saved_model:all_files", "//tensorflow/contrib/seq2seq:all_files", "//tensorflow/contrib/session_bundle:all_files", "//tensorflow/contrib/session_bundle/example:all_files", + "//tensorflow/contrib/signal:all_files", "//tensorflow/contrib/slim:all_files", "//tensorflow/contrib/slim/python/slim/data:all_files", "//tensorflow/contrib/slim/python/slim/nets:all_files", "//tensorflow/contrib/solvers:all_files", "//tensorflow/contrib/sparsemax:all_files", "//tensorflow/contrib/specs:all_files", + "//tensorflow/contrib/staging:all_files", "//tensorflow/contrib/stat_summarizer:all_files", + "//tensorflow/contrib/stateless:all_files", "//tensorflow/contrib/tensor_forest:all_files", "//tensorflow/contrib/tensor_forest/hybrid:all_files", "//tensorflow/contrib/tensorboard:all_files", "//tensorflow/contrib/testing:all_files", + "//tensorflow/contrib/text:all_files", "//tensorflow/contrib/tfprof/python/tools/tfprof:all_files", "//tensorflow/contrib/training:all_files", "//tensorflow/contrib/util:all_files", + "//tensorflow/contrib/verbs:all_files", + "//tensorflow/contrib/xla_tf_graph:all_files", "//tensorflow/core:all_files", "//tensorflow/core/debug:all_files", "//tensorflow/core/distributed_runtime:all_files", "//tensorflow/core/distributed_runtime/rpc:all_files", + "//tensorflow/core/grappler:all_files", + "//tensorflow/core/grappler/clusters:all_files", + "//tensorflow/core/grappler/costs:all_files", + "//tensorflow/core/grappler/inputs:all_files", + "//tensorflow/core/grappler/optimizers:all_files", + "//tensorflow/core/grappler/utils:all_files", "//tensorflow/core/kernels:all_files", - "//tensorflow/core/kernels/cloud:all_files", "//tensorflow/core/kernels/hexagon:all_files", + "//tensorflow/core/kernels/neon:all_files", "//tensorflow/core/ops/compat:all_files", "//tensorflow/core/platform/cloud:all_files", "//tensorflow/core/platform/default/build_config:all_files", @@ -203,6 +330,7 @@ filegroup( "//tensorflow/core/util/ctc:all_files", "//tensorflow/core/util/tensor_bundle:all_files", "//tensorflow/examples/android:all_files", + "//tensorflow/examples/benchmark:all_files", "//tensorflow/examples/how_tos/reading_data:all_files", "//tensorflow/examples/image_retraining:all_files", "//tensorflow/examples/label_image:all_files", @@ -211,30 +339,87 @@ filegroup( "//tensorflow/examples/tutorials/estimators:all_files", "//tensorflow/examples/tutorials/mnist:all_files", "//tensorflow/examples/tutorials/word2vec:all_files", - "//tensorflow/g3doc/how_tos/adding_an_op:all_files", - "//tensorflow/g3doc/tutorials:all_files", + "//tensorflow/examples/wav_to_spectrogram:all_files", "//tensorflow/go:all_files", "//tensorflow/java:all_files", "//tensorflow/java/src/main/java/org/tensorflow/examples:all_files", "//tensorflow/java/src/main/native:all_files", 
"//tensorflow/python:all_files", "//tensorflow/python/debug:all_files", + "//tensorflow/python/estimator:all_files", + "//tensorflow/python/feature_column:all_files", "//tensorflow/python/kernel_tests:all_files", + "//tensorflow/python/kernel_tests/distributions:all_files", + "//tensorflow/python/ops/distributions:all_files", "//tensorflow/python/saved_model:all_files", "//tensorflow/python/tools:all_files", "//tensorflow/tensorboard:all_files", - "//tensorflow/tensorboard/app:all_files", "//tensorflow/tensorboard/backend:all_files", + "//tensorflow/tensorboard/backend/event_processing:all_files", "//tensorflow/tensorboard/components:all_files", - "//tensorflow/tensorboard/components/vz_data_summary:all_files", + "//tensorflow/tensorboard/components/tf_audio_dashboard:all_files", + "//tensorflow/tensorboard/components/tf_audio_dashboard/test:all_files", + "//tensorflow/tensorboard/components/tf_backend:all_files", + "//tensorflow/tensorboard/components/tf_backend/test:all_files", + "//tensorflow/tensorboard/components/tf_color_scale:all_files", + "//tensorflow/tensorboard/components/tf_color_scale/test:all_files", + "//tensorflow/tensorboard/components/tf_dashboard_common:all_files", + "//tensorflow/tensorboard/components/tf_dashboard_common/test:all_files", + "//tensorflow/tensorboard/components/tf_distribution_dashboard:all_files", + "//tensorflow/tensorboard/components/tf_globals:all_files", + "//tensorflow/tensorboard/components/tf_graph:all_files", + "//tensorflow/tensorboard/components/tf_graph/demo:all_files", + "//tensorflow/tensorboard/components/tf_graph_app:all_files", + "//tensorflow/tensorboard/components/tf_graph_app/demo:all_files", + "//tensorflow/tensorboard/components/tf_graph_board:all_files", + "//tensorflow/tensorboard/components/tf_graph_board/demo:all_files", + "//tensorflow/tensorboard/components/tf_graph_common:all_files", + "//tensorflow/tensorboard/components/tf_graph_controls:all_files", + "//tensorflow/tensorboard/components/tf_graph_controls/demo:all_files", + "//tensorflow/tensorboard/components/tf_graph_dashboard:all_files", + "//tensorflow/tensorboard/components/tf_graph_dashboard/demo:all_files", + "//tensorflow/tensorboard/components/tf_graph_debugger_data_card:all_files", + "//tensorflow/tensorboard/components/tf_graph_debugger_data_card/demo:all_files", + "//tensorflow/tensorboard/components/tf_graph_info:all_files", + "//tensorflow/tensorboard/components/tf_graph_info/demo:all_files", + "//tensorflow/tensorboard/components/tf_graph_loader:all_files", + "//tensorflow/tensorboard/components/tf_graph_loader/demo:all_files", + "//tensorflow/tensorboard/components/tf_histogram_dashboard:all_files", + "//tensorflow/tensorboard/components/tf_image_dashboard:all_files", + "//tensorflow/tensorboard/components/tf_imports:all_files", + "//tensorflow/tensorboard/components/tf_option_selector:all_files", + "//tensorflow/tensorboard/components/tf_profile_dashboard:all_files", + "//tensorflow/tensorboard/components/tf_profile_dashboard/demo:all_files", + "//tensorflow/tensorboard/components/tf_runs_selector:all_files", + "//tensorflow/tensorboard/components/tf_scalar_dashboard:all_files", + "//tensorflow/tensorboard/components/tf_scalar_dashboard/demo:all_files", + "//tensorflow/tensorboard/components/tf_storage:all_files", + "//tensorflow/tensorboard/components/tf_storage/test:all_files", + "//tensorflow/tensorboard/components/tf_tensorboard:all_files", + "//tensorflow/tensorboard/components/tf_text_dashboard:all_files", + 
"//tensorflow/tensorboard/components/tf_trace_viewer:all_files", + "//tensorflow/tensorboard/components/vz_distribution_chart:all_files", + "//tensorflow/tensorboard/components/vz_histogram_timeseries:all_files", "//tensorflow/tensorboard/components/vz_line_chart:all_files", - "//tensorflow/tensorboard/components/vz_line_chart/demo:all_files", "//tensorflow/tensorboard/components/vz_projector:all_files", + "//tensorflow/tensorboard/components/vz_projector/test:all_files", "//tensorflow/tensorboard/components/vz_sorting:all_files", "//tensorflow/tensorboard/components/vz_sorting/test:all_files", - "//tensorflow/tensorboard/lib:all_files", - "//tensorflow/tensorboard/lib/python:all_files", + "//tensorflow/tensorboard/demo:all_files", + "//tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize:all_files", + "//tensorflow/tensorboard/plugins:all_files", + "//tensorflow/tensorboard/plugins/audio:all_files", + "//tensorflow/tensorboard/plugins/distributions:all_files", + "//tensorflow/tensorboard/plugins/graphs:all_files", + "//tensorflow/tensorboard/plugins/histograms:all_files", + "//tensorflow/tensorboard/plugins/images:all_files", + "//tensorflow/tensorboard/plugins/projector:all_files", + "//tensorflow/tensorboard/plugins/scalars:all_files", + "//tensorflow/tensorboard/plugins/text:all_files", "//tensorflow/tensorboard/scripts:all_files", + "//tensorflow/tools/api/golden:all_files", + "//tensorflow/tools/api/lib:all_files", + "//tensorflow/tools/api/tests:all_files", "//tensorflow/tools/common:all_files", "//tensorflow/tools/compatibility:all_files", "//tensorflow/tools/dist_test/server:all_files", @@ -247,6 +432,7 @@ filegroup( "//tensorflow/tools/test:all_files", "//tensorflow/tools/tfprof:all_files", "//tensorflow/tools/tfprof/internal:all_files", + "//tensorflow/tools/tfprof/internal/advisor:all_files", "//tensorflow/user_ops:all_files", "//third_party/hadoop:all_files", "//third_party/sycl:all_files", @@ -269,23 +455,35 @@ filegroup( ), ) +filegroup( + name = "docs_src", + data = glob(["docs_src/**/*.md"]), +) + # ------------------------------------------- # New rules should be added above this target. # ------------------------------------------- cc_binary( name = "libtensorflow.so", + linkopts = select({ + "//tensorflow:darwin": [ + "-Wl,-exported_symbols_list", # This line must be directly followed by the exported_symbols.lds file + "//tensorflow/c:exported_symbols.lds", + ], + "//tensorflow:windows": [], + "//tensorflow:windows_msvc": [], + "//conditions:default": [ + "-z defs", + "-s", + "-Wl,--version-script", # This line must be directly followed by the version_script.lds file + "//tensorflow/c:version_script.lds", + ], + }), linkshared = 1, deps = [ "//tensorflow/c:c_api", - "//tensorflow/core:tensorflow", - ], -) - -cc_binary( - name = "libtensorflow_c.so", - linkshared = 1, - deps = [ - "//tensorflow/c:c_api", + "//tensorflow/c:exported_symbols.lds", + "//tensorflow/c:version_script.lds", "//tensorflow/core:tensorflow", ], ) @@ -296,6 +494,8 @@ cc_binary( deps = [ "//tensorflow/c:c_api", "//tensorflow/cc:cc_ops", + "//tensorflow/cc:client_session", + "//tensorflow/cc:scope", "//tensorflow/core:tensorflow", ], ) diff --git a/tensorflow/__init__.py b/tensorflow/__init__.py index 92d390a9764..083634bd796 100644 --- a/tensorflow/__init__.py +++ b/tensorflow/__init__.py @@ -24,16 +24,19 @@ from __future__ import print_function from tensorflow.python import * # pylint: enable=wildcard-import -# Lazily import the `tf.contrib` module. 
This avoids loading all of the -# dependencies of `tf.contrib` at `import tensorflow` time. -class _LazyContribLoader(object): +from tensorflow.python.util.lazy_loader import LazyLoader +contrib = LazyLoader('contrib', globals(), 'tensorflow.contrib') +del LazyLoader - def __getattr__(self, item): - global contrib - # Replace the lazy loader with the imported module itself. - import importlib # pylint: disable=g-import-not-at-top - contrib = importlib.import_module('tensorflow.contrib') - return getattr(contrib, item) +del absolute_import +del division +del print_function - -contrib = _LazyContribLoader() +# These symbols appear because we import the python package which +# in turn imports from tensorflow.core and tensorflow.python. They +# must come from this module. So python adds these symbols for the +# resolution to succeed. +# pylint: disable=undefined-variable +del python +del core +# pylint: enable=undefined-variable diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD index e0a4272ee22..3ab4e8efcdb 100644 --- a/tensorflow/c/BUILD +++ b/tensorflow/c/BUILD @@ -26,6 +26,22 @@ filegroup( visibility = ["//tensorflow:__subpackages__"], ) +tf_cuda_library( + name = "c_api_internal", + srcs = ["c_api.h"], + hdrs = ["c_api_internal.h"], + deps = select({ + "//tensorflow:android": [ + "//tensorflow/core:android_tensorflow_lib_lite", + ], + "//conditions:default": [ + "//tensorflow/core:core_cpu", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + ], + }), +) + tf_cuda_library( name = "c_api", srcs = ["c_api.cc"], @@ -34,10 +50,16 @@ tf_cuda_library( visibility = ["//visibility:public"], deps = select({ "//tensorflow:android": [ + ":c_api_internal", "//tensorflow/core:android_tensorflow_lib_lite", ], "//conditions:default": [ + ":c_api_internal", "//tensorflow/cc/saved_model:loader", + "//tensorflow/cc:gradients", + "//tensorflow/cc:ops", + "//tensorflow/cc:grad_ops", + "//tensorflow/cc:scope_internal", "//tensorflow/core:core_cpu", "//tensorflow/core:framework", "//tensorflow/core:lib", @@ -45,6 +67,14 @@ tf_cuda_library( }), ) +exports_files( + [ + "version_script.lds", + "exported_symbols.lds", + ], + visibility = ["//visibility:public"], +) + tf_cuda_library( name = "tf_status_helper", srcs = ["tf_status_helper.cc"], @@ -89,20 +119,22 @@ tf_cc_test( # linkstatic = tf_kernel_tests_linkstatic(), deps = [ ":c_api", + "//tensorflow/cc:cc_ops", + "//tensorflow/cc:grad_ops", "//tensorflow/cc/saved_model:signature_constants", "//tensorflow/cc/saved_model:tag_constants", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:direct_session", "//tensorflow/core:framework", + "//tensorflow/core:framework_internal", "//tensorflow/core:lib", "//tensorflow/core:proto_text", "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", "//tensorflow/core:test_main", - "//tensorflow/core:testlib", "//tensorflow/core/kernels:array", + "//tensorflow/core/kernels:control_flow_ops", "//tensorflow/core/kernels:math", - "//third_party/eigen3", ], ) diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc index a51a3ca69e9..77faa475ed4 100644 --- a/tensorflow/c/c_api.cc +++ b/tensorflow/c/c_api.cc @@ -21,8 +21,12 @@ limitations under the License. 
#include #ifndef __ANDROID__ +#include "tensorflow/cc/framework/gradients.h" +#include "tensorflow/cc/framework/ops.h" +#include "tensorflow/cc/framework/scope_internal.h" #include "tensorflow/cc/saved_model/loader.h" #endif +#include "tensorflow/c/c_api_internal.h" #include "tensorflow/core/common_runtime/shape_refiner.h" #include "tensorflow/core/framework/log_memory.h" #include "tensorflow/core/framework/node_def_util.h" @@ -52,12 +56,11 @@ limitations under the License. using tensorflow::error::Code; using tensorflow::errors::InvalidArgument; using tensorflow::gtl::ArraySlice; +using tensorflow::strings::StrCat; using tensorflow::AllocationDescription; using tensorflow::DataType; -using tensorflow::Env; using tensorflow::Graph; using tensorflow::GraphDef; -using tensorflow::mutex; using tensorflow::mutex_lock; using tensorflow::NameRangeMap; using tensorflow::NameRangesForNode; @@ -68,11 +71,9 @@ using tensorflow::NodeBuilder; using tensorflow::OpDef; using tensorflow::OpRegistry; using tensorflow::PartialTensorShape; -using tensorflow::Reset; using tensorflow::RunMetadata; using tensorflow::RunOptions; using tensorflow::Session; -using tensorflow::SessionOptions; using tensorflow::Status; using tensorflow::Tensor; using tensorflow::TensorBuffer; @@ -92,9 +93,6 @@ size_t TF_DataTypeSize(TF_DataType dt) { } // -------------------------------------------------------------------------- -struct TF_Status { - Status status; -}; TF_Status* TF_NewStatus() { return new TF_Status; } @@ -134,6 +132,9 @@ class TF_ManagedBuffer : public TensorBuffer { proto->set_requested_bytes(rb); proto->set_allocator_name(tensorflow::cpu_allocator()->Name()); } + + // Prevents input forwarding from mutating this buffer. + bool OwnsMemory() const override { return false; } }; void* allocate_tensor(const char* operation, size_t len) { @@ -175,12 +176,6 @@ Status MessageToBuffer(const tensorflow::protobuf::Message& in, } // namespace -struct TF_Tensor { - TF_DataType dtype; - TensorShape shape; - TensorBuffer* buffer; -}; - TF_Tensor* TF_AllocateTensor(TF_DataType dtype, const int64_t* dims, int num_dims, size_t len) { void* data = allocate_tensor("TF_AllocateTensor", len); @@ -216,6 +211,18 @@ TF_Tensor* TF_NewTensor(TF_DataType dtype, const int64_t* dims, int num_dims, return new TF_Tensor{dtype, TensorShape(dimvec), buf}; } +TF_Tensor* TF_TensorMaybeMove(TF_Tensor* tensor) { + // It is safe to move the Tensor if and only if we own the unique reference to + // it. In that case, we might as well not delete and reallocate, but a future + // implementation might need to do so. 
+ if (tensor->buffer->RefCountIsOne() && + tensor->buffer->root_buffer()->RefCountIsOne() && + tensor->buffer->OwnsMemory()) { + return tensor; + } + return nullptr; +} + void TF_DeleteTensor(TF_Tensor* t) { t->buffer->Unref(); delete t; @@ -273,9 +280,6 @@ size_t TF_StringEncodedSize(size_t len) { } // -------------------------------------------------------------------------- -struct TF_SessionOptions { - SessionOptions options; -}; TF_SessionOptions* TF_NewSessionOptions() { return new TF_SessionOptions; } void TF_DeleteSessionOptions(TF_SessionOptions* opt) { delete opt; } @@ -316,9 +320,6 @@ void TF_DeleteBuffer(TF_Buffer* buffer) { TF_Buffer TF_GetBuffer(TF_Buffer* buffer) { return *buffer; } // -------------------------------------------------------------------------- -struct TF_DeprecatedSession { - Session* session; -}; TF_DeprecatedSession* TF_NewDeprecatedSession(const TF_SessionOptions* opt, TF_Status* status) { @@ -328,7 +329,7 @@ TF_DeprecatedSession* TF_NewDeprecatedSession(const TF_SessionOptions* opt, return new TF_DeprecatedSession({session}); } else { DCHECK_EQ(nullptr, session); - return NULL; + return nullptr; } } @@ -502,7 +503,7 @@ static void TF_Run_Setup(int noutputs, TF_Tensor** c_outputs, TF_Status* status) { status->status = Status::OK(); for (int i = 0; i < noutputs; ++i) { - c_outputs[i] = NULL; + c_outputs[i] = nullptr; } } @@ -542,9 +543,8 @@ static void TF_Run_Helper( if (handle == nullptr) { RunOptions run_options_proto; - if (run_options != nullptr && - !run_options_proto.ParseFromArray(run_options->data, - run_options->length)) { + if (run_options != nullptr && !run_options_proto.ParseFromArray( + run_options->data, run_options->length)) { status->status = InvalidArgument("Unparseable RunOptions proto"); return; } @@ -651,6 +651,7 @@ void TF_PRunSetup(TF_DeprecatedSession* s, memcpy(buf, new_handle.c_str(), new_handle.size() + 1); *handle = buf; } else { + *handle = nullptr; status->status = result; } } @@ -682,11 +683,6 @@ void TF_PRun(TF_DeprecatedSession* s, const char* handle, c_outputs, target_oper_names, nullptr, status); } -struct TF_Library { - void* lib_handle; - TF_Buffer op_list; -}; - TF_Library* TF_LoadLibrary(const char* library_filename, TF_Status* status) { TF_Library* lib_handle = new TF_Library; status->status = tensorflow::LoadLibrary( @@ -714,68 +710,58 @@ TF_Buffer* TF_GetAllOpList() { *(op_list.add_op()) = op; } TF_Buffer* ret = TF_NewBuffer(); - MessageToBuffer(op_list, ret); + TF_CHECK_OK(MessageToBuffer(op_list, ret)); return ret; } +// -------------------------------------------------------------------------- +// ListDevices & SessionListDevices API + +void TF_DeleteDeviceList(TF_DeviceList* s) { delete s; } + +TF_DeviceList* TF_SessionListDevices(TF_Session* session, TF_Status* status) { + TF_DeviceList* response = new TF_DeviceList; + status->status = session->session->ListDevices(&response->response); + return response; +} + +TF_DeviceList* TF_DeprecatedSessionListDevices(TF_DeprecatedSession* session, + TF_Status* status) { + TF_DeviceList* response = new TF_DeviceList; + status->status = session->session->ListDevices(&response->response); + return response; +} + +int TF_DeviceListCount(const TF_DeviceList* list) { + return list->response.size(); +} + +#define TF_DEVICELIST_METHOD(return_type, method_name, accessor, err_val) \ + return_type method_name(const TF_DeviceList* list, const int index, \ + TF_Status* status) { \ + if (list == nullptr) { \ + status->status = InvalidArgument("list is null!"); \ + return err_val; \ + } 
\
+    if (index < 0 || index >= list->response.size()) {          \
+      status->status = InvalidArgument("index out of bounds");  \
+      return err_val;                                           \
+    }                                                           \
+    return list->response[index].accessor;                      \
+  }
+
+TF_DEVICELIST_METHOD(const char*, TF_DeviceListName, name().c_str(), nullptr);
+TF_DEVICELIST_METHOD(const char*, TF_DeviceListType, device_type().c_str(),
+                     nullptr);
+TF_DEVICELIST_METHOD(int64_t, TF_DeviceListMemoryBytes, memory_limit(), -1);
+
+#undef TF_DEVICELIST_METHOD
+
 }  // end extern "C"

 // --------------------------------------------------------------------------
 // New Graph and Session API

-// Structures -----------------------------------------------------------------
-
-extern "C" {
-
-struct TF_Graph {
-  TF_Graph()
-      : graph(OpRegistry::Global()),
-        refiner(graph.op_registry()),
-        num_sessions(0),
-        delete_requested(false) {}
-  mutex mu;
-  Graph graph GUARDED_BY(mu);
-
-  // Runs shape inference.
-  tensorflow::ShapeRefiner refiner GUARDED_BY(mu);
-
-  // Maps from name of an operation to the Node* in 'graph'.
-  std::unordered_map<tensorflow::string, Node*> name_map GUARDED_BY(mu);
-
-  // TF_Graph may only / must be deleted when
-  //   num_sessions == 0 && delete_requested == true
-
-  // num_sessions incremented by TF_NewSession, and decremented by
-  // TF_DeleteSession.
-  int num_sessions GUARDED_BY(mu);
-  bool delete_requested GUARDED_BY(mu);  // set true by TF_DeleteGraph
-};
-
-struct TF_OperationDescription {
-  TF_OperationDescription(TF_Graph* g, const char* op_type,
-                          const char* node_name)
-      : node_builder(node_name, op_type, g->graph.op_registry()), graph(g) {}
-
-  NodeBuilder node_builder;
-  TF_Graph* graph;
-  std::vector<tensorflow::string> colocation_constraints;
-};
-
-struct TF_Operation {
-  Node node;
-};
-
-struct TF_Session {
-  TF_Session(Session* s, TF_Graph* g)
-      : session(s), graph(g), last_num_graph_nodes(0) {}
-  Session* session;
-  TF_Graph* graph;
-  mutex mu;
-  int last_num_graph_nodes;
-};
-
-}  // end extern "C"
-
 // Helper functions -----------------------------------------------------------

 namespace {

@@ -785,15 +771,13 @@ TF_Operation* ToOperation(Node* node) {
 }

 tensorflow::string OutputName(const TF_Output& output) {
-  return tensorflow::strings::StrCat(output.oper->node.name(), ":",
-                                     output.index);
+  return StrCat(output.oper->node.name(), ":", output.index);
 }

 const tensorflow::AttrValue* GetAttrValue(TF_Operation* oper,
                                           const char* attr_name,
                                           TF_Status* status) {
-  const tensorflow::AttrValue* attr =
-      tensorflow::AttrSlice(oper->node.def()).Find(attr_name);
+  const tensorflow::AttrValue* attr = oper->node.attrs().Find(attr_name);
   if (attr == nullptr) {
     status->status =
         InvalidArgument("Operation has no attr named '", attr_name, "'.");
@@ -821,6 +805,7 @@ void TF_GraphSetTensorShape(TF_Graph* graph, TF_Output output,
   }

   std::vector<tensorflow::shape_inference::DimensionHandle> dim_vec;
+  dim_vec.reserve(num_dims);
   for (int i = 0; i < num_dims; ++i) {
     dim_vec.push_back(ic->MakeDim(dims[i]));
   }
@@ -899,10 +884,17 @@ void TF_GraphGetTensorShape(TF_Graph* graph, TF_Output output, int64_t* dims,

 extern "C" {

+static TF_OperationDescription* TF_NewOperationLocked(TF_Graph* graph,
+                                                      const char* op_type,
+                                                      const char* oper_name)
+    EXCLUSIVE_LOCKS_REQUIRED(graph->mu) {
+  return new TF_OperationDescription(graph, op_type, oper_name);
+}
+
 TF_OperationDescription* TF_NewOperation(TF_Graph* graph, const char* op_type,
                                          const char* oper_name) {
   mutex_lock l(graph->mu);
-  return new TF_OperationDescription(graph, op_type, oper_name);
+  return TF_NewOperationLocked(graph, op_type, oper_name);
 }

 void TF_SetDevice(TF_OperationDescription* desc, const char* device) {
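Editorial note: the device-listing entry points above are thin wrappers over Session::ListDevices. For illustration, a caller might enumerate devices roughly as follows -- a minimal sketch assuming an already-created `TF_Session* sess`, with per-accessor status checks abbreviated:

```c
#include <stdio.h>
#include "tensorflow/c/c_api.h"

/* Sketch: enumerate the devices visible to an existing session. */
void print_devices(TF_Session* sess) {
  TF_Status* s = TF_NewStatus();
  TF_DeviceList* devices = TF_SessionListDevices(sess, s);
  if (TF_GetCode(s) == TF_OK) {
    for (int i = 0; i < TF_DeviceListCount(devices); ++i) {
      /* Each accessor validates the index and reports through `s`. */
      const char* name = TF_DeviceListName(devices, i, s);
      const char* type = TF_DeviceListType(devices, i, s);
      int64_t bytes = TF_DeviceListMemoryBytes(devices, i, s);
      printf("%s (%s): %lld bytes\n", name, type, (long long)bytes);
    }
  }
  TF_DeleteDeviceList(devices);
  TF_DeleteStatus(s);
}
```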
@@ -928,8 +920,8 @@ void TF_AddControlInput(TF_OperationDescription* desc, TF_Operation* input) { } void TF_ColocateWith(TF_OperationDescription* desc, TF_Operation* op) { - desc->colocation_constraints.emplace_back(tensorflow::strings::StrCat( - tensorflow::kColocationGroupPrefix, op->node.name())); + desc->colocation_constraints.emplace_back( + StrCat(tensorflow::kColocationGroupPrefix, op->node.name())); } void TF_SetAttrString(TF_OperationDescription* desc, const char* attr_name, @@ -1131,10 +1123,10 @@ void TF_SetAttrValueProto(TF_OperationDescription* desc, const char* attr_name, } } -TF_Operation* TF_FinishOperation(TF_OperationDescription* desc, - TF_Status* status) { +static TF_Operation* TF_FinishOperationLocked(TF_OperationDescription* desc, + TF_Status* status) + EXCLUSIVE_LOCKS_REQUIRED(desc->graph->mu) { Node* ret = nullptr; - mutex_lock l(desc->graph->mu); if (desc->graph->name_map.count(desc->node_builder.node_name())) { status->status = InvalidArgument("Duplicate node name in graph: '", @@ -1148,14 +1140,14 @@ TF_Operation* TF_FinishOperation(TF_OperationDescription* desc, if (status->status.ok()) { // Run shape inference function for newly added node. - // - // TODO(b/28152992): Enable returning the result of this - // code-path once we have converted all python shape functions - // to call their C++ versions. - desc->graph->refiner.AddNode(ret); - + status->status = desc->graph->refiner.AddNode(ret); + } + if (status->status.ok()) { // Add the node to the name-to-node mapping. desc->graph->name_map[ret->name()] = ret; + } else if (ret != nullptr) { + desc->graph->graph.RemoveNode(ret); + ret = nullptr; } } @@ -1164,6 +1156,12 @@ TF_Operation* TF_FinishOperation(TF_OperationDescription* desc, return ToOperation(ret); } +TF_Operation* TF_FinishOperation(TF_OperationDescription* desc, + TF_Status* status) { + mutex_lock l(desc->graph->mu); + return TF_FinishOperationLocked(desc, status); +} + // TF_Operation functions // ---------------------------------------------------------- @@ -1176,7 +1174,7 @@ const char* TF_OperationOpType(TF_Operation* oper) { } const char* TF_OperationDevice(TF_Operation* oper) { - return oper->node.def().device().c_str(); + return oper->node.requested_device().c_str(); } int TF_OperationNumOutputs(TF_Operation* oper) { @@ -1191,8 +1189,8 @@ TF_DataType TF_OperationOutputType(TF_Output oper_out) { int TF_OperationOutputListLength(TF_Operation* oper, const char* arg_name, TF_Status* status) { NameRangeMap name_ranges; - status->status = NameRangesForNode(oper->node.def(), oper->node.op_def(), - nullptr, &name_ranges); + status->status = + NameRangesForNode(oper->node, oper->node.op_def(), nullptr, &name_ranges); if (!status->status.ok()) return -1; auto iter = name_ranges.find(arg_name); if (iter == name_ranges.end()) { @@ -1213,8 +1211,8 @@ TF_DataType TF_OperationInputType(TF_Input oper_in) { int TF_OperationInputListLength(TF_Operation* oper, const char* arg_name, TF_Status* status) { NameRangeMap name_ranges; - status->status = NameRangesForNode(oper->node.def(), oper->node.op_def(), - &name_ranges, nullptr); + status->status = + NameRangesForNode(oper->node, oper->node.op_def(), &name_ranges, nullptr); if (!status->status.ok()) return -1; auto iter = name_ranges.find(arg_name); if (iter == name_ranges.end()) { @@ -1452,26 +1450,27 @@ void TF_OperationGetAttrStringList(TF_Operation* oper, const char* attr_name, } } -#define DEFINE_GETATTR(func, c_type, cpp_type, list_field) \ - void func(TF_Operation* oper, const char* attr_name, c_type* value, 
\
-            TF_Status* status) {                                             \
-    cpp_type v;                                                              \
-    status->status = tensorflow::GetNodeAttr(oper->node.def(), attr_name, &v); \
-    *value = static_cast<c_type>(v);                                         \
-  }                                                                          \
-  void func##List(TF_Operation* oper, const char* attr_name, c_type* values, \
-                  int max_values, TF_Status* status) {                       \
-    const auto* attr = GetAttrValue(oper, attr_name, status);                \
-    if (!status->status.ok()) return;                                        \
-    if (attr->value_case() != tensorflow::AttrValue::kList) {                \
-      status->status =                                                       \
-          InvalidArgument("Value for '", attr_name, "' is not a list.");     \
-      return;                                                                \
-    }                                                                        \
-    const auto len = std::min(max_values, attr->list().list_field##_size()); \
-    for (int i = 0; i < len; ++i) {                                          \
-      values[i] = static_cast<c_type>(attr->list().list_field(i));           \
-    }                                                                        \
+#define DEFINE_GETATTR(func, c_type, cpp_type, list_field)                   \
+  void func(TF_Operation* oper, const char* attr_name, c_type* value,        \
+            TF_Status* status) {                                             \
+    cpp_type v;                                                              \
+    status->status =                                                         \
+        tensorflow::GetNodeAttr(oper->node.attrs(), attr_name, &v);          \
+    *value = static_cast<c_type>(v);                                         \
+  }                                                                          \
+  void func##List(TF_Operation* oper, const char* attr_name, c_type* values, \
+                  int max_values, TF_Status* status) {                       \
+    const auto* attr = GetAttrValue(oper, attr_name, status);                \
+    if (!status->status.ok()) return;                                        \
+    if (attr->value_case() != tensorflow::AttrValue::kList) {                \
+      status->status =                                                       \
+          InvalidArgument("Value for '", attr_name, "' is not a list.");     \
+      return;                                                                \
+    }                                                                        \
+    const auto len = std::min(max_values, attr->list().list_field##_size()); \
+    for (int i = 0; i < len; ++i) {                                          \
+      values[i] = static_cast<c_type>(attr->list().list_field(i));           \
+    }                                                                        \
   }
 DEFINE_GETATTR(TF_OperationGetAttrInt, int64_t, tensorflow::int64, i);
 DEFINE_GETATTR(TF_OperationGetAttrFloat, float, float, f);
@@ -1482,7 +1481,8 @@ DEFINE_GETATTR(TF_OperationGetAttrType, TF_DataType, DataType, type);
 void TF_OperationGetAttrShape(TF_Operation* oper, const char* attr_name,
                               int64_t* value, int num_dims, TF_Status* status) {
   PartialTensorShape shape;
-  status->status = tensorflow::GetNodeAttr(oper->node.def(), attr_name, &shape);
+  status->status =
+      tensorflow::GetNodeAttr(oper->node.attrs(), attr_name, &shape);
   if (!status->status.ok()) return;
   auto len = std::min(shape.dims(), num_dims);
   for (int i = 0; i < len; ++i) {
@@ -1496,7 +1496,7 @@ void TF_OperationGetAttrShapeList(TF_Operation* oper, const char* attr_name,
                                   int storage_size, TF_Status* status) {
   std::vector<PartialTensorShape> shapes;
   status->status =
-      tensorflow::GetNodeAttr(oper->node.def(), attr_name, &shapes);
+      tensorflow::GetNodeAttr(oper->node.attrs(), attr_name, &shapes);
   if (!status->status.ok()) return;
   auto len = std::min(static_cast<int>(shapes.size()), max_values);
   int64_t* p = storage;
@@ -1563,7 +1563,7 @@ void TF_OperationGetAttrTensor(TF_Operation* oper, const char* attr_name,
                                TF_Tensor** value, TF_Status* status) {
   *value = nullptr;
   Tensor t;
-  status->status = tensorflow::GetNodeAttr(oper->node.def(), attr_name, &t);
+  status->status = tensorflow::GetNodeAttr(oper->node.attrs(), attr_name, &t);
   if (!status->status.ok()) return;
   *value = new TF_Tensor{static_cast<TF_DataType>(t.dtype()), t.shape(),
                          tensorflow::TensorCApi::Buffer(t)};
@@ -1574,7 +1574,7 @@ void TF_OperationGetAttrTensorList(TF_Operation* oper, const char* attr_name,
                                    TF_Tensor** values, int max_values,
                                    TF_Status* status) {
   std::vector<Tensor> ts;
-  status->status = tensorflow::GetNodeAttr(oper->node.def(), attr_name, &ts);
+  status->status = tensorflow::GetNodeAttr(oper->node.attrs(), attr_name, &ts);
   if (!status->status.ok()) return;
   const auto len = std::min(max_values, static_cast<int>(ts.size()));
   for (int i = 0; i <
len; ++i) { @@ -1653,10 +1653,6 @@ void TF_GraphToGraphDef(TF_Graph* graph, TF_Buffer* output_graph_def, status->status = MessageToBuffer(def, output_graph_def); } -struct TF_ImportGraphDefOptions { - tensorflow::ImportGraphDefOptions opts; -}; - TF_ImportGraphDefOptions* TF_NewImportGraphDefOptions() { return new TF_ImportGraphDefOptions; } @@ -1682,6 +1678,12 @@ void TF_ImportGraphDefOptionsAddInputMapping(TF_ImportGraphDefOptions* opts, opts->opts.input_map[TensorId(src_name, src_index)] = ToTensorId(dst); } +void TF_ImportGraphDefOptionsRemapControlDependency( + TF_ImportGraphDefOptions* opts, const char* src_name, TF_Operation* dst) { + opts->opts.input_map[TensorId(src_name, tensorflow::Graph::kControlSlot)] = + TensorId(dst->node.name(), tensorflow::Graph::kControlSlot); +} + extern void TF_ImportGraphDefOptionsAddControlDependency( TF_ImportGraphDefOptions* opts, TF_Operation* oper) { opts->opts.control_dependencies.push_back(oper->node.name()); @@ -1750,6 +1752,398 @@ void TF_GraphImportGraphDef(TF_Graph* graph, const TF_Buffer* graph_def, status); } +// While loop functions ------------------------------------------------------- + +namespace { +bool CreateInput(const TF_Output& parent_input, TF_Graph* g, const char* name, + TF_Output* input, TF_Status* status) { + TF_OperationDescription* desc = TF_NewOperation(g, "Placeholder", name); + TF_SetAttrType(desc, "dtype", TF_OperationOutputType(parent_input)); + // TODO(skyewm): set placeholder shape + TF_Operation* oper = TF_FinishOperation(desc, status); + if (!status->status.ok()) return false; + *input = {oper, 0}; + return true; +} + +bool CreateEnter(TF_Graph* g, const char* node_name, const char* frame_name, + const TF_Output& input, TF_Output* enter, TF_Status* status) + EXCLUSIVE_LOCKS_REQUIRED(g->mu) { + TF_OperationDescription* desc = TF_NewOperationLocked(g, "Enter", node_name); + TF_AddInput(desc, input); + TF_SetAttrString(desc, "frame_name", frame_name, strlen(frame_name)); + TF_Operation* oper = TF_FinishOperationLocked(desc, status); + if (!status->status.ok()) return false; + *enter = {oper, 0}; + return true; +} + +bool CreateMerge(TF_Graph* g, const char* name, const TF_Output& input, + const char* backedge_name, int backedge_index, + TF_Output* merge, TF_Status* status) + EXCLUSIVE_LOCKS_REQUIRED(g->mu) { + TF_OperationDescription* desc = TF_NewOperationLocked(g, "Merge", name); + + // The merge nodes accept the while loop's back edges as an input. Use the + // underlying NodeBuilder API directly to create an input to the + // not-yet-created back edge. 
+  std::vector<NodeBuilder::NodeOut> input_list;
+  input_list.push_back(NodeBuilder::NodeOut(&input.oper->node, input.index));
+  // All merge inputs must have the same type.
+  DataType type = input.oper->node.output_type(input.index);
+  input_list.push_back(
+      NodeBuilder::NodeOut(backedge_name, backedge_index, type));
+
+  desc->node_builder.Input(input_list);
+
+  TF_Operation* oper = TF_FinishOperationLocked(desc, status);
+  if (!status->status.ok()) return false;
+  *merge = {oper, 0};
+  return true;
+}
+
+bool CreateSwitch(TF_Graph* g, const char* name, const TF_Output& input,
+                  const TF_Output& predicate, TF_Output* switch_true,
+                  TF_Output* switch_false, TF_Status* status)
+    EXCLUSIVE_LOCKS_REQUIRED(g->mu) {
+  TF_OperationDescription* desc = TF_NewOperationLocked(g, "Switch", name);
+  TF_AddInput(desc, input);
+  TF_AddInput(desc, predicate);
+  TF_Operation* oper = TF_FinishOperationLocked(desc, status);
+  if (!status->status.ok()) return false;
+  *switch_false = {oper, 0};
+  *switch_true = {oper, 1};
+  return true;
+}
+
+bool CreateNext(TF_Graph* g, const char* name, const TF_Output& input,
+                TF_Output* next, TF_Status* status)
+    EXCLUSIVE_LOCKS_REQUIRED(g->mu) {
+  TF_OperationDescription* desc =
+      TF_NewOperationLocked(g, "NextIteration", name);
+  TF_AddInput(desc, input);
+  TF_Operation* oper = TF_FinishOperationLocked(desc, status);
+  if (!status->status.ok()) return false;
+  *next = {oper, 0};
+  return true;
+}
+
+bool CreateExit(TF_Graph* g, const char* name, const TF_Output& input,
+                TF_Output* exit, TF_Status* status)
+    EXCLUSIVE_LOCKS_REQUIRED(g->mu) {
+  TF_OperationDescription* desc = TF_NewOperationLocked(g, "Exit", name);
+  TF_AddInput(desc, input);
+  TF_Operation* oper = TF_FinishOperationLocked(desc, status);
+  if (!status->status.ok()) return false;
+  *exit = {oper, 0};
+  return true;
+}
+
+class ScopedImportGraphDefOptions {
+ public:
+  ScopedImportGraphDefOptions() { opts_ = TF_NewImportGraphDefOptions(); }
+  ~ScopedImportGraphDefOptions() { TF_DeleteImportGraphDefOptions(opts_); }
+
+  TF_ImportGraphDefOptions* get() const { return opts_; }
+
+ private:
+  TF_ImportGraphDefOptions* opts_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(ScopedImportGraphDefOptions);
+};
+
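Editorial note: ScopedImportGraphDefOptions is just an RAII wrapper over the public import-options API, which CopyGraph below also drives. For reference, the same options are usable directly from C; a sketch in which the node name "input" and the prefix "imported" are placeholders:

```c
#include "tensorflow/c/c_api.h"

/* Sketch: import a serialized GraphDef into `graph`, prefixing imported
   node names and splicing `replacement` in wherever the imported graph
   consumed the output "input:0". */
void import_with_remap(TF_Graph* graph, const TF_Buffer* graph_def,
                       TF_Output replacement, TF_Status* s) {
  TF_ImportGraphDefOptions* opts = TF_NewImportGraphDefOptions();
  TF_ImportGraphDefOptionsSetPrefix(opts, "imported");
  TF_ImportGraphDefOptionsAddInputMapping(opts, "input", 0, replacement);
  TF_GraphImportGraphDef(graph, graph_def, opts, s);
  TF_DeleteImportGraphDefOptions(opts);
}
```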
+// Copies `src_graph` into `dst_graph`. Any node in `src_graph` with input
+// `src_inputs[i]` will have that input replaced with `dst_inputs[i]`.
+// `prefix` will be prepended to copied node names. `return_nodes` are nodes
+// in `src_graph`, and the new corresponding nodes in `dst_graph` will be
+// returned. `return_nodes` should be preallocated to size `nreturn_nodes`.
+bool CopyGraph(TF_Graph* src_graph, TF_Graph* dst_graph,
+               const TF_Output* src_inputs,
+               const std::vector<TF_Output>& dst_inputs, const char* prefix,
+               const TF_Output* nodes_to_return, int nreturn_nodes,
+               TF_Output* return_nodes, TF_Status* s)
+    EXCLUSIVE_LOCKS_REQUIRED(dst_graph->mu) {
+  GraphDef gdef;
+  src_graph->graph.ToGraphDef(&gdef);
+
+  ScopedImportGraphDefOptions opts;
+  TF_ImportGraphDefOptionsSetPrefix(opts.get(), prefix);
+
+  for (int i = 0; i < dst_inputs.size(); ++i) {
+    TensorId src = ToTensorId(src_inputs[i]);
+    TF_ImportGraphDefOptionsAddInputMapping(opts.get(), src.first.data(),
+                                            src.second, dst_inputs[i]);
+  }
+  // We use the pivot node to control constants in `src_graph`.
+  TF_Operation* pivot = dst_inputs[0].oper;
+  TF_ImportGraphDefOptionsAddControlDependency(opts.get(), pivot);
+
+  for (int i = 0; i < nreturn_nodes; ++i) {
+    TF_ImportGraphDefOptionsAddReturnOutput(
+        opts.get(), nodes_to_return[i].oper->node.name().c_str(),
+        nodes_to_return[i].index);
+  }
+
+  GraphImportGraphDefLocked(dst_graph, gdef, opts.get(), return_nodes,
+                            nreturn_nodes, s);
+  if (TF_GetCode(s) != TF_OK) return false;
+  return true;
+}
+
+bool ValidateConstWhileParams(const TF_WhileParams& params, TF_Status* s) {
+  if (params.cond_graph == nullptr || params.body_graph == nullptr ||
+      params.cond_graph->parent == nullptr ||
+      params.cond_graph->parent != params.body_graph->parent ||
+      params.cond_graph->parent_inputs != params.body_graph->parent_inputs ||
+      params.ninputs <= 0 || params.cond_inputs == nullptr ||
+      params.body_inputs == nullptr || params.body_outputs == nullptr) {
+    s->status = InvalidArgument(
+        "TF_WhileParams must be created by successful TF_NewWhile() call");
+    return false;
+  }
+  return true;
+}
+
+bool ValidateInputWhileParams(const TF_WhileParams& params, TF_Status* s) {
+  if (params.cond_output.oper == nullptr) {
+    s->status = InvalidArgument("TF_WhileParams `cond_output` field isn't set");
+    return false;
+  }
+  for (int i = 0; i < params.ninputs; ++i) {
+    if (params.body_outputs[i].oper == nullptr) {
+      s->status = InvalidArgument("TF_WhileParams `body_outputs[", i, "]` ",
+                                  "field isn't set");
+      return false;
+    }
+  }
+  if (params.name == nullptr) {
+    s->status = InvalidArgument("TF_WhileParams `name` field is null");
+    return false;
+  }
+  return true;
+}
+
+void FreeWhileResources(const TF_WhileParams* params) {
+  TF_DeleteGraph(params->cond_graph);
+  TF_DeleteGraph(params->body_graph);
+  delete[] params->cond_inputs;
+  delete[] params->body_inputs;
+  delete[] params->body_outputs;
+}
+
+TF_WhileParams EmptyWhileParams() {
+  return {0,       nullptr, nullptr, {nullptr, 0},
+          nullptr, nullptr, nullptr, nullptr};
+}
+
+}  // namespace
+
+TF_WhileParams TF_NewWhile(TF_Graph* g, TF_Output* inputs, int ninputs,
+                           TF_Status* status) {
+  if (ninputs == 0) {
+    status->status =
+        InvalidArgument("TF_NewWhile() must be passed at least one input");
+    return EmptyWhileParams();
+  }
+
+  TF_Graph* cond_graph = TF_NewGraph();
+  TF_Graph* body_graph = TF_NewGraph();
+  cond_graph->parent = g;
+  cond_graph->parent_inputs = inputs;
+  body_graph->parent = g;
+  body_graph->parent_inputs = inputs;
+
+  TF_Output* cond_inputs = new TF_Output[ninputs];
+  TF_Output cond_output = {nullptr, -1};
+  TF_Output* body_inputs = new TF_Output[ninputs];
+  TF_Output* body_outputs = new TF_Output[ninputs];
+  for (int i = 0; i < ninputs; ++i) body_outputs[i] = {nullptr, -1};
+  const char* name = nullptr;
+
+  for (int i = 0; i < ninputs; ++i) {
+    // TODO(skyewm): prefix names with underscore (requires some plumbing)
+    if (!CreateInput(inputs[i], cond_graph, StrCat("cond_input", i).c_str(),
+                     &cond_inputs[i], status)) {
+      break;
+    }
+    if (!CreateInput(inputs[i], body_graph, StrCat("body_input", i).c_str(),
+                     &body_inputs[i], status)) {
+      break;
+    }
+  }
+
+  TF_WhileParams params = {ninputs,    cond_graph,  cond_inputs,  cond_output,
+                           body_graph, body_inputs, body_outputs, name};
+
+  if (!status->status.ok()) {
+    FreeWhileResources(&params);
+    return EmptyWhileParams();
+  }
+  return params;
+}
+
+namespace {
+
+// TODO(skyewm): make nodes in while loop unfetchable like in Python version
+void TF_FinishWhileHelper(const TF_WhileParams* params, TF_Status* status,
+                          TF_Output* outputs) {
+  if (!ValidateInputWhileParams(*params, status)) return;
+
+  TF_Graph* parent = params->cond_graph->parent;
+  TF_Output* parent_inputs = params->cond_graph->parent_inputs;
+  int n = params->ninputs;
+
+  mutex_lock l(parent->mu);
+
+  // Create Enter nodes
+  std::vector<TF_Output> enter_nodes(n);
+  for (int i = 0; i < n; ++i) {
+    if (!CreateEnter(parent, StrCat(params->name, "/enter", i).c_str(),
+                     params->name, parent_inputs[i], &enter_nodes[i],
+                     status)) {
+      return;
+    }
+  }
+
+  // Create Merge nodes
+  std::vector<TF_Output> merge_nodes(n);
+  for (int i = 0; i < n; ++i) {
+    if (!CreateMerge(parent, StrCat(params->name, "/merge", i).c_str(),
+                     enter_nodes[i], StrCat(params->name, "/next", i).c_str(),
+                     0, &merge_nodes[i], status)) {
+      return;
+    }
+  }
+
+  // Copy cond_graph to parent and replace input placeholders with merge node
+  // outputs, and get handle to new cond output
+  tensorflow::string cond_prefix = StrCat(params->name, "/cond");
+  TF_Output cond_output;
+  if (!CopyGraph(params->cond_graph, parent, params->cond_inputs, merge_nodes,
+                 cond_prefix.c_str(), &params->cond_output, 1, &cond_output,
+                 status)) {
+    return;
+  }
+
+  // Create Switch nodes
+  std::vector<TF_Output> switch_trues(n);
+  std::vector<TF_Output> switch_falses(n);
+  for (int i = 0; i < n; ++i) {
+    if (!CreateSwitch(parent, StrCat(params->name, "/switch", i).c_str(),
+                      merge_nodes[i], cond_output, &switch_trues[i],
+                      &switch_falses[i], status)) {
+      return;
+    }
+  }
+
+  // Copy body_graph to parent, replace input placeholders with switch node
+  // true outputs, and get handles to new body outputs
+  tensorflow::string body_prefix = StrCat(params->name, "/body");
+  std::vector<TF_Output> body_outputs(n);
+  if (!CopyGraph(params->body_graph, parent, params->body_inputs, switch_trues,
+                 body_prefix.c_str(), params->body_outputs, n,
+                 body_outputs.data(), status)) {
+    return;
+  }
+
+  // Create Next nodes
+  std::vector<TF_Output> next_nodes(n);
+  for (int i = 0; i < n; ++i) {
+    if (!CreateNext(parent, StrCat(params->name, "/next", i).c_str(),
+                    body_outputs[i], &next_nodes[i], status)) {
+      return;
+    }
+  }
+
+  // Create Exit nodes (which are the outputs of the while loop)
+  for (int i = 0; i < n; ++i) {
+    if (!CreateExit(parent, StrCat(params->name, "/exit", i).c_str(),
+                    switch_falses[i], &outputs[i], status)) {
+      return;
+    }
+  }
+}
+
+}  // namespace
+
+void TF_FinishWhile(const TF_WhileParams* params, TF_Status* status,
+                    TF_Output* outputs) {
+  // If it appears the caller created or modified `params`, don't free
+  // resources.
+  if (!ValidateConstWhileParams(*params, status)) return;
+  TF_FinishWhileHelper(params, status, outputs);
+  FreeWhileResources(params);
+}
+
+void TF_AbortWhile(const TF_WhileParams* params) { FreeWhileResources(params); }
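Editorial note: from the caller's side the lifecycle is: TF_NewWhile hands back two sub-graphs, the caller populates the condition, the body, and a name, and TF_FinishWhile (or TF_AbortWhile on the error path) consumes the params. A rough sketch, where `build_cond` and `build_body` are hypothetical user helpers that add ops to the given graph and return one output:

```c
#include "tensorflow/c/c_api.h"

/* Hypothetical user helpers, not part of the C API. */
TF_Output build_cond(TF_Graph* g, TF_Output in, TF_Status* s);
TF_Output build_body(TF_Graph* g, TF_Output in, TF_Status* s);

void add_loop(TF_Graph* g, TF_Output init, TF_Output* result, TF_Status* s) {
  TF_WhileParams params = TF_NewWhile(g, &init, 1, s);
  if (TF_GetCode(s) != TF_OK) return;

  /* Build the predicate in cond_graph and the step in body_graph. */
  params.cond_output = build_cond(params.cond_graph, params.cond_inputs[0], s);
  params.body_outputs[0] = build_body(params.body_graph,
                                      params.body_inputs[0], s);
  params.name = "while_loop";

  if (TF_GetCode(s) != TF_OK) {
    TF_AbortWhile(&params);  /* Frees the cond/body resources. */
    return;
  }
  /* Stitches Enter/Merge/Switch/NextIteration/Exit into `g`; `result`
     receives the Exit output and `params` is consumed either way. */
  TF_FinishWhile(&params, s, result);
}
```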
+
+#ifndef __ANDROID__
+namespace {
+
+void OutputsFromTFOutputs(TF_Output* tf_outputs, int n, TF_Status* status,
+                          std::vector<tensorflow::Output>* outputs) {
+  outputs->resize(n);
+  for (int i = 0; i < n; i++) {
+    const TF_Output& tf_output = tf_outputs[i];
+    (*outputs)[i] = tensorflow::Output(&tf_output.oper->node, tf_output.index);
+  }
+}
+
+void TFOutputsFromOutputs(const std::vector<tensorflow::Output>& outputs,
+                          TF_Output* tf_outputs) {
+  for (int i = 0; i < outputs.size(); i++) {
+    tf_outputs[i].oper = ToOperation(outputs[i].node());
+    tf_outputs[i].index = outputs[i].index();
+  }
+}
+
+}  // namespace
+#endif  // __ANDROID__
+
+void TF_AddGradients(TF_Graph* g, TF_Output* y, int ny, TF_Output* x, int nx,
+                     TF_Output* dx, TF_Status* status, TF_Output* dy) {
+#ifdef __ANDROID__
+  status->status = tensorflow::errors::Unimplemented(
+      "Adding gradients is not supported in Android. File a bug at "
+      "https://github.com/tensorflow/tensorflow/issues if this feature is "
+      "important to you");
+#else
+  std::vector<tensorflow::Output> y_arg;
+  std::vector<tensorflow::Output> x_arg;
+  std::vector<tensorflow::Output> dy_arg;
+  OutputsFromTFOutputs(y, ny, status, &y_arg);
+  OutputsFromTFOutputs(x, nx, status, &x_arg);
+
+  {
+    // We need to hold on to the lock while we have a scope that uses TF_Graph.
+    mutex_lock graph_lock(g->mu);
+
+    const int max_node_id_before = g->graph.num_node_ids();
+
+    tensorflow::Scope scope =
+        NewInternalScope(&g->graph, &status->status, &g->refiner);
+
+    if (dx != nullptr) {
+      std::vector<tensorflow::Output> dx_arg;
+      OutputsFromTFOutputs(dx, ny, status, &dx_arg);
+      status->status =
+          AddSymbolicGradients(scope, y_arg, x_arg, dx_arg, &dy_arg);
+    } else {
+      status->status = AddSymbolicGradients(scope, y_arg, x_arg, &dy_arg);
+    }
+
+    // Update g->name_map with the name_map from the scope, which will contain
+    // the new gradient ops.
+    for (int i = max_node_id_before; i < g->graph.num_node_ids(); ++i) {
+      Node* n = g->graph.FindNodeId(i);
+      if (n == nullptr) continue;
+      g->name_map[n->name()] = n;
+    }
+  }
+
+  // Unpack the results from dy_arg.
+  TFOutputsFromOutputs(dy_arg, dy);
+#endif  // __ANDROID__
+}
+
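Editorial note: TF_AddGradients mirrors AddSymbolicGradients, and passing `dx == NULL` selects the overload that seeds the gradients itself (with ones, per the cc/gradients behavior). A minimal single-variable sketch, error handling elided:

```c
#include <stddef.h>
#include "tensorflow/c/c_api.h"

/* Sketch: request the symbolic gradient dy/dx for outputs that already
   live in `g`. `dy` is only meaningful when `s` reports TF_OK. */
TF_Output gradient_of(TF_Graph* g, TF_Output y, TF_Output x, TF_Status* s) {
  TF_Output dy;
  TF_AddGradients(g, /*y=*/&y, /*ny=*/1, /*x=*/&x, /*nx=*/1,
                  /*dx=*/NULL, s, &dy);
  return dy;
}
```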
 // TF_Session functions ----------------------------------------------

 TF_Session* TF_NewSession(TF_Graph* graph, const TF_SessionOptions* opt,
@@ -1764,15 +2158,23 @@ TF_Session* TF_NewSession(TF_Graph* graph, const TF_SessionOptions* opt,
     return new TF_Session(session, graph);
   } else {
     DCHECK_EQ(nullptr, session);
-    return NULL;
+    return nullptr;
   }
 }

-#ifndef __ANDROID__
 TF_Session* TF_LoadSessionFromSavedModel(
     const TF_SessionOptions* session_options, const TF_Buffer* run_options,
     const char* export_dir, const char* const* tags, int tags_len,
     TF_Graph* graph, TF_Buffer* meta_graph_def, TF_Status* status) {
+// TODO(ashankar): Remove the __ANDROID__ guard. This will require ensuring
+// that the tensorflow/cc/saved_model:loader build target is Android friendly.
+#ifdef __ANDROID__
+  status->status = tensorflow::errors::Unimplemented(
+      "Loading a SavedModel is not supported in Android. File a bug at "
+      "https://github.com/tensorflow/tensorflow/issues if this feature is "
+      "important to you");
+  return nullptr;
+#else
   mutex_lock l(graph->mu);

   if (!graph->name_map.empty()) {
@@ -1781,9 +2183,8 @@ TF_Session* TF_LoadSessionFromSavedModel(
   }

   RunOptions run_options_proto;
-  if (run_options != nullptr &&
-      !run_options_proto.ParseFromArray(run_options->data,
-                                        run_options->length)) {
+  if (run_options != nullptr && !run_options_proto.ParseFromArray(
+                                    run_options->data, run_options->length)) {
     status->status = InvalidArgument("Unparseable RunOptions proto");
     return nullptr;
   }
@@ -1821,8 +2222,8 @@ TF_Session* TF_LoadSessionFromSavedModel(
   graph->num_sessions += 1;
   session->last_num_graph_nodes = graph->graph.num_node_ids();
   return session;
-}
 #endif  // __ANDROID__
+}

 void TF_CloseSession(TF_Session* s, TF_Status* status) {
   status->status = s->session->Close();
@@ -1853,7 +2254,7 @@ static bool ExtendSessionGraphHelper(TF_Session* session, TF_Status* status) {
     const auto num_nodes = graph.num_node_ids();
     if (session->last_num_graph_nodes < num_nodes) {
       GraphDef graph_def;
-      graph_def.mutable_versions()->CopyFrom(graph.versions());
+      *graph_def.mutable_versions() = graph.versions();
       // Fill graph_def with nodes with ids in the range
       // [session->last_num_graph_nodes, num_nodes), that is the nodes
       // added since the last TF_SessionRun() call.
@@ -1954,6 +2355,11 @@ void TF_SessionPRunSetup(TF_Session* session, const TF_Output* inputs,
   }
 }

+void TF_DeletePRunHandle(const char* handle) {
+  delete[] handle;
+  // TODO(suharshs): Free up any resources held by the partial run state.
+}
+
 void TF_SessionPRun(TF_Session* session, const char* handle,
                     const TF_Output* inputs, TF_Tensor* const* input_values,
                     int ninputs, const TF_Output* outputs,
diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h
index 8d0f398d4a5..15139a47acf 100644
--- a/tensorflow/c/c_api.h
+++ b/tensorflow/c/c_api.h
@@ -64,6 +64,25 @@ limitations under the License.
 // and the API just provides high level controls over the number of
 // devices of each type.

+// Macro to control visibility of exported symbols in the shared library (.so,
+// .dylib, .dll).
+// This duplicates the TF_EXPORT macro definition in
+// tensorflow/core/platform/macros.h in order to keep this .h file independent
+// of any other includes.
+#ifdef SWIG
+#define TF_CAPI_EXPORT
+#else
+#if defined(COMPILER_MSVC)
+#ifdef TF_COMPILE_LIBRARY
+#define TF_CAPI_EXPORT __declspec(dllexport)
+#else
+#define TF_CAPI_EXPORT __declspec(dllimport)
+#endif  // TF_COMPILE_LIBRARY
+#else
+#define TF_CAPI_EXPORT __attribute__((visibility("default")))
+#endif  // COMPILER_MSVC
+#endif  // SWIG
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -71,12 +90,12 @@ extern "C" {
 // --------------------------------------------------------------------------
 // TF_Version returns a string describing version information of the
 // TensorFlow library. TensorFlow uses semantic versioning.
-extern const char* TF_Version();
+TF_CAPI_EXPORT extern const char* TF_Version();

 // --------------------------------------------------------------------------
 // TF_DataType holds the type for a scalar value. E.g., one slot in a tensor.
 // The enum values here are identical to corresponding values in types.proto.
-typedef enum {
+typedef enum TF_DataType {
   TF_FLOAT = 1,
   TF_DOUBLE = 2,
   TF_INT32 = 3,  // Int32 tensors are always in 'host' memory.
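Editorial note: with the guard now inside the function, non-Android callers restore a bundle in the usual way. A sketch assuming the conventional "serve" export tag, and assuming the run-options and MetaGraphDef arguments accept NULL when the caller does not need them:

```c
#include <stddef.h>
#include "tensorflow/c/c_api.h"

/* Sketch: load a SavedModel exported under the "serve" tag. Returns a
   session backed by `graph`, or NULL with `s` set on failure. */
TF_Session* load_saved_model(const char* export_dir, TF_Graph* graph,
                             TF_Status* s) {
  TF_SessionOptions* opts = TF_NewSessionOptions();
  const char* tags[] = {"serve"};
  TF_Session* session = TF_LoadSessionFromSavedModel(
      opts, /*run_options=*/NULL, export_dir, tags, /*tags_len=*/1, graph,
      /*meta_graph_def=*/NULL, s);
  TF_DeleteSessionOptions(opts);
  return session;
}
```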
@@ -103,12 +122,12 @@ typedef enum { // TF_DataTypeSize returns the sizeof() for the underlying type corresponding // to the given TF_DataType enum value. Returns 0 for variable length types // (eg. TF_STRING) or on failure. -extern size_t TF_DataTypeSize(TF_DataType dt); +TF_CAPI_EXPORT extern size_t TF_DataTypeSize(TF_DataType dt); // -------------------------------------------------------------------------- // TF_Code holds an error code. The enum values here are identical to // corresponding values in error_codes.proto. -typedef enum { +typedef enum TF_Code { TF_OK = 0, TF_CANCELLED = 1, TF_UNKNOWN = 2, @@ -134,23 +153,24 @@ typedef enum { typedef struct TF_Status TF_Status; // Return a new status object. -extern TF_Status* TF_NewStatus(); +TF_CAPI_EXPORT extern TF_Status* TF_NewStatus(); // Delete a previously created status object. -extern void TF_DeleteStatus(TF_Status*); +TF_CAPI_EXPORT extern void TF_DeleteStatus(TF_Status*); // Record in *s. Any previous information is lost. // A common use is to clear a status: TF_SetStatus(s, TF_OK, ""); -extern void TF_SetStatus(TF_Status* s, TF_Code code, const char* msg); +TF_CAPI_EXPORT extern void TF_SetStatus(TF_Status* s, TF_Code code, + const char* msg); // Return the code record in *s. -extern TF_Code TF_GetCode(const TF_Status* s); +TF_CAPI_EXPORT extern TF_Code TF_GetCode(const TF_Status* s); // Return a pointer to the (null-terminated) error message in *s. The // return value points to memory that is only usable until the next // mutation to *s. Always returns an empty string if TF_GetCode(s) is // TF_OK. -extern const char* TF_Message(const TF_Status* s); +TF_CAPI_EXPORT extern const char* TF_Message(const TF_Status* s); // -------------------------------------------------------------------------- // TF_Buffer holds a pointer to a block of data and its associated length. @@ -168,14 +188,15 @@ typedef struct TF_Buffer { // Makes a copy of the input and sets an appropriate deallocator. Useful for // passing in read-only, input protobufs. -extern TF_Buffer* TF_NewBufferFromString(const void* proto, size_t proto_len); +TF_CAPI_EXPORT extern TF_Buffer* TF_NewBufferFromString(const void* proto, + size_t proto_len); // Useful for passing *out* a protobuf. -extern TF_Buffer* TF_NewBuffer(); +TF_CAPI_EXPORT extern TF_Buffer* TF_NewBuffer(); -extern void TF_DeleteBuffer(TF_Buffer*); +TF_CAPI_EXPORT extern void TF_DeleteBuffer(TF_Buffer*); -extern TF_Buffer TF_GetBuffer(TF_Buffer* buffer); +TF_CAPI_EXPORT extern TF_Buffer TF_GetBuffer(TF_Buffer* buffer); // -------------------------------------------------------------------------- // TF_Tensor holds a multi-dimensional array of elements of a single data type. @@ -202,11 +223,10 @@ typedef struct TF_Tensor TF_Tensor; // (*deallocator)(data, len, deallocator_arg) // Clients must provide a custom deallocator function so they can pass in // memory managed by something like numpy. -extern TF_Tensor* TF_NewTensor(TF_DataType, const int64_t* dims, int num_dims, - void* data, size_t len, - void (*deallocator)(void* data, size_t len, - void* arg), - void* deallocator_arg); +TF_CAPI_EXPORT extern TF_Tensor* TF_NewTensor( + TF_DataType, const int64_t* dims, int num_dims, void* data, size_t len, + void (*deallocator)(void* data, size_t len, void* arg), + void* deallocator_arg); // Allocate and return a new Tensor. 
// @@ -217,27 +237,32 @@ extern TF_Tensor* TF_NewTensor(TF_DataType, const int64_t* dims, int num_dims, // // The caller must set the Tensor values by writing them to the pointer returned // by TF_TensorData with length TF_TensorByteSize. -extern TF_Tensor* TF_AllocateTensor(TF_DataType, const int64_t* dims, - int num_dims, size_t len); +TF_CAPI_EXPORT extern TF_Tensor* TF_AllocateTensor(TF_DataType, + const int64_t* dims, + int num_dims, size_t len); + +// Deletes `tensor` and returns a new TF_Tensor with the same content if +// possible. Returns nullptr and leaves `tensor` untouched if not. +TF_CAPI_EXPORT extern TF_Tensor* TF_TensorMaybeMove(TF_Tensor* tensor); // Destroy a tensor. -extern void TF_DeleteTensor(TF_Tensor*); +TF_CAPI_EXPORT extern void TF_DeleteTensor(TF_Tensor*); // Return the type of a tensor element. -extern TF_DataType TF_TensorType(const TF_Tensor*); +TF_CAPI_EXPORT extern TF_DataType TF_TensorType(const TF_Tensor*); // Return the number of dimensions that the tensor has. -extern int TF_NumDims(const TF_Tensor*); +TF_CAPI_EXPORT extern int TF_NumDims(const TF_Tensor*); // Return the length of the tensor in the "dim_index" dimension. // REQUIRES: 0 <= dim_index < TF_NumDims(tensor) -extern int64_t TF_Dim(const TF_Tensor* tensor, int dim_index); +TF_CAPI_EXPORT extern int64_t TF_Dim(const TF_Tensor* tensor, int dim_index); // Return the size of the underlying data in bytes. -extern size_t TF_TensorByteSize(const TF_Tensor*); +TF_CAPI_EXPORT extern size_t TF_TensorByteSize(const TF_Tensor*); // Return a pointer to the underlying data buffer. -extern void* TF_TensorData(const TF_Tensor*); +TF_CAPI_EXPORT extern void* TF_TensorData(const TF_Tensor*); // -------------------------------------------------------------------------- // Encode the string `src` (`src_len` bytes long) into `dst` in the format @@ -247,8 +272,9 @@ extern void* TF_TensorData(const TF_Tensor*); // // On success returns the size in bytes of the encoded string. // Returns an error into `status` otherwise. -extern size_t TF_StringEncode(const char* src, size_t src_len, char* dst, - size_t dst_len, TF_Status* status); +TF_CAPI_EXPORT extern size_t TF_StringEncode(const char* src, size_t src_len, + char* dst, size_t dst_len, + TF_Status* status); // Decode a string encoded using TF_StringEncode. // @@ -258,19 +284,20 @@ extern size_t TF_StringEncode(const char* src, size_t src_len, char* dst, // `*dst` and `*dst_len` are undefined and an error is set in `status`. // // Does not read memory more than `src_len` bytes beyond `src`. -extern size_t TF_StringDecode(const char* src, size_t src_len, const char** dst, - size_t* dst_len, TF_Status* status); +TF_CAPI_EXPORT extern size_t TF_StringDecode(const char* src, size_t src_len, + const char** dst, size_t* dst_len, + TF_Status* status); // Return the size in bytes required to encode a string `len` bytes long into a // TF_STRING tensor. -extern size_t TF_StringEncodedSize(size_t len); +TF_CAPI_EXPORT extern size_t TF_StringEncodedSize(size_t len); // -------------------------------------------------------------------------- // TF_SessionOptions holds options that can be passed during session creation. typedef struct TF_SessionOptions TF_SessionOptions; // Return a new options object. -extern TF_SessionOptions* TF_NewSessionOptions(); +TF_CAPI_EXPORT extern TF_SessionOptions* TF_NewSessionOptions(); // Set the target in TF_SessionOptions.options. // target can be empty, a single entry, or a comma separated list of entries. 
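Editorial note: the TF_StringEncode / TF_StringDecode / TF_StringEncodedSize trio above round-trips as follows; a minimal sketch that only demonstrates the calls:

```c
#include <stdlib.h>
#include <string.h>
#include "tensorflow/c/c_api.h"

/* Sketch: round-trip one string through the TF_STRING encoding. */
void string_round_trip(const char* text, TF_Status* s) {
  size_t len = strlen(text);
  size_t cap = TF_StringEncodedSize(len);
  char* buf = (char*)malloc(cap);
  size_t written = TF_StringEncode(text, len, buf, cap, s);

  const char* decoded = NULL;
  size_t decoded_len = 0;
  if (TF_GetCode(s) == TF_OK) {
    /* `decoded` points into `buf`, so use it before freeing. */
    TF_StringDecode(buf, written, &decoded, &decoded_len, s);
  }
  free(buf);
}
```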
@@ -278,17 +305,19 @@ extern TF_SessionOptions* TF_NewSessionOptions(); // "local" // ip:port // host:port -extern void TF_SetTarget(TF_SessionOptions* options, const char* target); +TF_CAPI_EXPORT extern void TF_SetTarget(TF_SessionOptions* options, + const char* target); // Set the config in TF_SessionOptions.options. // config should be a serialized tensorflow.ConfigProto proto. // If config was not parsed successfully as a ConfigProto, record the // error information in *status. -extern void TF_SetConfig(TF_SessionOptions* options, const void* proto, - size_t proto_len, TF_Status* status); +TF_CAPI_EXPORT extern void TF_SetConfig(TF_SessionOptions* options, + const void* proto, size_t proto_len, + TF_Status* status); // Destroy an options object. -extern void TF_DeleteSessionOptions(TF_SessionOptions*); +TF_CAPI_EXPORT extern void TF_DeleteSessionOptions(TF_SessionOptions*); // TODO(jeff,sanjay): // - export functions to set Config fields @@ -301,11 +330,11 @@ extern void TF_DeleteSessionOptions(TF_SessionOptions*); typedef struct TF_Graph TF_Graph; // Return a new graph object. -extern TF_Graph* TF_NewGraph(); +TF_CAPI_EXPORT extern TF_Graph* TF_NewGraph(); // Destroy an options object. Graph will be deleted once no more // TFSession's are referencing it. -extern void TF_DeleteGraph(TF_Graph*); +TF_CAPI_EXPORT extern void TF_DeleteGraph(TF_Graph*); // Operation being built. The underlying graph must outlive this. typedef struct TF_OperationDescription TF_OperationDescription; @@ -343,9 +372,11 @@ typedef struct TF_Output { // * `output` is not in `graph`. // * An invalid shape is being set (e.g., the shape being set // is incompatible with the existing shape). -extern void TF_GraphSetTensorShape(TF_Graph* graph, TF_Output output, - const int64_t* dims, const int num_dims, - TF_Status* status); +TF_CAPI_EXPORT extern void TF_GraphSetTensorShape(TF_Graph* graph, + TF_Output output, + const int64_t* dims, + const int num_dims, + TF_Status* status); // Returns the number of dimensions of the Tensor referenced by `output` // in `graph`. @@ -354,8 +385,9 @@ extern void TF_GraphSetTensorShape(TF_Graph* graph, TF_Output output, // // Returns an error into `status` if: // * `output` is not in `graph`. -extern int TF_GraphGetTensorNumDims(TF_Graph* graph, TF_Output output, - TF_Status* status); +TF_CAPI_EXPORT extern int TF_GraphGetTensorNumDims(TF_Graph* graph, + TF_Output output, + TF_Status* status); // Returns the shape of the Tensor referenced by `output` in `graph` // into `dims`. `dims` must be an array large enough to hold `num_dims` @@ -369,20 +401,21 @@ extern int TF_GraphGetTensorNumDims(TF_Graph* graph, TF_Output output, // Returns an error into `status` if: // * `output` is not in `graph`. // * `num_dims` does not match the actual number of dimensions. -extern void TF_GraphGetTensorShape(TF_Graph* graph, TF_Output output, - int64_t* dims, int num_dims, - TF_Status* status); +TF_CAPI_EXPORT extern void TF_GraphGetTensorShape(TF_Graph* graph, + TF_Output output, + int64_t* dims, int num_dims, + TF_Status* status); // Operation will only be added to *graph when TF_FinishOperation() is // called (assuming TF_FinishOperation() does not return an error). // *graph must not be deleted until after TF_FinishOperation() is // called. 
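Editorial note: for reference, the describe / configure / finish sequence that the declarations below spell out looks like this in practice. A sketch: the "Placeholder" op with its "dtype" and "shape" attrs is standard, but the concrete dims here are arbitrary:

```c
#include "tensorflow/c/c_api.h"

/* Sketch: describe, configure, and finish one operation. */
TF_Operation* make_placeholder(TF_Graph* graph, const char* name,
                               TF_Status* s) {
  TF_OperationDescription* desc = TF_NewOperation(graph, "Placeholder", name);
  TF_SetAttrType(desc, "dtype", TF_FLOAT);
  const int64_t dims[] = {-1, 128};  /* -1 marks an unknown dimension. */
  TF_SetAttrShape(desc, "shape", dims, 2);
  return TF_FinishOperation(desc, s);  /* Consumes `desc` either way. */
}
```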
-extern TF_OperationDescription* TF_NewOperation(TF_Graph* graph, - const char* op_type, - const char* oper_name); +TF_CAPI_EXPORT extern TF_OperationDescription* TF_NewOperation( + TF_Graph* graph, const char* op_type, const char* oper_name); // Specify the device for `desc`. Defaults to empty, meaning unconstrained. -extern void TF_SetDevice(TF_OperationDescription* desc, const char* device); +TF_CAPI_EXPORT extern void TF_SetDevice(TF_OperationDescription* desc, + const char* device); // The calls to TF_AddInput and TF_AddInputList must match (in number, // order, and type) the op declaration. For example, the "Concat" op @@ -405,101 +438,115 @@ extern void TF_SetDevice(TF_OperationDescription* desc, const char* device); // TF_AddInputList(desc, values_inputs, 5); // For inputs that take a single tensor. -extern void TF_AddInput(TF_OperationDescription* desc, TF_Output input); +TF_CAPI_EXPORT extern void TF_AddInput(TF_OperationDescription* desc, + TF_Output input); // For inputs that take a list of tensors. // inputs must point to TF_Output[num_inputs]. -extern void TF_AddInputList(TF_OperationDescription* desc, - const TF_Output* inputs, int num_inputs); +TF_CAPI_EXPORT extern void TF_AddInputList(TF_OperationDescription* desc, + const TF_Output* inputs, + int num_inputs); // Call once per control input to `desc`. -extern void TF_AddControlInput(TF_OperationDescription* desc, - TF_Operation* input); +TF_CAPI_EXPORT extern void TF_AddControlInput(TF_OperationDescription* desc, + TF_Operation* input); // Request that `desc` be co-located on the device where `op` // is placed. // // Use of this is discouraged since the implementation of device placement is // subject to change. Primarily intended for internal libraries -extern void TF_ColocateWith(TF_OperationDescription* desc, TF_Operation* op); +TF_CAPI_EXPORT extern void TF_ColocateWith(TF_OperationDescription* desc, + TF_Operation* op); // Call some TF_SetAttr*() function for every attr that is not // inferred from an input and doesn't have a default value you wish to // keep. // `value` must point to a string of length `length` bytes. -extern void TF_SetAttrString(TF_OperationDescription* desc, - const char* attr_name, const void* value, - size_t length); +TF_CAPI_EXPORT extern void TF_SetAttrString(TF_OperationDescription* desc, + const char* attr_name, + const void* value, size_t length); // `values` and `lengths` each must have lengths `num_values`. // `values[i]` must point to a string of length `lengths[i]` bytes. 
-extern void TF_SetAttrStringList(TF_OperationDescription* desc, - const char* attr_name, - const void* const* values, - const size_t* lengths, int num_values); -extern void TF_SetAttrInt(TF_OperationDescription* desc, const char* attr_name, - int64_t value); -extern void TF_SetAttrIntList(TF_OperationDescription* desc, - const char* attr_name, const int64_t* values, - int num_values); -extern void TF_SetAttrFloat(TF_OperationDescription* desc, - const char* attr_name, float value); -extern void TF_SetAttrFloatList(TF_OperationDescription* desc, - const char* attr_name, const float* values, - int num_values); -extern void TF_SetAttrBool(TF_OperationDescription* desc, const char* attr_name, - unsigned char value); -extern void TF_SetAttrBoolList(TF_OperationDescription* desc, - const char* attr_name, - const unsigned char* values, int num_values); -extern void TF_SetAttrType(TF_OperationDescription* desc, const char* attr_name, - TF_DataType value); -extern void TF_SetAttrTypeList(TF_OperationDescription* desc, - const char* attr_name, const TF_DataType* values, - int num_values); +TF_CAPI_EXPORT extern void TF_SetAttrStringList(TF_OperationDescription* desc, + const char* attr_name, + const void* const* values, + const size_t* lengths, + int num_values); +TF_CAPI_EXPORT extern void TF_SetAttrInt(TF_OperationDescription* desc, + const char* attr_name, int64_t value); +TF_CAPI_EXPORT extern void TF_SetAttrIntList(TF_OperationDescription* desc, + const char* attr_name, + const int64_t* values, + int num_values); +TF_CAPI_EXPORT extern void TF_SetAttrFloat(TF_OperationDescription* desc, + const char* attr_name, float value); +TF_CAPI_EXPORT extern void TF_SetAttrFloatList(TF_OperationDescription* desc, + const char* attr_name, + const float* values, + int num_values); +TF_CAPI_EXPORT extern void TF_SetAttrBool(TF_OperationDescription* desc, + const char* attr_name, + unsigned char value); +TF_CAPI_EXPORT extern void TF_SetAttrBoolList(TF_OperationDescription* desc, + const char* attr_name, + const unsigned char* values, + int num_values); +TF_CAPI_EXPORT extern void TF_SetAttrType(TF_OperationDescription* desc, + const char* attr_name, + TF_DataType value); +TF_CAPI_EXPORT extern void TF_SetAttrTypeList(TF_OperationDescription* desc, + const char* attr_name, + const TF_DataType* values, + int num_values); // Set `num_dims` to -1 to represent "unknown rank". Otherwise, // `dims` points to an array of length `num_dims`. `dims[i]` must be // >= -1, with -1 meaning "unknown dimension". -extern void TF_SetAttrShape(TF_OperationDescription* desc, - const char* attr_name, const int64_t* dims, - int num_dims); +TF_CAPI_EXPORT extern void TF_SetAttrShape(TF_OperationDescription* desc, + const char* attr_name, + const int64_t* dims, int num_dims); // `dims` and `num_dims` must point to arrays of length `num_shapes`. // Set `num_dims[i]` to -1 to represent "unknown rank". Otherwise, // `dims[i]` points to an array of length `num_dims[i]`. `dims[i][j]` // must be >= -1, with -1 meaning "unknown dimension". -extern void TF_SetAttrShapeList(TF_OperationDescription* desc, - const char* attr_name, - const int64_t* const* dims, const int* num_dims, - int num_shapes); +TF_CAPI_EXPORT extern void TF_SetAttrShapeList(TF_OperationDescription* desc, + const char* attr_name, + const int64_t* const* dims, + const int* num_dims, + int num_shapes); // `proto` must point to an array of `proto_len` bytes representing a // binary-serialized TensorShapeProto. 
-extern void TF_SetAttrTensorShapeProto(TF_OperationDescription* desc, - const char* attr_name, const void* proto, - size_t proto_len, TF_Status* status); +TF_CAPI_EXPORT extern void TF_SetAttrTensorShapeProto( + TF_OperationDescription* desc, const char* attr_name, const void* proto, + size_t proto_len, TF_Status* status); // `protos` and `proto_lens` must point to arrays of length `num_shapes`. // `protos[i]` must point to an array of `proto_lens[i]` bytes // representing a binary-serialized TensorShapeProto. -extern void TF_SetAttrTensorShapeProtoList(TF_OperationDescription* desc, - const char* attr_name, - const void* const* protos, - const size_t* proto_lens, - int num_shapes, TF_Status* status); +TF_CAPI_EXPORT extern void TF_SetAttrTensorShapeProtoList( + TF_OperationDescription* desc, const char* attr_name, + const void* const* protos, const size_t* proto_lens, int num_shapes, + TF_Status* status); -extern void TF_SetAttrTensor(TF_OperationDescription* desc, - const char* attr_name, TF_Tensor* value, - TF_Status* status); -extern void TF_SetAttrTensorList(TF_OperationDescription* desc, - const char* attr_name, - TF_Tensor* const* values, int num_values, - TF_Status* status); +TF_CAPI_EXPORT extern void TF_SetAttrTensor(TF_OperationDescription* desc, + const char* attr_name, + TF_Tensor* value, + TF_Status* status); +TF_CAPI_EXPORT extern void TF_SetAttrTensorList(TF_OperationDescription* desc, + const char* attr_name, + TF_Tensor* const* values, + int num_values, + TF_Status* status); // `proto` should point to a sequence of bytes of length `proto_len` // representing a binary serialization of an AttrValue protocol // buffer. -extern void TF_SetAttrValueProto(TF_OperationDescription* desc, - const char* attr_name, const void* proto, - size_t proto_len, TF_Status* status); +TF_CAPI_EXPORT extern void TF_SetAttrValueProto(TF_OperationDescription* desc, + const char* attr_name, + const void* proto, + size_t proto_len, + TF_Status* status); // If this function succeeds: // * *status is set to an OK value, @@ -511,37 +558,38 @@ extern void TF_SetAttrValueProto(TF_OperationDescription* desc, // * the graph is not modified, // * a null value is returned. // In either case, it deletes `desc`. -extern TF_Operation* TF_FinishOperation(TF_OperationDescription* desc, - TF_Status* status); +TF_CAPI_EXPORT extern TF_Operation* TF_FinishOperation( + TF_OperationDescription* desc, TF_Status* status); // TF_Operation functions. Operations are immutable once created, so // these are all query functions. 
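Editorial note: the query functions declared just below are side-effect free; dumping one node might look like this (a minimal sketch):

```c
#include <stdio.h>
#include "tensorflow/c/c_api.h"

/* Sketch: print basic metadata for one operation. */
void describe_op(TF_Operation* oper) {
  printf("%s (%s) device='%s'\n", TF_OperationName(oper),
         TF_OperationOpType(oper), TF_OperationDevice(oper));
  for (int i = 0; i < TF_OperationNumOutputs(oper); ++i) {
    TF_Output out = {oper, i};
    printf("  output %d: dtype=%d, consumers=%d\n", i,
           (int)TF_OperationOutputType(out),
           TF_OperationOutputNumConsumers(out));
  }
}
```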
-extern const char* TF_OperationName(TF_Operation* oper); -extern const char* TF_OperationOpType(TF_Operation* oper); -extern const char* TF_OperationDevice(TF_Operation* oper); +TF_CAPI_EXPORT extern const char* TF_OperationName(TF_Operation* oper); +TF_CAPI_EXPORT extern const char* TF_OperationOpType(TF_Operation* oper); +TF_CAPI_EXPORT extern const char* TF_OperationDevice(TF_Operation* oper); -extern int TF_OperationNumOutputs(TF_Operation* oper); -extern TF_DataType TF_OperationOutputType(TF_Output oper_out); -extern int TF_OperationOutputListLength(TF_Operation* oper, - const char* arg_name, - TF_Status* status); +TF_CAPI_EXPORT extern int TF_OperationNumOutputs(TF_Operation* oper); +TF_CAPI_EXPORT extern TF_DataType TF_OperationOutputType(TF_Output oper_out); +TF_CAPI_EXPORT extern int TF_OperationOutputListLength(TF_Operation* oper, + const char* arg_name, + TF_Status* status); -extern int TF_OperationNumInputs(TF_Operation* oper); -extern TF_DataType TF_OperationInputType(TF_Input oper_in); -extern int TF_OperationInputListLength(TF_Operation* oper, const char* arg_name, - TF_Status* status); +TF_CAPI_EXPORT extern int TF_OperationNumInputs(TF_Operation* oper); +TF_CAPI_EXPORT extern TF_DataType TF_OperationInputType(TF_Input oper_in); +TF_CAPI_EXPORT extern int TF_OperationInputListLength(TF_Operation* oper, + const char* arg_name, + TF_Status* status); // In this code: // TF_Output producer = TF_OperationInput(consumer); // There is an edge from producer.oper's output (given by // producer.index) to consumer.oper's input (given by consumer.index). -extern TF_Output TF_OperationInput(TF_Input oper_in); +TF_CAPI_EXPORT extern TF_Output TF_OperationInput(TF_Input oper_in); // Get the number of current consumers of a specific output of an // operation. Note that this number can change when new operations // are added to the graph. -extern int TF_OperationOutputNumConsumers(TF_Output oper_out); +TF_CAPI_EXPORT extern int TF_OperationOutputNumConsumers(TF_Output oper_out); // Get list of all current consumers of a specific output of an // operation. `consumers` must point to an array of length at least @@ -550,24 +598,24 @@ extern int TF_OperationOutputNumConsumers(TF_Output oper_out); // modification of the graph can increase the number of consumers of // an operation. Returns the number of output consumers (should match // TF_OperationOutputNumConsumers(oper_out)). -extern int TF_OperationOutputConsumers(TF_Output oper_out, TF_Input* consumers, - int max_consumers); +TF_CAPI_EXPORT extern int TF_OperationOutputConsumers(TF_Output oper_out, + TF_Input* consumers, + int max_consumers); // Get the number of control inputs to an operation. -extern int TF_OperationNumControlInputs(TF_Operation* oper); +TF_CAPI_EXPORT extern int TF_OperationNumControlInputs(TF_Operation* oper); // Get list of all control inputs to an operation. `control_inputs` must // point to an array of length `max_control_inputs` (ideally set to // TF_OperationNumControlInputs(oper)). Returns the number of control // inputs (should match TF_OperationNumControlInputs(oper)). -extern int TF_OperationGetControlInputs(TF_Operation* oper, - TF_Operation** control_inputs, - int max_control_inputs); +TF_CAPI_EXPORT extern int TF_OperationGetControlInputs( + TF_Operation* oper, TF_Operation** control_inputs, int max_control_inputs); // Get the number of operations that have `*oper` as a control input. // Note that this number can change when new operations are added to // the graph. 
-extern int TF_OperationNumControlOutputs(TF_Operation* oper); +TF_CAPI_EXPORT extern int TF_OperationNumControlOutputs(TF_Operation* oper); // Get the list of operations that have `*oper` as a control input. // `control_outputs` must point to an array of length at least @@ -576,12 +624,12 @@ extern int TF_OperationNumControlOutputs(TF_Operation* oper); // modification of the graph can increase the number of control // outputs. Returns the number of control outputs (should match // TF_OperationNumControlOutputs(oper)). -extern int TF_OperationGetControlOutputs(TF_Operation* oper, - TF_Operation** control_outputs, - int max_control_outputs); +TF_CAPI_EXPORT extern int TF_OperationGetControlOutputs( + TF_Operation* oper, TF_Operation** control_outputs, + int max_control_outputs); // TF_AttrType describes the type of the value of an attribute on an operation. -typedef enum { +typedef enum TF_AttrType { TF_ATTR_STRING = 0, TF_ATTR_INT = 1, TF_ATTR_FLOAT = 2, @@ -625,17 +673,18 @@ typedef struct TF_AttrMetadata { } TF_AttrMetadata; // Returns metadata about the value of the attribute `attr_name` of `oper`. -extern TF_AttrMetadata TF_OperationGetAttrMetadata(TF_Operation* oper, - const char* attr_name, - TF_Status* status); +TF_CAPI_EXPORT extern TF_AttrMetadata TF_OperationGetAttrMetadata( + TF_Operation* oper, const char* attr_name, TF_Status* status); // Fills in `value` with the value of the attribute `attr_name`. `value` must // point to an array of length at least `max_length` (ideally set to // TF_AttrMetadata.total_size from TF_OperationGetAttrMetadata(oper, // attr_name)). -extern void TF_OperationGetAttrString(TF_Operation* oper, const char* attr_name, - void* value, size_t max_length, - TF_Status* status); +TF_CAPI_EXPORT extern void TF_OperationGetAttrString(TF_Operation* oper, + const char* attr_name, + void* value, + size_t max_length, + TF_Status* status); // Get the list of strings in the value of the attribute `attr_name`. Fills in // `values` and `lengths`, each of which must point to an array of length at @@ -648,64 +697,78 @@ extern void TF_OperationGetAttrString(TF_Operation* oper, const char* attr_name, // attr_name). // // Fails if storage_size is too small to hold the requested number of strings. -extern void TF_OperationGetAttrStringList(TF_Operation* oper, - const char* attr_name, void** values, - size_t* lengths, int max_values, - void* storage, size_t storage_size, - TF_Status* status); +TF_CAPI_EXPORT extern void TF_OperationGetAttrStringList( + TF_Operation* oper, const char* attr_name, void** values, size_t* lengths, + int max_values, void* storage, size_t storage_size, TF_Status* status); -extern void TF_OperationGetAttrInt(TF_Operation* oper, const char* attr_name, - int64_t* value, TF_Status* status); +TF_CAPI_EXPORT extern void TF_OperationGetAttrInt(TF_Operation* oper, + const char* attr_name, + int64_t* value, + TF_Status* status); // Fills in `values` with the value of the attribute `attr_name` of `oper`. // `values` must point to an array of length at least `max_values` (ideally set // TF_AttrMetadata.list_size from TF_OperationGetAttrMetadata(oper, // attr_name)). 
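One way TF_AttrMetadata is meant to be used, sketched for the string getter above; this assumes a non-list, string-typed attribute, and the caller owns the returned buffer:

```c
#include <stdlib.h>
#include "tensorflow/c/c_api.h"

/* Read a string attribute, sizing the buffer from TF_AttrMetadata.
 * Attribute strings are not NUL-terminated, so one is appended. */
char* GetStringAttr(TF_Operation* oper, const char* attr_name,
                    TF_Status* status) {
  TF_AttrMetadata meta = TF_OperationGetAttrMetadata(oper, attr_name, status);
  if (TF_GetCode(status) != TF_OK || meta.type != TF_ATTR_STRING) return NULL;
  char* buf = (char*)malloc(meta.total_size + 1);
  TF_OperationGetAttrString(oper, attr_name, buf, meta.total_size, status);
  buf[meta.total_size] = '\0';
  return buf;
}
```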
-extern void TF_OperationGetAttrIntList(TF_Operation* oper, - const char* attr_name, int64_t* values, - int max_values, TF_Status* status); +TF_CAPI_EXPORT extern void TF_OperationGetAttrIntList(TF_Operation* oper, + const char* attr_name, + int64_t* values, + int max_values, + TF_Status* status); -extern void TF_OperationGetAttrFloat(TF_Operation* oper, const char* attr_name, - float* value, TF_Status* status); +TF_CAPI_EXPORT extern void TF_OperationGetAttrFloat(TF_Operation* oper, + const char* attr_name, + float* value, + TF_Status* status); // Fills in `values` with the value of the attribute `attr_name` of `oper`. // `values` must point to an array of length at least `max_values` (ideally set // to TF_AttrMetadata.list_size from TF_OperationGetAttrMetadata(oper, // attr_name)). -extern void TF_OperationGetAttrFloatList(TF_Operation* oper, - const char* attr_name, float* values, - int max_values, TF_Status* status); +TF_CAPI_EXPORT extern void TF_OperationGetAttrFloatList(TF_Operation* oper, + const char* attr_name, + float* values, + int max_values, + TF_Status* status); -extern void TF_OperationGetAttrBool(TF_Operation* oper, const char* attr_name, - unsigned char* value, TF_Status* status); +TF_CAPI_EXPORT extern void TF_OperationGetAttrBool(TF_Operation* oper, + const char* attr_name, + unsigned char* value, + TF_Status* status); // Fills in `values` with the value of the attribute `attr_name` of `oper`. // `values` must point to an array of length at least `max_values` (ideally set // to TF_AttrMetadata.list_size from TF_OperationGetAttrMetadata(oper, // attr_name)). -extern void TF_OperationGetAttrBoolList(TF_Operation* oper, - const char* attr_name, - unsigned char* values, int max_values, - TF_Status* status); +TF_CAPI_EXPORT extern void TF_OperationGetAttrBoolList(TF_Operation* oper, + const char* attr_name, + unsigned char* values, + int max_values, + TF_Status* status); -extern void TF_OperationGetAttrType(TF_Operation* oper, const char* attr_name, - TF_DataType* value, TF_Status* status); +TF_CAPI_EXPORT extern void TF_OperationGetAttrType(TF_Operation* oper, + const char* attr_name, + TF_DataType* value, + TF_Status* status); // Fills in `values` with the value of the attribute `attr_name` of `oper`. // `values` must point to an array of length at least `max_values` (ideally set // to TF_AttrMetadata.list_size from TF_OperationGetAttrMetadata(oper, // attr_name)). -extern void TF_OperationGetAttrTypeList(TF_Operation* oper, - const char* attr_name, - TF_DataType* values, int max_values, - TF_Status* status); +TF_CAPI_EXPORT extern void TF_OperationGetAttrTypeList(TF_Operation* oper, + const char* attr_name, + TF_DataType* values, + int max_values, + TF_Status* status); // Fills in `value` with the value of the attribute `attr_name` of `oper`. // `values` must point to an array of length at least `num_dims` (ideally set to // TF_Attr_Meta.size from TF_OperationGetAttrMetadata(oper, attr_name)). -extern void TF_OperationGetAttrShape(TF_Operation* oper, const char* attr_name, - int64_t* value, int num_dims, - TF_Status* status); +TF_CAPI_EXPORT extern void TF_OperationGetAttrShape(TF_Operation* oper, + const char* attr_name, + int64_t* value, + int num_dims, + TF_Status* status); // Fills in `dims` with the list of shapes in the attribute `attr_name` of // `oper` and `num_dims` with the corresponding number of dimensions. On return, @@ -720,35 +783,32 @@ extern void TF_OperationGetAttrShape(TF_Operation* oper, const char* attr_name, // attr_name). 
// // Fails if storage_size is insufficient to hold the requested shapes. -extern void TF_OperationGetAttrShapeList(TF_Operation* oper, - const char* attr_name, int64_t** dims, - int* num_dims, int num_shapes, - int64_t* storage, int storage_size, - TF_Status* status); +TF_CAPI_EXPORT extern void TF_OperationGetAttrShapeList( + TF_Operation* oper, const char* attr_name, int64_t** dims, int* num_dims, + int num_shapes, int64_t* storage, int storage_size, TF_Status* status); // Sets `value` to the binary-serialized TensorShapeProto of the value of // `attr_name` attribute of `oper`'. -extern void TF_OperationGetAttrTensorShapeProto(TF_Operation* oper, - const char* attr_name, - TF_Buffer* value, - TF_Status* status); +TF_CAPI_EXPORT extern void TF_OperationGetAttrTensorShapeProto( + TF_Operation* oper, const char* attr_name, TF_Buffer* value, + TF_Status* status); // Fills in `values` with binary-serialized TensorShapeProto values of the // attribute `attr_name` of `oper`. `values` must point to an array of length at // least `num_values` (ideally set to TF_AttrMetadata.list_size from // TF_OperationGetAttrMetadata(oper, attr_name)). -extern void TF_OperationGetAttrTensorShapeProtoList(TF_Operation* oper, - const char* attr_name, - TF_Buffer** values, - int max_values, - TF_Status* status); +TF_CAPI_EXPORT extern void TF_OperationGetAttrTensorShapeProtoList( + TF_Operation* oper, const char* attr_name, TF_Buffer** values, + int max_values, TF_Status* status); // Gets the TF_Tensor valued attribute of `attr_name` of `oper`. // // Allocates a new TF_Tensor which the caller is expected to take // ownership of (and can deallocate using TF_DeleteTensor). -extern void TF_OperationGetAttrTensor(TF_Operation* oper, const char* attr_name, - TF_Tensor** value, TF_Status* status); +TF_CAPI_EXPORT extern void TF_OperationGetAttrTensor(TF_Operation* oper, + const char* attr_name, + TF_Tensor** value, + TF_Status* status); // Fills in `values` with the TF_Tensor values of the attribute `attr_name` of // `oper`. `values` must point to an array of TF_Tensor* of length at least @@ -757,22 +817,22 @@ extern void TF_OperationGetAttrTensor(TF_Operation* oper, const char* attr_name, // // The caller takes ownership of all the non-null TF_Tensor* entries in `values` // (which can be deleted using TF_DeleteTensor(values[i])). -extern void TF_OperationGetAttrTensorList(TF_Operation* oper, - const char* attr_name, - TF_Tensor** values, int max_values, - TF_Status* status); +TF_CAPI_EXPORT extern void TF_OperationGetAttrTensorList(TF_Operation* oper, + const char* attr_name, + TF_Tensor** values, + int max_values, + TF_Status* status); // Sets `output_attr_value` to the binary-serialized AttrValue proto // representation of the value of the `attr_name` attr of `oper`. -extern void TF_OperationGetAttrValueProto(TF_Operation* oper, - const char* attr_name, - TF_Buffer* output_attr_value, - TF_Status* status); +TF_CAPI_EXPORT extern void TF_OperationGetAttrValueProto( + TF_Operation* oper, const char* attr_name, TF_Buffer* output_attr_value, + TF_Status* status); // Returns the operation in the graph with `oper_name`. Returns nullptr if // no operation found. -extern TF_Operation* TF_GraphOperationByName(TF_Graph* graph, - const char* oper_name); +TF_CAPI_EXPORT extern TF_Operation* TF_GraphOperationByName( + TF_Graph* graph, const char* oper_name); // Iterate through the operations of a graph. 
To use: // size_t pos = 0; @@ -780,48 +840,60 @@ extern TF_Operation* TF_GraphOperationByName(TF_Graph* graph, // while ((oper = TF_GraphNextOperation(graph, &pos)) != nullptr) { // DoSomethingWithOperation(oper); // } -extern TF_Operation* TF_GraphNextOperation(TF_Graph* graph, size_t* pos); +TF_CAPI_EXPORT extern TF_Operation* TF_GraphNextOperation(TF_Graph* graph, + size_t* pos); // Write out a serialized representation of `graph` (as a GraphDef protocol // message) to `output_graph_def` (allocated by TF_NewBuffer()). +// `output_graph_def`'s underlying buffer will be freed when TF_DeleteBuffer() +// is called. // // May fail on very large graphs in the future. -extern void TF_GraphToGraphDef(TF_Graph* graph, TF_Buffer* output_graph_def, - TF_Status* status); +TF_CAPI_EXPORT extern void TF_GraphToGraphDef(TF_Graph* graph, + TF_Buffer* output_graph_def, + TF_Status* status); // TF_ImportGraphDefOptions holds options that can be passed to // TF_GraphImportGraphDef. typedef struct TF_ImportGraphDefOptions TF_ImportGraphDefOptions; -extern TF_ImportGraphDefOptions* TF_NewImportGraphDefOptions(); -extern void TF_DeleteImportGraphDefOptions(TF_ImportGraphDefOptions* opts); +TF_CAPI_EXPORT extern TF_ImportGraphDefOptions* TF_NewImportGraphDefOptions(); +TF_CAPI_EXPORT extern void TF_DeleteImportGraphDefOptions( + TF_ImportGraphDefOptions* opts); // Set the prefix to be prepended to the names of nodes in `graph_def` that will // be imported into `graph`. -extern void TF_ImportGraphDefOptionsSetPrefix(TF_ImportGraphDefOptions* opts, - const char* prefix); +TF_CAPI_EXPORT extern void TF_ImportGraphDefOptionsSetPrefix( + TF_ImportGraphDefOptions* opts, const char* prefix); // Set any imported nodes with input `src_name:src_index` to have that input // replaced with `dst`. `src_name` refers to a node in the graph to be imported, // `dst` references a node already existing in the graph being imported into. -extern void TF_ImportGraphDefOptionsAddInputMapping( +TF_CAPI_EXPORT extern void TF_ImportGraphDefOptionsAddInputMapping( TF_ImportGraphDefOptions* opts, const char* src_name, int src_index, TF_Output dst); +// Set any imported nodes with control input `src_name` to have that input +// replaced with `dst`. `src_name` refers to a node in the graph to be imported, +// `dst` references an operation already existing in the graph being imported +// into. +TF_CAPI_EXPORT extern void TF_ImportGraphDefOptionsRemapControlDependency( + TF_ImportGraphDefOptions* opts, const char* src_name, TF_Operation* dst); + // Cause the imported graph to have a control dependency on `oper`. `oper` // should exist in the graph being imported into. -extern void TF_ImportGraphDefOptionsAddControlDependency( +TF_CAPI_EXPORT extern void TF_ImportGraphDefOptionsAddControlDependency( TF_ImportGraphDefOptions* opts, TF_Operation* oper); // Add an output in `graph_def` to be returned via the `return_outputs` output // parameter of TF_GraphImportGraphDef(). If the output is remapped via an input // mapping, the corresponding existing tensor in `graph` will be returned. -extern void TF_ImportGraphDefOptionsAddReturnOutput( +TF_CAPI_EXPORT extern void TF_ImportGraphDefOptionsAddReturnOutput( TF_ImportGraphDefOptions* opts, const char* oper_name, int index); // Returns the number of return outputs added via // TF_ImportGraphDefOptionsAddReturnOutput(). 
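A sketch of the export/import path these declarations describe, using TF_GraphImportGraphDef() from just below; error handling is reduced to a single check:

```c
#include "tensorflow/c/c_api.h"

/* Serialize `src` to a GraphDef and import it into `dst` under a prefix,
 * so the imported nodes appear as "copied/...". */
void CopyGraph(TF_Graph* src, TF_Graph* dst, TF_Status* status) {
  TF_Buffer* graph_def = TF_NewBuffer();
  TF_GraphToGraphDef(src, graph_def, status);
  if (TF_GetCode(status) == TF_OK) {
    TF_ImportGraphDefOptions* opts = TF_NewImportGraphDefOptions();
    TF_ImportGraphDefOptionsSetPrefix(opts, "copied");
    TF_GraphImportGraphDef(dst, graph_def, opts, status);
    TF_DeleteImportGraphDefOptions(opts);
  }
  TF_DeleteBuffer(graph_def);
}
```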
-extern int TF_ImportGraphDefOptionsNumReturnOutputs( +TF_CAPI_EXPORT extern int TF_ImportGraphDefOptionsNumReturnOutputs( const TF_ImportGraphDefOptions* opts); // Import the graph serialized in `graph_def` into `graph`. @@ -830,24 +902,103 @@ extern int TF_ImportGraphDefOptionsNumReturnOutputs( // result of TF_ImportGraphDefOptionsNumReturnOutputs()). If // `num_return_outputs` is non-zero, `return_outputs` must be of length // `num_return_outputs`. Otherwise it can be null. -extern void TF_GraphImportGraphDefWithReturnOutputs( +TF_CAPI_EXPORT extern void TF_GraphImportGraphDefWithReturnOutputs( TF_Graph* graph, const TF_Buffer* graph_def, const TF_ImportGraphDefOptions* options, TF_Output* return_outputs, int num_return_outputs, TF_Status* status); // Import the graph serialized in `graph_def` into `graph`. // Convenience function for when no return outputs have been added. -extern void TF_GraphImportGraphDef(TF_Graph* graph, const TF_Buffer* graph_def, - const TF_ImportGraphDefOptions* options, - TF_Status* status); +TF_CAPI_EXPORT extern void TF_GraphImportGraphDef( + TF_Graph* graph, const TF_Buffer* graph_def, + const TF_ImportGraphDefOptions* options, TF_Status* status); // Note: The following function may fail on very large protos in the future. -extern void TF_OperationToNodeDef(TF_Operation* oper, - TF_Buffer* output_node_def, - TF_Status* status); +TF_CAPI_EXPORT extern void TF_OperationToNodeDef(TF_Operation* oper, + TF_Buffer* output_node_def, + TF_Status* status); -// TODO(andydavis): Function to add gradients to a graph. +typedef struct TF_WhileParams { + // The number of inputs to the while loop, i.e. the number of loop variables. + // This is the size of cond_inputs, body_inputs, and body_outputs. + const int ninputs; + + // The while condition graph. The inputs are the current values of the loop + // variables. The output should be a scalar boolean. + TF_Graph* const cond_graph; + const TF_Output* const cond_inputs; + TF_Output cond_output; + + // The loop body graph. The inputs are the current values of the loop + // variables. The outputs are the updated values of the loop variables. + TF_Graph* const body_graph; + const TF_Output* const body_inputs; + TF_Output* const body_outputs; + + // Unique null-terminated name for this while loop. This is used as a prefix + // for created operations. + const char* name; +} TF_WhileParams; + +// Creates a TF_WhileParams for creating a while loop in `g`. `inputs` are +// outputs that already exist in `g` used as initial values for the loop +// variables. +// +// The returned TF_WhileParams will have all fields initialized except +// `cond_output`, `body_outputs`, and `name`. The `body_outputs` buffer will be +// allocated to size `ninputs`. The caller should build `cond_graph` and +// `body_graph` starting from the inputs, and store the final outputs in +// `cond_output` and `body_outputs`. +// +// If `status` is OK, the caller must call either TF_FinishWhile or +// TF_AbortWhile on the returned TF_WhileParams. If `status` isn't OK, the +// returned TF_WhileParams is not valid, and the caller should not call +// TF_FinishWhile() or TF_AbortWhile(). 
+// +// Missing functionality (TODO): +// - Gradients +// - Reference-type inputs +// - Directly referencing external tensors from the cond/body graphs (this is +//   possible in the Python API) +TF_CAPI_EXPORT extern TF_WhileParams TF_NewWhile(TF_Graph* g, TF_Output* inputs, +                                                 int ninputs, +                                                 TF_Status* status); + +// Builds the while loop specified by `params` and returns the output tensors of +// the while loop in `outputs`. `outputs` should be allocated to size +// `params.ninputs`. +// +// `params` is no longer valid once this returns. +// +// Either this or TF_AbortWhile() must be called after a successful +// TF_NewWhile() call. +TF_CAPI_EXPORT extern void TF_FinishWhile(const TF_WhileParams* params, +                                          TF_Status* status, +                                          TF_Output* outputs); + +// Frees `params`'s resources without building a while loop. `params` is no +// longer valid after this returns. Either this or TF_FinishWhile() must be +// called after a successful TF_NewWhile() call. +TF_CAPI_EXPORT extern void TF_AbortWhile(const TF_WhileParams* params); + +// Adds operations to compute the partial derivatives of the sum of `y`s w.r.t. `x`s, +// i.e., d(y_1 + y_2 + ...)/dx_1, d(y_1 + y_2 + ...)/dx_2... +// `dx` are used as initial gradients (which represent the symbolic partial +// derivatives of some loss function `L` w.r.t. `y`). +// `dx` must be nullptr or have size `ny`. +// If `dx` is nullptr, the implementation will seed the gradients with ones of +// the same shapes as `y` (i.e., `OnesLike`). +// The partial derivatives are returned in `dy`. `dy` should be allocated to +// size `nx`. +// +// WARNING: This function does not yet support all the gradients that Python +// supports. See +// https://www.tensorflow.org/code/tensorflow/cc/gradients/README.md +// for instructions on how to add more C++ gradients. +TF_CAPI_EXPORT void TF_AddGradients(TF_Graph* g, TF_Output* y, int ny, +                                    TF_Output* x, int nx, TF_Output* dx, +                                    TF_Status* status, TF_Output* dy); // TODO(josh11b): Register OpDef, available to all operations added // to this graph. @@ -855,7 +1006,6 @@ extern void TF_OperationToNodeDef(TF_Operation* oper, // The following two may both benefit from a subgraph-definition API // that re-uses most of the graph-definition API. // TODO(andydavis): Add functions to a graph. -// TODO(yuanbyu): Add while loop to graph. // -------------------------------------------------------------------------- // API for driving Graph execution. @@ -867,12 +1017,9 @@ typedef struct TF_Session TF_Session; // *graph must be a valid graph (not deleted or nullptr). This function will // prevent the graph from being deleted until TF_DeleteSession() is called. // Does not take ownership of opts. -extern TF_Session* TF_NewSession(TF_Graph* graph, const TF_SessionOptions* opts, -                                 TF_Status* status); - -#ifndef __ANDROID__ -// TODO(ashankar): Remove the __ANDROID__ guard. This will require ensuring that -// the tensorflow/cc/saved_model:loader build target is Android friendly. +TF_CAPI_EXPORT extern TF_Session* TF_NewSession(TF_Graph* graph, +                                                const TF_SessionOptions* opts, +                                                TF_Status* status); // This function creates a new TF_Session (created only on success) using // `session_options`, and then initializes state (restoring tensors and other @@ -888,17 +1035,16 @@ extern TF_Session* TF_NewSession(TF_Graph* graph, const TF_SessionOptions* opts, // // If successful, populates `graph` with the contents of the Graph and // `meta_graph_def` with the MetaGraphDef of the loaded model.
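A sketch of the TF_NewWhile()/TF_FinishWhile() protocol documented above, for a single int32 loop variable. `BuildCond` and `BuildBody` are hypothetical helpers that add ordinary operations to `params.cond_graph` and `params.body_graph` and return the resulting outputs:

```c
#include <stddef.h>
#include "tensorflow/c/c_api.h"

TF_Output BuildCond(TF_Graph* g, TF_Output in, TF_Status* s); /* hypothetical */
TF_Output BuildBody(TF_Graph* g, TF_Output in, TF_Status* s); /* hypothetical */

/* Wrap one loop variable `init` in a while loop; return its final value. */
TF_Output MakeLoop(TF_Graph* g, TF_Output init, TF_Status* status) {
  TF_WhileParams params = TF_NewWhile(g, &init, 1, status);
  TF_Output result = {NULL, -1};
  if (TF_GetCode(status) != TF_OK) return result;  /* params is not valid */
  params.name = "my_loop";
  /* Build the condition and body subgraphs from the provided inputs. */
  params.cond_output = BuildCond(params.cond_graph, params.cond_inputs[0], status);
  params.body_outputs[0] = BuildBody(params.body_graph, params.body_inputs[0], status);
  /* `params` is no longer valid once TF_FinishWhile returns. */
  TF_FinishWhile(&params, status, &result);
  return result;
}
```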
-TF_Session* TF_LoadSessionFromSavedModel( +TF_CAPI_EXPORT extern TF_Session* TF_LoadSessionFromSavedModel( const TF_SessionOptions* session_options, const TF_Buffer* run_options, const char* export_dir, const char* const* tags, int tags_len, TF_Graph* graph, TF_Buffer* meta_graph_def, TF_Status* status); -#endif // __ANDROID__ // Close a session. // // Contacts any other processes associated with the session, if applicable. // May not be called after TF_DeleteSession(). -extern void TF_CloseSession(TF_Session*, TF_Status* status); +TF_CAPI_EXPORT extern void TF_CloseSession(TF_Session*, TF_Status* status); // Destroy a session object. // @@ -906,7 +1052,7 @@ extern void TF_CloseSession(TF_Session*, TF_Status* status); // local resources associated with the session. The session may not be used // during or after this call (and the session drops its reference to the // corresponding graph). -extern void TF_DeleteSession(TF_Session*, TF_Status* status); +TF_CAPI_EXPORT extern void TF_DeleteSession(TF_Session*, TF_Status* status); // Run the graph associated with the session starting with the supplied inputs // (inputs[0,ninputs-1] with corresponding values in input_values[0,ninputs-1]). @@ -932,58 +1078,61 @@ extern void TF_DeleteSession(TF_Session*, TF_Status* status); // to the caller, which must eventually call TF_DeleteTensor on them. // // On failure, output_values[] contains NULLs. -extern void TF_SessionRun(TF_Session* session, - // RunOptions - const TF_Buffer* run_options, - // Input tensors - const TF_Output* inputs, - TF_Tensor* const* input_values, int ninputs, - // Output tensors - const TF_Output* outputs, TF_Tensor** output_values, - int noutputs, - // Target operations - const TF_Operation* const* target_opers, int ntargets, - // RunMetadata - TF_Buffer* run_metadata, - // Output status - TF_Status*); +TF_CAPI_EXPORT extern void TF_SessionRun( + TF_Session* session, + // RunOptions + const TF_Buffer* run_options, + // Input tensors + const TF_Output* inputs, TF_Tensor* const* input_values, int ninputs, + // Output tensors + const TF_Output* outputs, TF_Tensor** output_values, int noutputs, + // Target operations + const TF_Operation* const* target_opers, int ntargets, + // RunMetadata + TF_Buffer* run_metadata, + // Output status + TF_Status*); // Set up the graph with the intended feeds (inputs) and fetches (outputs) for a // sequence of partial run calls. // -// On success, returns a handle that is used for subsequent PRun calls. +// On success, returns a handle that is used for subsequent PRun calls. The +// handle should be deleted with TF_DeletePRunHandle when it is no longer +// needed. // // On failure, out_status contains a tensorflow::Status with an error // message. // NOTE: This is EXPERIMENTAL and subject to change. -extern void TF_SessionPRunSetup(TF_Session*, - // Input names - const TF_Output* inputs, int ninputs, - // Output names - const TF_Output* outputs, int noutputs, - // Target operations - const TF_Operation* const* target_opers, - int ntargets, - // Output handle - const char** handle, - // Output status - TF_Status*); +TF_CAPI_EXPORT extern void TF_SessionPRunSetup( + TF_Session*, + // Input names + const TF_Output* inputs, int ninputs, + // Output names + const TF_Output* outputs, int noutputs, + // Target operations + const TF_Operation* const* target_opers, int ntargets, + // Output handle + const char** handle, + // Output status + TF_Status*); // Continue to run the graph with additional feeds and fetches. 
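The TF_SessionRun() contract above, condensed into a single-feed, single-fetch sketch; the caller keeps ownership of `input_tensor` and must eventually delete the returned tensor:

```c
#include <stddef.h>
#include "tensorflow/c/c_api.h"

/* Run the graph once, feeding `input` and fetching `output`. */
TF_Tensor* RunOnce(TF_Session* session, TF_Output input,
                   TF_Tensor* input_tensor, TF_Output output,
                   TF_Status* status) {
  TF_Tensor* result = NULL;
  TF_SessionRun(session, /* run_options */ NULL,
                &input, &input_tensor, 1,  /* feeds */
                &output, &result, 1,       /* fetches */
                NULL, 0,                   /* no target operations */
                /* run_metadata */ NULL, status);
  return result;  /* NULL on failure; else caller calls TF_DeleteTensor */
}
```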
The // execution state is uniquely identified by the handle. // NOTE: This is EXPERIMENTAL and subject to change. -extern void TF_SessionPRun(TF_Session*, const char* handle, - // Input tensors - const TF_Output* inputs, - TF_Tensor* const* input_values, int ninputs, - // Output tensors - const TF_Output* outputs, TF_Tensor** output_values, - int noutputs, - // Target operations - const TF_Operation* const* target_opers, - int ntargets, - // Output status - TF_Status*); +TF_CAPI_EXPORT extern void TF_SessionPRun( + TF_Session*, const char* handle, + // Input tensors + const TF_Output* inputs, TF_Tensor* const* input_values, int ninputs, + // Output tensors + const TF_Output* outputs, TF_Tensor** output_values, int noutputs, + // Target operations + const TF_Operation* const* target_opers, int ntargets, + // Output status + TF_Status*); + +// Deletes a handle allocated by TF_SessionPRunSetup. +// Once called, no more calls to TF_SessionPRun should be made. +TF_CAPI_EXPORT extern void TF_DeletePRunHandle(const char* handle); // -------------------------------------------------------------------------- // The deprecated session API. Please switch to the above instead of @@ -992,39 +1141,96 @@ extern void TF_SessionPRun(TF_Session*, const char* handle, typedef struct TF_DeprecatedSession TF_DeprecatedSession; -extern TF_DeprecatedSession* TF_NewDeprecatedSession(const TF_SessionOptions*, +TF_CAPI_EXPORT extern TF_DeprecatedSession* TF_NewDeprecatedSession( + const TF_SessionOptions*, TF_Status* status); +TF_CAPI_EXPORT extern void TF_CloseDeprecatedSession(TF_DeprecatedSession*, TF_Status* status); -extern void TF_CloseDeprecatedSession(TF_DeprecatedSession*, TF_Status* status); -extern void TF_DeleteDeprecatedSession(TF_DeprecatedSession*, - TF_Status* status); -extern void TF_Reset(const TF_SessionOptions* opt, const char** containers, - int ncontainers, TF_Status* status); +TF_CAPI_EXPORT extern void TF_DeleteDeprecatedSession(TF_DeprecatedSession*, + TF_Status* status); +TF_CAPI_EXPORT extern void TF_Reset(const TF_SessionOptions* opt, + const char** containers, int ncontainers, + TF_Status* status); // Treat the bytes proto[0,proto_len-1] as a serialized GraphDef and // add the nodes in that GraphDef to the graph for the session. // // Prefer use of TF_Session and TF_GraphImportGraphDef over this. -extern void TF_ExtendGraph(TF_DeprecatedSession*, const void* proto, - size_t proto_len, TF_Status*); +TF_CAPI_EXPORT extern void TF_ExtendGraph(TF_DeprecatedSession*, + const void* proto, size_t proto_len, + TF_Status*); // See TF_SessionRun() above. -extern void TF_Run(TF_DeprecatedSession*, const TF_Buffer* run_options, - const char** input_names, TF_Tensor** inputs, int ninputs, - const char** output_names, TF_Tensor** outputs, int noutputs, - const char** target_oper_names, int ntargets, - TF_Buffer* run_metadata, TF_Status*); +TF_CAPI_EXPORT extern void TF_Run(TF_DeprecatedSession*, + const TF_Buffer* run_options, + const char** input_names, TF_Tensor** inputs, + int ninputs, const char** output_names, + TF_Tensor** outputs, int noutputs, + const char** target_oper_names, int ntargets, + TF_Buffer* run_metadata, TF_Status*); // See TF_SessionPRunSetup() above. 
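The partial-run calls above compose as follows; a minimal sketch in the spirit of the SessionPRun test further down, with error checks elided:

```c
#include <stddef.h>
#include "tensorflow/c/c_api.h"

/* Set up a partial run over one feed and one fetch, run it, clean up. */
void PartialRunOnce(TF_Session* session, TF_Output feed, TF_Tensor* feed_val,
                    TF_Output fetch, TF_Status* status) {
  const char* handle = NULL;
  TF_SessionPRunSetup(session, &feed, 1, &fetch, 1, NULL, 0, &handle, status);

  TF_Tensor* out = NULL;
  TF_SessionPRun(session, handle, &feed, &feed_val, 1, &fetch, &out, 1,
                 NULL, 0, status);
  if (out != NULL) TF_DeleteTensor(out);

  TF_DeletePRunHandle(handle);  /* no TF_SessionPRun calls after this */
}
```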
-extern void TF_PRunSetup(TF_DeprecatedSession*, const char** input_names, - int ninputs, const char** output_names, int noutputs, - const char** target_oper_names, int ntargets, - const char** handle, TF_Status*); +TF_CAPI_EXPORT extern void TF_PRunSetup(TF_DeprecatedSession*, + const char** input_names, int ninputs, + const char** output_names, int noutputs, + const char** target_oper_names, + int ntargets, const char** handle, + TF_Status*); // See TF_SessionPRun above. -extern void TF_PRun(TF_DeprecatedSession*, const char* handle, - const char** input_names, TF_Tensor** inputs, int ninputs, - const char** output_names, TF_Tensor** outputs, - int noutputs, const char** target_oper_names, int ntargets, - TF_Status*); +TF_CAPI_EXPORT extern void TF_PRun(TF_DeprecatedSession*, const char* handle, + const char** input_names, TF_Tensor** inputs, + int ninputs, const char** output_names, + TF_Tensor** outputs, int noutputs, + const char** target_oper_names, int ntargets, + TF_Status*); + +typedef struct TF_DeviceList TF_DeviceList; + +// Lists all devices in a TF_Session. +// +// Caller takes ownership of the returned TF_DeviceList* which must eventually +// be freed with a call to TF_DeleteDeviceList. +TF_CAPI_EXPORT extern TF_DeviceList* TF_SessionListDevices(TF_Session* session, + TF_Status* status); + +// Lists all devices in a TF_Session. +// +// Caller takes ownership of the returned TF_DeviceList* which must eventually +// be freed with a call to TF_DeleteDeviceList. +TF_CAPI_EXPORT extern TF_DeviceList* TF_DeprecatedSessionListDevices( + TF_DeprecatedSession* session, TF_Status* status); + +// Deallocates the device list. +TF_CAPI_EXPORT extern void TF_DeleteDeviceList(TF_DeviceList* list); + +// Counts the number of elements in the device list. +TF_CAPI_EXPORT extern int TF_DeviceListCount(const TF_DeviceList* list); + +// Retrieves the full name of the device (e.g. /job:worker/replica:0/...) +// The return value will be a pointer to a null terminated string. The caller +// must not modify or delete the string. It will be deallocated upon a call to +// TF_DeleteDeviceList. +// +// If index is out of bounds, an error code will be set in the status object, +// and a null pointer will be returned. +TF_CAPI_EXPORT extern const char* TF_DeviceListName(const TF_DeviceList* list, + int index, TF_Status*); + +// Retrieves the type of the device at the given index. +// +// The caller must not modify or delete the string. It will be deallocated upon +// a call to TF_DeleteDeviceList. +// +// If index is out of bounds, an error code will be set in the status object, +// and a null pointer will be returned. +TF_CAPI_EXPORT extern const char* TF_DeviceListType(const TF_DeviceList* list, + int index, TF_Status*); + +// Retrieve the amount of memory associated with a given device. +// +// If index is out of bounds, an error code will be set in the status object, +// and -1 will be returned. +TF_CAPI_EXPORT extern int64_t TF_DeviceListMemoryBytes( + const TF_DeviceList* list, int index, TF_Status*); // -------------------------------------------------------------------------- // Load plugins containing custom ops and kernels @@ -1043,19 +1249,19 @@ typedef struct TF_Library TF_Library; // The caller owns the library handle. // // On failure, place an error status in status and return NULL. 
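A sketch tying the TF_DeviceList accessors above together; the output format is illustrative, and each accessor reuses the same status object:

```c
#include <stdio.h>
#include "tensorflow/c/c_api.h"

/* Enumerate the devices visible to `session`. */
void PrintDevices(TF_Session* session, TF_Status* status) {
  TF_DeviceList* devices = TF_SessionListDevices(session, status);
  if (TF_GetCode(status) != TF_OK) return;
  int n = TF_DeviceListCount(devices);
  for (int i = 0; i < n; ++i) {
    printf("%s (%s): %lld bytes\n",
           TF_DeviceListName(devices, i, status),
           TF_DeviceListType(devices, i, status),
           (long long)TF_DeviceListMemoryBytes(devices, i, status));
  }
  TF_DeleteDeviceList(devices);  /* also invalidates the returned strings */
}
```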
-extern TF_Library* TF_LoadLibrary(const char* library_filename, -                                  TF_Status* status); +TF_CAPI_EXPORT extern TF_Library* TF_LoadLibrary(const char* library_filename, +                                                 TF_Status* status); // Get the OpList of OpDefs defined in the library pointed to by lib_handle. // // Returns a TF_Buffer. The memory pointed to by the result is owned by // lib_handle. The data in the buffer will be the serialized OpList proto for // ops defined in the library. -extern TF_Buffer TF_GetOpList(TF_Library* lib_handle); +TF_CAPI_EXPORT extern TF_Buffer TF_GetOpList(TF_Library* lib_handle); // Frees the memory associated with the library handle. // Does NOT unload the library. -extern void TF_DeleteLibraryHandle(TF_Library* lib_handle); +TF_CAPI_EXPORT extern void TF_DeleteLibraryHandle(TF_Library* lib_handle); // Get the OpList of all OpDefs defined in this address space. // Returns a TF_Buffer, ownership of which is transferred to the caller // (and can be freed using TF_DeleteBuffer). // // The data in the buffer will be the serialized OpList proto for ops registered // in this address space. -extern TF_Buffer* TF_GetAllOpList(); +TF_CAPI_EXPORT extern TF_Buffer* TF_GetAllOpList(); #ifdef __cplusplus } /* end extern "C" */ diff --git a/tensorflow/c/c_api_internal.h b/tensorflow/c/c_api_internal.h new file mode 100644 index 00000000000..f17ac26ad96 --- /dev/null +++ b/tensorflow/c/c_api_internal.h @@ -0,0 +1,119 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/c/c_api.h" + +#include <unordered_map> +#include <vector> + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/public/session.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/graph/graph_constructor.h" +#include "tensorflow/core/graph/node_builder.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/common_runtime/shape_refiner.h" + +// Internal structures used by the C API. These are likely to change and should +// not be depended on. + +struct TF_Status { + tensorflow::Status status; +}; + +struct TF_Tensor { + TF_DataType dtype; + tensorflow::TensorShape shape; + tensorflow::TensorBuffer* buffer; +}; + +struct TF_SessionOptions { + tensorflow::SessionOptions options; +}; + +struct TF_DeprecatedSession { + tensorflow::Session* session; +}; + +struct TF_Library { + void* lib_handle; + TF_Buffer op_list; +}; + +struct TF_Graph { + TF_Graph() + : graph(tensorflow::OpRegistry::Global()), + refiner(graph.versions().producer(), graph.op_registry()), + num_sessions(0), + delete_requested(false), + parent(nullptr), + parent_inputs(nullptr) {} + tensorflow::mutex mu; + tensorflow::Graph graph GUARDED_BY(mu); + + // Runs shape inference.
+ tensorflow::ShapeRefiner refiner GUARDED_BY(mu); + + // Maps from name of an operation to the Node* in 'graph'. + std::unordered_map<tensorflow::string, tensorflow::Node*> name_map +     GUARDED_BY(mu); + + // TF_Graph may only / must be deleted when + // num_sessions == 0 && delete_requested == true + + // num_sessions incremented by TF_NewSession, and decremented by + // TF_DeleteSession. + int num_sessions GUARDED_BY(mu); + bool delete_requested GUARDED_BY(mu); // set true by TF_DeleteGraph + + // Used to link graphs contained in TF_WhileParams to the parent graph that + // will eventually contain the full while loop. + TF_Graph* parent; + TF_Output* parent_inputs; +}; + +struct TF_OperationDescription { + TF_OperationDescription(TF_Graph* g, const char* op_type, +                         const char* node_name) +     : node_builder(node_name, op_type, g->graph.op_registry()), graph(g) {} + + tensorflow::NodeBuilder node_builder; + TF_Graph* graph; + std::vector<tensorflow::string> colocation_constraints; +}; + +struct TF_Operation { + tensorflow::Node node; +}; + +struct TF_Session { + TF_Session(tensorflow::Session* s, TF_Graph* g) +     : session(s), graph(g), last_num_graph_nodes(0) {} + tensorflow::Session* session; + TF_Graph* graph; + tensorflow::mutex mu; + int last_num_graph_nodes; +}; + +struct TF_ImportGraphDefOptions { + tensorflow::ImportGraphDefOptions opts; +}; + +struct TF_DeviceList { + std::vector<tensorflow::DeviceAttributes> response; +}; diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc index 5591409d99b..04540bd793d 100644 --- a/tensorflow/c/c_api_test.cc +++ b/tensorflow/c/c_api_test.cc @@ -23,6 +23,7 @@ limitations under the License. #include "tensorflow/cc/saved_model/tag_constants.h" #include "tensorflow/core/example/example.pb.h" #include "tensorflow/core/example/feature.pb.h" +#include "tensorflow/core/framework/common_shape_fns.h" #include "tensorflow/core/framework/graph.pb_text.h" #include "tensorflow/core/framework/node_def.pb_text.h" #include "tensorflow/core/framework/node_def_util.h" @@ -38,6 +39,7 @@ limitations under the License. #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/protobuf/meta_graph.pb.h" +#include "tensorflow/core/util/equal_graph_def.h" using tensorflow::int32; using tensorflow::string; @@ -105,6 +107,22 @@ TEST(CAPI, AllocateTensor) { TF_DeleteTensor(t); } +TEST(CAPI, MaybeMove) { + const int num_bytes = 6 * sizeof(float); + float* values = +     reinterpret_cast<float*>(tensorflow::cpu_allocator()->AllocateRaw( +         EIGEN_MAX_ALIGN_BYTES, num_bytes)); + int64_t dims[] = {2, 3}; + bool deallocator_called = false; + TF_Tensor* t = TF_NewTensor(TF_FLOAT, dims, 2, values, num_bytes, +                             &Deallocator, &deallocator_called); + + TF_Tensor* o = TF_TensorMaybeMove(t); + ASSERT_TRUE(o == nullptr); // It is unsafe to move memory TF might not own. + TF_DeleteTensor(t); + EXPECT_TRUE(deallocator_called); +} + TEST(CAPI, LibraryLoadFunctions) { // Load the library. TF_Status* status = TF_NewStatus(); @@ -261,6 +279,19 @@ static void Int32Deallocator(void* data, size_t, void* arg) { delete[] static_cast<int32*>(data); } +// Create a tensor with values of type TF_INT8 provided by `values`.
+static TF_Tensor* Int8Tensor(const int64_t* dims, int num_dims, + const char* values) { + int64_t num_values = 1; + for (int i = 0; i < num_dims; ++i) { + num_values *= dims[i]; + } + TF_Tensor* t = + TF_AllocateTensor(TF_INT8, dims, num_dims, sizeof(char) * num_values); + memcpy(TF_TensorData(t), values, sizeof(char) * num_values); + return t; +} + static TF_Tensor* Int32Tensor(int32 v) { const int num_bytes = sizeof(int32); int32* values = new int32[1]; @@ -269,29 +300,44 @@ static TF_Tensor* Int32Tensor(int32 v) { &Int32Deallocator, nullptr); } -TF_Operation* Placeholder(TF_Graph* graph, TF_Status* s) { - TF_OperationDescription* desc = TF_NewOperation(graph, "Placeholder", "feed"); +TF_Operation* Placeholder(TF_Graph* graph, TF_Status* s, + const char* name = "feed") { + TF_OperationDescription* desc = TF_NewOperation(graph, "Placeholder", name); TF_SetAttrType(desc, "dtype", TF_INT32); return TF_FinishOperation(desc, s); } -TF_Operation* ScalarConst(int32 v, TF_Graph* graph, TF_Status* s) { - unique_tensor_ptr tensor(Int32Tensor(v), TF_DeleteTensor); - TF_OperationDescription* desc = TF_NewOperation(graph, "Const", "scalar"); - TF_SetAttrTensor(desc, "value", tensor.get(), s); +TF_Operation* Const(TF_Tensor* t, TF_Graph* graph, TF_Status* s, + const char* name = "const") { + TF_OperationDescription* desc = TF_NewOperation(graph, "Const", name); + TF_SetAttrTensor(desc, "value", t, s); if (TF_GetCode(s) != TF_OK) return nullptr; - TF_SetAttrType(desc, "dtype", TF_INT32); + TF_SetAttrType(desc, "dtype", TF_TensorType(t)); return TF_FinishOperation(desc, s); } +TF_Operation* ScalarConst(int32 v, TF_Graph* graph, TF_Status* s, + const char* name = "scalar") { + unique_tensor_ptr tensor(Int32Tensor(v), TF_DeleteTensor); + return Const(tensor.get(), graph, s, name); +} + TF_Operation* Add(TF_Operation* l, TF_Operation* r, TF_Graph* graph, - TF_Status* s) { - TF_OperationDescription* desc = TF_NewOperation(graph, "AddN", "add"); + TF_Status* s, const char* name = "add") { + TF_OperationDescription* desc = TF_NewOperation(graph, "AddN", name); TF_Output add_inputs[2] = {{l, 0}, {r, 0}}; TF_AddInputList(desc, add_inputs, 2); return TF_FinishOperation(desc, s); } +TF_Operation* Add(TF_Output l, TF_Output r, TF_Graph* graph, TF_Status* s, + const char* name = "add") { + TF_OperationDescription* desc = TF_NewOperation(graph, "AddN", name); + TF_Output inputs[2] = {l, r}; + TF_AddInputList(desc, inputs, 2); + return TF_FinishOperation(desc, s); +} + TF_Operation* Neg(TF_Operation* n, TF_Graph* graph, TF_Status* s) { TF_OperationDescription* desc = TF_NewOperation(graph, "Neg", "neg"); TF_Output neg_input = {n, 0}; @@ -299,6 +345,14 @@ TF_Operation* Neg(TF_Operation* n, TF_Graph* graph, TF_Status* s) { return TF_FinishOperation(desc, s); } +TF_Operation* LessThan(TF_Output l, TF_Output r, TF_Graph* graph, + TF_Status* s) { + TF_OperationDescription* desc = TF_NewOperation(graph, "Less", "less_than"); + TF_AddInput(desc, l); + TF_AddInput(desc, r); + return TF_FinishOperation(desc, s); +} + bool IsPlaceholder(const NodeDef& node_def) { if (node_def.op() != "Placeholder" || node_def.name() != "feed") { return false; @@ -667,6 +721,28 @@ TEST(CAPI, Graph) { TF_DeleteStatus(s); } +/* +TODO(skyewm): this test currently DCHECKs, change to bad status + +TEST(CAPI, InputFromDifferentGraphError) { + TF_Status* s = TF_NewStatus(); + TF_Graph* g1 = TF_NewGraph(); + TF_Graph* g2 = TF_NewGraph(); + + TF_Operation* feed = Placeholder(g1, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + + // Attempt to 
create node in g2 with input from g1 + Neg(feed, g2, s); + EXPECT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s)); + EXPECT_STREQ("foo", TF_Message(s)); + + TF_DeleteGraph(g1); + TF_DeleteGraph(g2); + TF_DeleteStatus(s); +} +*/ + TEST(CAPI, ImportGraphDef) { TF_Status* s = TF_NewStatus(); TF_Graph* graph = TF_NewGraph(); @@ -765,6 +841,33 @@ TEST(CAPI, ImportGraphDef) { EXPECT_EQ(feed, control_inputs[0]); EXPECT_EQ(feed2, control_inputs[1]); + // Export to a graph def so we can import a graph with control dependencies + TF_DeleteBuffer(graph_def); + graph_def = TF_NewBuffer(); + TF_GraphToGraphDef(graph, graph_def, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + + // Import again, with remapped control dependency, into the same graph + TF_DeleteImportGraphDefOptions(opts); + opts = TF_NewImportGraphDefOptions(); + TF_ImportGraphDefOptionsSetPrefix(opts, "imported4"); + TF_ImportGraphDefOptionsRemapControlDependency(opts, "imported/feed", feed); + TF_GraphImportGraphDef(graph, graph_def, opts, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + + TF_Operation* scalar4 = +     TF_GraphOperationByName(graph, "imported4/imported3/scalar"); + TF_Operation* feed4 = +     TF_GraphOperationByName(graph, "imported4/imported2/feed"); + + // Check that imported `imported3/scalar` has remapped control dep from + // original graph and imported control dep + num_control_inputs = TF_OperationGetControlInputs( +     scalar4, control_inputs, TF_OperationNumControlInputs(scalar4)); + ASSERT_EQ(2, num_control_inputs); + EXPECT_EQ(feed, control_inputs[0]); + EXPECT_EQ(feed4, control_inputs[1]); + TF_DeleteImportGraphDefOptions(opts); TF_DeleteBuffer(graph_def); @@ -784,7 +887,7 @@ class CSession { TF_DeleteSessionOptions(opts); } - CSession(TF_Session* session) { session_ = session; } + explicit CSession(TF_Session* session) : session_(session) {} ~CSession() { TF_Status* s = TF_NewStatus(); @@ -793,8 +896,7 @@ class CSession { TF_DeleteStatus(s); } - void SetInputs( -     std::initializer_list<std::pair<TF_Operation*, TF_Tensor*>> inputs) { + void SetInputs(std::vector<std::pair<TF_Operation*, TF_Tensor*>> inputs) { DeleteInputValues(); inputs_.clear(); for (const auto& p : inputs) { @@ -811,6 +913,11 @@ class CSession { } } + void SetOutputs(const std::vector<TF_Output>& outputs) { + ResetOutputValues(); + outputs_ = outputs; + } + void SetTargets(std::initializer_list<TF_Operation*> targets) { targets_.clear(); for (TF_Operation* t : targets) { @@ -937,6 +1044,103 @@ TEST(CAPI, Session) { TF_DeleteStatus(s); } +TEST(CAPI, SessionPRun) { + TF_Status* s = TF_NewStatus(); + TF_Graph* graph = TF_NewGraph(); + + // Construct the graph: A + 2 + B + TF_Operation* a = Placeholder(graph, s, "A"); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + + TF_Operation* b = Placeholder(graph, s, "B"); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + + TF_Operation* two = ScalarConst(2, graph, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + + TF_Operation* plus2 = Add(a, two, graph, s, "plus2"); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + + TF_Operation* plusB = Add(plus2, b, graph, s, "plusB"); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + + // Setup a session and a partial run handle. The partial run will allow + // computation of A + 2 + B in two phases (calls to TF_SessionPRun): + // 1. Feed A and get (A+2) + // 2.
Feed B and get (A+2)+B + TF_SessionOptions* opts = TF_NewSessionOptions(); + TF_Session* sess = TF_NewSession(graph, opts, s); + TF_DeleteSessionOptions(opts); + + TF_Output feeds[] = {TF_Output{a, 0}, TF_Output{b, 0}}; + TF_Output fetches[] = {TF_Output{plus2, 0}, TF_Output{plusB, 0}}; + + const char* handle = nullptr; + TF_SessionPRunSetup(sess, feeds, TF_ARRAYSIZE(feeds), fetches, + TF_ARRAYSIZE(fetches), nullptr, 0, &handle, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + + // Feed A and fetch A + 2. + TF_Output feeds1[] = {TF_Output{a, 0}}; + TF_Output fetches1[] = {TF_Output{plus2, 0}}; + TF_Tensor* feedValues1[] = {Int32Tensor(1)}; + TF_Tensor* fetchValues1[1]; + TF_SessionPRun(sess, handle, feeds1, feedValues1, 1, fetches1, fetchValues1, + 1, nullptr, 0, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + EXPECT_EQ(3, *(static_cast(TF_TensorData(fetchValues1[0])))); + TF_DeleteTensor(feedValues1[0]); + TF_DeleteTensor(fetchValues1[0]); + + // Feed B and fetch (A + 2) + B. + TF_Output feeds2[] = {TF_Output{b, 0}}; + TF_Output fetches2[] = {TF_Output{plusB, 0}}; + TF_Tensor* feedValues2[] = {Int32Tensor(4)}; + TF_Tensor* fetchValues2[1]; + TF_SessionPRun(sess, handle, feeds2, feedValues2, 1, fetches2, fetchValues2, + 1, nullptr, 0, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + EXPECT_EQ(7, *(static_cast(TF_TensorData(fetchValues2[0])))); + TF_DeleteTensor(feedValues2[0]); + TF_DeleteTensor(fetchValues2[0]); + + // Clean up. + TF_DeletePRunHandle(handle); + TF_DeleteSession(sess, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_DeleteGraph(graph); + TF_DeleteStatus(s); +} + +TEST(CAPI, ShapeInferenceError) { + // TF_FinishOperation should fail if the shape of the added operation cannot + // be inferred. + TF_Status* status = TF_NewStatus(); + TF_Graph* graph = TF_NewGraph(); + + // Create this failure by trying to add two nodes with incompatible shapes + // (A tensor with shape [2] and a tensor with shape [3] cannot be added). + const char data[] = {1, 2, 3}; + const int64_t vec2_dims[] = {2}; + unique_tensor_ptr vec2_tensor( + Int8Tensor(vec2_dims, TF_ARRAYSIZE(vec2_dims), data), TF_DeleteTensor); + TF_Operation* vec2 = Const(vec2_tensor.get(), graph, status, "vec2"); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + const int64_t vec3_dims[] = {3}; + unique_tensor_ptr vec3_tensor( + Int8Tensor(vec3_dims, TF_ARRAYSIZE(vec3_dims), data), TF_DeleteTensor); + TF_Operation* vec3 = Const(vec3_tensor.get(), graph, status, "vec3"); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TF_Operation* add = Add(vec2, vec3, graph, status); + ASSERT_NE(TF_OK, TF_GetCode(status)); + ASSERT_TRUE(add == nullptr); + + TF_DeleteGraph(graph); + TF_DeleteStatus(status); +} + TEST(CAPI, ColocateWith) { TF_Status* s = TF_NewStatus(); TF_Graph* graph = TF_NewGraph(); @@ -1068,16 +1272,582 @@ TEST(CAPI, SavedModelNullArgsAreValid) { TF_DeleteStatus(s); } -// Create a tensor with values of type TF_INT8 provided by `values`. 
-TF_Tensor* Int8Tensor(const int64_t* dims, int num_dims, const char* values) { - int64_t num_values = 1; - for (int i = 0; i < num_dims; ++i) { -   num_values *= dims[i]; +class CApiWhileLoopTest : public ::testing::Test { + protected: + CApiWhileLoopTest() : s_(TF_NewStatus()), graph_(TF_NewGraph()) {} + + ~CApiWhileLoopTest() override { + TF_DeleteGraph(graph_); + TF_DeleteStatus(s_); } - TF_Tensor* t = -     TF_AllocateTensor(TF_INT8, dims, num_dims, sizeof(char) * num_values); - memcpy(TF_TensorData(t), values, sizeof(char) * num_values); - return t; + + void Init(int ninputs) { + DCHECK(inputs_.empty()); + DCHECK_GT(ninputs, 0); + + for (int i = 0; i < ninputs; ++i) { +   TF_Operation* placeholder = Placeholder( +       graph_, s_, ::tensorflow::strings::StrCat("p", i).c_str()); +   DCHECK_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); +   inputs_.push_back({placeholder, 0}); + } + + original_graph_description_ = GraphDebugString(); + + params_.reset(new TF_WhileParams( +     TF_NewWhile(graph_, &inputs_[0], inputs_.size(), s_))); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + ASSERT_EQ(original_graph_description_, GraphDebugString()) +     << "TF_NewWhile() altered graph"; + + params_->name = "test_loop"; + + // Initialize outputs_ so we can easily detect errors/bugs + outputs_.resize(ninputs, {nullptr, -1}); + } + + void ExpectOK() { + TF_FinishWhile(params_.get(), s_, &outputs_[0]); + EXPECT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + } + + void ExpectError(TF_Code expected_code, const string& expected_msg) { + TF_FinishWhile(params_.get(), s_, &outputs_[0]); + EXPECT_EQ(expected_code, TF_GetCode(s_)); + EXPECT_EQ(expected_msg, TF_Message(s_)); + // TODO(skyewm): this assert is currently broken. Fix or remove guarantee. + // ASSERT_EQ(original_graph_description_, GraphDebugString()) << + //     "TF_FinishWhile() altered graph on error"; + } + + void Run(std::initializer_list<int> input_values) { + DCHECK_EQ(inputs_.size(), input_values.size()); + std::vector<std::pair<TF_Operation*, TF_Tensor*>> inputs(inputs_.size()); + int i = 0; + for (int v : input_values) { +   inputs[i] = {inputs_[i].oper, Int32Tensor(v)}; +   ++i; + } + csession_.reset(new CSession(graph_, s_)); + csession_->SetInputs(inputs); + csession_->SetOutputs(outputs_); + csession_->Run(s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + } + + void ExpectOutputValue(int idx, int expected_value) { + TF_Tensor* out = csession_->output_tensor(idx); + ASSERT_TRUE(out != nullptr); + EXPECT_EQ(TF_INT32, TF_TensorType(out)); + EXPECT_EQ(0, TF_NumDims(out)); + ASSERT_EQ(sizeof(int32), TF_TensorByteSize(out)); + int32* data = static_cast<int32*>(TF_TensorData(out)); + EXPECT_EQ(expected_value, *data); + } + + // Create a valid conditional graph. Useful for testing unrelated errors.
+ void CreateCondGraph() { + TF_Operation* one = ScalarConst(1, params_->cond_graph, s_); + TF_Operation* less_than = +     LessThan(params_->cond_inputs[0], {one, 0}, params_->cond_graph, s_); + DCHECK_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + params_->cond_output = {less_than, 0}; + } + + string GraphDebugString() const { + TF_Buffer* buf = TF_NewBuffer(); + TF_GraphToGraphDef(graph_, buf, s_); + DCHECK_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + GraphDef def; + bool success = def.ParseFromArray(buf->data, buf->length); + DCHECK(success); + TF_DeleteBuffer(buf); + return def.DebugString(); + } + + TF_Status* s_; + TF_Graph* graph_; + std::vector<TF_Output> inputs_;   // The inputs to the while loop + std::vector<TF_Output> outputs_;  // The final outputs of the while loop + std::unique_ptr<TF_WhileParams> params_; + std::unique_ptr<CSession> csession_; + + private: + // Used to verify that errors don't change graph_ + string original_graph_description_; +}; + +TEST_F(CApiWhileLoopTest, BasicLoop) { + Init(2); + + // Validate TF_WhileParams returned by TF_NewWhile() + EXPECT_TRUE(params_->body_graph != nullptr); + EXPECT_TRUE(params_->cond_graph != nullptr); + + EXPECT_EQ(params_->ninputs, 2); + + ASSERT_TRUE(params_->cond_inputs != nullptr); + ASSERT_TRUE(params_->cond_inputs[0].oper != nullptr); + EXPECT_TRUE(params_->cond_inputs[1].oper != nullptr); + + ASSERT_TRUE(params_->body_inputs != nullptr); + EXPECT_TRUE(params_->body_inputs[0].oper != nullptr); + EXPECT_TRUE(params_->body_inputs[1].oper != nullptr); + + ASSERT_TRUE(params_->body_outputs != nullptr); + + // Create loop: while (input1 < input2) input1 += input2 + 1 + TF_Operation* less_than = +     LessThan(params_->cond_inputs[0], params_->cond_inputs[1], +              params_->cond_graph, s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + params_->cond_output = {less_than, 0}; + + TF_Operation* add1 = Add(params_->body_inputs[0], params_->body_inputs[1], +                          params_->body_graph, s_, "add1"); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + TF_Operation* one = ScalarConst(1, params_->body_graph, s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + TF_Operation* add2 = Add(add1, one, params_->body_graph, s_, "add2"); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + params_->body_outputs[0] = {add2, 0}; + params_->body_outputs[1] = params_->body_inputs[1]; + + // Finalize while loop + ExpectOK(); + + // Validate while loop outputs returned by TF_FinishWhile() + EXPECT_TRUE(outputs_[0].oper != nullptr); + EXPECT_GE(outputs_[0].index, 0); + EXPECT_TRUE(outputs_[1].oper != nullptr); + EXPECT_GE(outputs_[1].index, 0); + + // Run the graph + Run({-9, 2}); + ExpectOutputValue(0, 3); + ExpectOutputValue(1, 2); +} + +TEST_F(CApiWhileLoopTest, NestedLoop) { + Init(2); + // Create nested loop: + //   while (input1 < 6) { + //     inner_input1 = input1 + //     while (inner_input1 < 3) { + //       input2 += 1 + //       inner_input1 += 2 + //     } + //     input1 += input2 + //   } + // + // Expected execution with initial values input1 = input2 = 0: + // + // outer inner               inner_ + // step# step# input1 input2 input1 + // ------------------------------------ + //   0     0     0      0      0 + //   0     1     0      1      2 + //   0     2     0      2      4 + //   0     -     2      2      - + //   1     0     2      2      2 + //   1     1     2      3      4 + //   1     -     5      3      - + //   2     0     5      3      5 + //   2     -     8      3      - + + // Create outer cond graph + TF_Operation* six = ScalarConst(6, params_->cond_graph, s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + TF_Operation* less_than = +     LessThan(params_->cond_inputs[0], {six, 0}, params_->cond_graph, s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); +
params_->cond_output = {less_than, 0}; + + // Create outer body graph + // Init inner graph + TF_Output inner_inputs[] = {params_->body_inputs[0], params_->body_inputs[1]}; + TF_WhileParams inner_params = + TF_NewWhile(params_->body_graph, inner_inputs, 2, s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + inner_params.name = "inner_loop"; + + // Create inner cond graph + TF_Operation* three = ScalarConst(3, inner_params.cond_graph, s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + TF_Operation* inner_less_than = LessThan( + inner_params.cond_inputs[0], {three, 0}, inner_params.cond_graph, s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + inner_params.cond_output = {inner_less_than, 0}; + + // Create inner body graph + TF_Operation* one = ScalarConst(1, inner_params.body_graph, s_, "one"); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + TF_Operation* two = ScalarConst(2, inner_params.body_graph, s_, "two"); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + + TF_Operation* input2_add = + Add(inner_params.body_inputs[1].oper, one, inner_params.body_graph, s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + inner_params.body_outputs[1] = {input2_add, 0}; + + TF_Operation* inner_input1_add = Add(inner_params.body_inputs[0].oper, two, + inner_params.body_graph, s_, "add2"); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + inner_params.body_outputs[0] = {inner_input1_add, 0}; + + // Finalize inner graph + TF_Output inner_outputs[2] = {{nullptr, -1}}; + TF_FinishWhile(&inner_params, s_, inner_outputs); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + + TF_Operation* input1_add = + Add(params_->body_inputs[0], inner_outputs[1], params_->body_graph, s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + params_->body_outputs[0] = {input1_add, 0}; + + params_->body_outputs[1] = inner_outputs[1]; + + // Finalize outer graph + ExpectOK(); + + // Check for a few expected nodes + const char* node_name = "test_loop/cond/scalar"; + EXPECT_TRUE(TF_GraphOperationByName(graph_, node_name) != nullptr); + node_name = "test_loop/body/add"; + EXPECT_TRUE(TF_GraphOperationByName(graph_, node_name) != nullptr); + node_name = "test_loop/body/inner_loop/body/one"; + EXPECT_TRUE(TF_GraphOperationByName(graph_, node_name) != nullptr); + node_name = "test_loop/body/inner_loop/cond/less_than"; + EXPECT_TRUE(TF_GraphOperationByName(graph_, node_name) != nullptr); + + // Run the graph + Run({0, 0}); + ExpectOutputValue(0, 8); + ExpectOutputValue(1, 3); +} + +TEST_F(CApiWhileLoopTest, BadCondOutput) { + Init(1); + params_->body_outputs[0] = params_->body_inputs[0]; + ExpectError(TF_INVALID_ARGUMENT, + "TF_WhileParams `cond_output` field isn't set"); +} + +TEST_F(CApiWhileLoopTest, BadBodyOutput) { + Init(1); + CreateCondGraph(); + ExpectError(TF_INVALID_ARGUMENT, + "TF_WhileParams `body_outputs[0]` field isn't set"); +} + +TEST_F(CApiWhileLoopTest, NullName) { + Init(1); + CreateCondGraph(); + params_->body_outputs[0] = params_->body_inputs[0]; + params_->name = nullptr; + ExpectError(TF_INVALID_ARGUMENT, "TF_WhileParams `name` field is null"); +} + +TEST_F(CApiWhileLoopTest, WrongGraph) { + Init(1); + CreateCondGraph(); + // Set body output to output from outer graph + params_->body_outputs[0] = inputs_[0]; + // TODO(skyewm): improve error message + ExpectError(TF_INVALID_ARGUMENT, + "Requested return node 'p0' not found in graph def"); +} + +TEST_F(CApiWhileLoopTest, BadTypes) { + Init(1); + CreateCondGraph(); + // Op that has a float input + 
output + TF_OperationDescription* desc = TF_NewOperation( + params_->body_graph, "FakeQuantWithMinMaxArgs", "float_op"); + TF_AddInput(desc, params_->body_inputs[0]); + TF_FinishOperation(desc, s_); + ASSERT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s_)); + string msg(TF_Message(s_)); + EXPECT_NE(msg.find("Input 'inputs' passed int32 expected float while " + "building NodeDef 'float_op'"), + msg.npos); + TF_AbortWhile(params_.get()); +} + +REGISTER_OP("TestOpWithNoGradient") + .Input("x: T") + .Output("y: T") + .Attr("T: {float, double}") + .Doc(R"doc( +Test op with no grad registered. + +x: input +y: output +)doc") + .SetShapeFn(tensorflow::shape_inference::UnknownShape); + +class CApiGradientsTest : public ::testing::Test { + protected: + CApiGradientsTest() + : s_(TF_NewStatus()), + graph_(TF_NewGraph()), + expected_graph_(TF_NewGraph()) {} + + ~CApiGradientsTest() override { + TF_DeleteGraph(graph_); + TF_DeleteGraph(expected_graph_); + TF_DeleteStatus(s_); + } + + void TestGradientsSuccess(bool grad_inputs_provided) { + TF_Output inputs[2]; + TF_Output outputs[1]; + TF_Output grad_outputs[2]; + TF_Output expected_grad_outputs[2]; + + BuildSuccessGraph(inputs, outputs); + BuildExpectedGraph(grad_inputs_provided, expected_grad_outputs); + + AddGradients(grad_inputs_provided, inputs, 2, outputs, 1, grad_outputs); + + EXPECT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + + // Compare that the graphs match. + GraphDef expected_gdef; + GraphDef gdef; + EXPECT_TRUE(GetGraphDef(expected_graph_, &expected_gdef)); + EXPECT_TRUE(GetGraphDef(graph_, &gdef)); + TF_EXPECT_GRAPH_EQ(expected_gdef, gdef); + + // Compare that the output of the gradients of both graphs match. + RunGraphsAndCompareOutputs(grad_outputs, expected_grad_outputs); + } + + void TestGradientsError(bool grad_inputs_provided) { + TF_Output inputs[1]; + TF_Output outputs[1]; + TF_Output grad_outputs[1]; + + BuildErrorGraph(inputs, outputs); + + AddGradients(grad_inputs_provided, inputs, 1, outputs, 1, grad_outputs); + + string expected_msg = + "No gradient defined for op: TestOpWithNoGradient. Please see " + "https://www.tensorflow.org/code/" + "tensorflow/cc/gradients/README.md" + " for instructions on how to add C++ gradients."; + EXPECT_EQ(expected_msg, TF_Message(s_)); + } + + // Run the graph and ensure that the gradient values are as expected. 
+ void RunGraphsAndCompareOutputs(TF_Output* grad_outputs, + TF_Output* expected_grad_outputs) { + std::unique_ptr<CSession> csession(new CSession(graph_, s_)); + std::unique_ptr<CSession> expected_csession( + new CSession(expected_graph_, s_)); + + std::vector<TF_Output> grad_outputs_vec; + grad_outputs_vec.assign(grad_outputs, grad_outputs + 2); + csession->SetOutputs(grad_outputs_vec); + csession->Run(s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + TF_Tensor* out0 = csession->output_tensor(0); + TF_Tensor* out1 = csession->output_tensor(1); + + std::vector<TF_Output> expected_grad_outputs_vec; + expected_grad_outputs_vec.assign(expected_grad_outputs, + expected_grad_outputs + 2); + expected_csession->SetOutputs(expected_grad_outputs_vec); + expected_csession->Run(s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + TF_Tensor* expected_out0 = expected_csession->output_tensor(0); + TF_Tensor* expected_out1 = expected_csession->output_tensor(1); + + CompareTensors(out0, expected_out0); + CompareTensors(out1, expected_out1); + } + + void CompareTensors(TF_Tensor* a, TF_Tensor* b) { + float* a_data = static_cast<float*>(TF_TensorData(a)); + float* b_data = static_cast<float*>(TF_TensorData(b)); + EXPECT_EQ(*a_data, *b_data); + } + + void AddGradients(bool grad_inputs_provided, TF_Output* inputs, int ninputs, + TF_Output* outputs, int noutputs, TF_Output* grad_outputs) { + if (grad_inputs_provided) { + TF_Output grad_inputs[1]; + const float grad_inputs_val[] = {1.0, 1.0, 1.0, 1.0}; + TF_Operation* grad_inputs_op = + FloatConst2x2(graph_, s_, grad_inputs_val, "GradInputs"); + grad_inputs[0] = TF_Output{grad_inputs_op, 0}; + TF_AddGradients(graph_, outputs, noutputs, inputs, ninputs, grad_inputs, + s_, grad_outputs); + } else { + TF_AddGradients(graph_, outputs, noutputs, inputs, ninputs, nullptr, s_, + grad_outputs); + } + } + + void BuildErrorGraph(TF_Output* inputs, TF_Output* outputs) { + const float const0_val[] = {1.0, 2.0, 3.0, 4.0}; + TF_Operation* const0 = FloatConst2x2(graph_, s_, const0_val, "Const_0"); + TF_Operation* nograd = NoGradientOp(graph_, s_, const0, "NoGrad"); + inputs[0] = TF_Output{const0, 0}; + outputs[0] = TF_Output{nograd, 0}; + EXPECT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + } + + void BuildSuccessGraph(TF_Output* inputs, TF_Output* outputs) { + // Construct the following graph: + // | + // z| + // | + // MatMul + // / \ + // ^ ^ + // | | + // x| y| + // | | + // | | + // Const_0 Const_1 + // + const float const0_val[] = {1.0, 2.0, 3.0, 4.0}; + const float const1_val[] = {1.0, 0.0, 0.0, 1.0}; + TF_Operation* const0 = FloatConst2x2(graph_, s_, const0_val, "Const_0"); + TF_Operation* const1 = FloatConst2x2(graph_, s_, const1_val, "Const_1"); + TF_Operation* matmul = MatMul(graph_, s_, const0, const1, "MatMul"); + inputs[0] = TF_Output{const0, 0}; + inputs[1] = TF_Output{const1, 0}; + outputs[0] = TF_Output{matmul, 0}; + EXPECT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + } + + void BuildExpectedGraph(bool grad_inputs_provided, + TF_Output* expected_grad_outputs) { + // The expected graph looks like this if grad_inputs_provided. + // If grad_inputs_provided is false, Const_0 will be a OnesLike op.
+ // ^ ^ + // dy| dx| // MatMul Gradient Graph + // | | + // MatMul_2 MatMul_1 + // ^ ^ ^ ^ + // | |----------| | + // | ^ | + // | dz| | + // | | | + // | Const_3 | + // | | + // | ^ | + // | z| | // MatMul Forward Graph + // | | | + // | MatMul | + // | / \ | + // | ^ ^ | + // | | | | + // |---x| y|----| + // | | + // | | + // Const_0 Const_1 + // + const float const0_val[] = {1.0, 2.0, 3.0, 4.0}; + const float const1_val[] = {1.0, 0.0, 0.0, 1.0}; + TF_Operation* const0 = + FloatConst2x2(expected_graph_, s_, const0_val, "Const_0"); + TF_Operation* const1 = + FloatConst2x2(expected_graph_, s_, const1_val, "Const_1"); + TF_Operation* matmul = + MatMul(expected_graph_, s_, const0, const1, "MatMul"); + + TF_Operation* const3; + if (grad_inputs_provided) { + const float const3_val[] = {1.0, 1.0, 1.0, 1.0}; + const3 = FloatConst2x2(expected_graph_, s_, const3_val, "GradInputs"); + } else { + const3 = OnesLike(expected_graph_, s_, matmul, "OnesLike"); + } + + TF_Operation* matmul1 = + MatMul(expected_graph_, s_, const3, const1, "MatMul_1", false, true); + TF_Operation* matmul2 = + MatMul(expected_graph_, s_, const0, const3, "MatMul_2", true, false); + expected_grad_outputs[0] = {matmul1, 0}; + expected_grad_outputs[1] = {matmul2, 0}; + } + + TF_Tensor* FloatTensor2x2(const float* values) { + const int64_t dims[2] = {2, 2}; + TF_Tensor* t = TF_AllocateTensor(TF_FLOAT, dims, 2, sizeof(float) * 4); + memcpy(TF_TensorData(t), values, sizeof(float) * 4); + return t; + } + + TF_Operation* FloatConst2x2(TF_Graph* graph, TF_Status* s, + const float* values, const char* name) { + unique_tensor_ptr tensor(FloatTensor2x2(values), TF_DeleteTensor); + TF_OperationDescription* desc = TF_NewOperation(graph, "Const", name); + TF_SetAttrTensor(desc, "value", tensor.get(), s); + if (TF_GetCode(s) != TF_OK) return nullptr; + TF_SetAttrType(desc, "dtype", TF_FLOAT); + TF_Operation* op = TF_FinishOperation(desc, s); + EXPECT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + return op; + } + + TF_Operation* MatMul(TF_Graph* graph, TF_Status* s, TF_Operation* l, + TF_Operation* r, const char* name, + bool transpose_a = false, bool transpose_b = false) { + TF_OperationDescription* desc = TF_NewOperation(graph, "MatMul", name); + if (transpose_a) { + TF_SetAttrBool(desc, "transpose_a", 1); + } + if (transpose_b) { + TF_SetAttrBool(desc, "transpose_b", 1); + } + TF_AddInput(desc, {l, 0}); + TF_AddInput(desc, {r, 0}); + TF_Operation* op = TF_FinishOperation(desc, s); + EXPECT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + return op; + } + + TF_Operation* OnesLike(TF_Graph* graph, TF_Status* s, TF_Operation* in, + const char* name) { + TF_OperationDescription* desc = TF_NewOperation(graph, "OnesLike", name); + TF_AddInput(desc, {in, 0}); + TF_Operation* op = TF_FinishOperation(desc, s); + EXPECT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + return op; + } + + TF_Operation* NoGradientOp(TF_Graph* graph, TF_Status* s, TF_Operation* in, + const char* name) { + TF_OperationDescription* desc = + TF_NewOperation(graph, "TestOpWithNoGradient", name); + TF_AddInput(desc, {in, 0}); + TF_Operation* op = TF_FinishOperation(desc, s); + EXPECT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + return op; + } + + TF_Status* s_; + TF_Graph* graph_; + TF_Graph* expected_graph_; +}; + +TEST_F(CApiGradientsTest, Gradients_GradInputs) { TestGradientsSuccess(true); } + +TEST_F(CApiGradientsTest, Gradients_NoGradInputs) { + TestGradientsSuccess(false); +} + +TEST_F(CApiGradientsTest, OpWithNoGradientRegistered_GradInputs) { + TestGradientsError(true); +} + 
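For orientation, the raw call pattern these gradient tests exercise can be sketched outside the fixture as below. This is an illustrative fragment only, not part of the patch: `FloatScalar` and `AddSquareGradient` are hypothetical names, and error handling is trimmed to status checks.

```c++
#include "tensorflow/c/c_api.h"

// Hypothetical helper: builds a scalar float Const op in `g`.
TF_Operation* FloatScalar(TF_Graph* g, TF_Status* s, float v) {
  TF_Tensor* t = TF_AllocateTensor(TF_FLOAT, /*dims=*/nullptr, /*num_dims=*/0,
                                   sizeof(float));
  *static_cast<float*>(TF_TensorData(t)) = v;
  TF_OperationDescription* desc = TF_NewOperation(g, "Const", "x");
  TF_SetAttrTensor(desc, "value", t, s);
  TF_SetAttrType(desc, "dtype", TF_FLOAT);
  TF_Operation* op = TF_FinishOperation(desc, s);
  TF_DeleteTensor(t);
  return op;
}

// Builds y = Square(x) and requests dy/dx, mirroring AddGradients above.
void AddSquareGradient(TF_Graph* g, TF_Status* s) {
  TF_Operation* x = FloatScalar(g, s, 3.0f);
  TF_OperationDescription* desc = TF_NewOperation(g, "Square", "y");
  TF_AddInput(desc, {x, 0});
  TF_Operation* y = TF_FinishOperation(desc, s);
  if (TF_GetCode(s) != TF_OK) return;

  TF_Output y_out = {y, 0};
  TF_Output x_out = {x, 0};
  TF_Output dx;
  // A null seed-gradient array starts backprop from ones, which is what the
  // *_NoGradInputs tests rely on.
  TF_AddGradients(g, &y_out, 1, &x_out, 1, /*dx=*/nullptr, s, &dx);
}
```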
+TEST_F(CApiGradientsTest, OpWithNoGradientRegistered_NoGradInputs) { + TestGradientsError(false); +} void StringVectorToArrays(const std::vector<string>& v, @@ -1095,9 +1865,13 @@ void StringVectorToArrays(const std::vector<string>& v, // Registers two ops, each with a single attribute called 'v'. // The attribute in one op will have a type 'type', the other // will have list(type). -#define ATTR_TEST_REGISTER_OP(type) \ - REGISTER_OP("CApiAttributesTestOp" #type).Attr("v: " #type); \ - REGISTER_OP("CApiAttributesTestOpList" #type).Attr("v: list(" #type ")") +#define ATTR_TEST_REGISTER_OP(type) \ + REGISTER_OP("CApiAttributesTestOp" #type) \ + .Attr("v: " #type) \ + .SetShapeFn(tensorflow::shape_inference::UnknownShape); \ + REGISTER_OP("CApiAttributesTestOpList" #type) \ + .Attr("v: list(" #type ")") \ + .SetShapeFn(tensorflow::shape_inference::UnknownShape) ATTR_TEST_REGISTER_OP(string); ATTR_TEST_REGISTER_OP(int); ATTR_TEST_REGISTER_OP(float); @@ -1504,8 +2278,8 @@ TEST_F(CApiAttributesTest, TensorList) { EXPECT_EQ(TF_INT8, TF_TensorType(v)) << i; EXPECT_EQ(tensor_ndims[i], TF_NumDims(v)) << i; for (int j = 0; j < TF_NumDims(v); ++j) { - EXPECT_EQ(tensor_dims[i][j], TF_Dim(v, j)) << "Tensor #" << i - << ", dimension #" << j; + EXPECT_EQ(tensor_dims[i][j], TF_Dim(v, j)) + << "Tensor #" << i << ", dimension #" << j; } EXPECT_EQ(sizeof(char) * tensor_size[i], TF_TensorByteSize(v)) << i; EXPECT_EQ(0, diff --git a/tensorflow/c/checkpoint_reader.cc b/tensorflow/c/checkpoint_reader.cc index 17b3f93193d..e7b9bca5b50 100644 --- a/tensorflow/c/checkpoint_reader.cc +++ b/tensorflow/c/checkpoint_reader.cc @@ -58,6 +58,7 @@ CheckpointReader::CheckpointReader(const string& filename, CheckpointReader::~CheckpointReader() { delete var_to_shape_map_ptr_; delete reader_; + delete v2_reader_; } bool CheckpointReader::HasTensor(const string& name) const { diff --git a/tensorflow/c/exported_symbols.lds b/tensorflow/c/exported_symbols.lds new file mode 100644 index 00000000000..a14bdaa48be --- /dev/null +++ b/tensorflow/c/exported_symbols.lds @@ -0,0 +1 @@ +_TF_* diff --git a/tensorflow/c/generate-pc.sh b/tensorflow/c/generate-pc.sh new file mode 100755 index 00000000000..02a6a58b615 --- /dev/null +++ b/tensorflow/c/generate-pc.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +TF_PREFIX='/usr/local' + +usage() { + echo "Usage: $0 OPTIONS" + echo -e "-p, --prefix\tset installation prefix (default: /usr/local)" + echo -e "-v, --version\tset TensorFlow version" + echo -e "-h, --help\tdisplay this message" +} + +[ $# == 0 ] && usage && exit 0 + +# read the options +ARGS=$(getopt -o p:v:h --long prefix:,version:,help -n $0 -- "$@") +eval set -- "$ARGS" + +# extract options and their arguments into variables.
+while true ; do + case "$1" in + -h|--help) usage ; exit ;; + -p|--prefix) + case "$2" in + "") shift 2 ;; + *) TF_PREFIX=$2 ; shift 2 ;; + esac ;; + -v|--version) + case "$2" in + "") shift 2 ;; + *) TF_VERSION=$2 ; shift 2 ;; + esac ;; + --) shift ; break ;; + *) echo "Internal error! Try '$0 --help' for more information." ; exit 1 ;; + esac +done + +[ -z $TF_VERSION ] && echo "Specify a version using -v or --version" && exit 1 + +echo "Generating pkgconfig file for TensorFlow $TF_VERSION in $TF_PREFIX" + +cat << EOF > tensorflow.pc +prefix=${TF_PREFIX} +exec_prefix=\${prefix} +libdir=\${exec_prefix}/lib +includedir=\${prefix}/include + +Name: TensorFlow +Version: ${TF_VERSION} +Description: Library for computation using data flow graphs for scalable machine learning +Requires: +Libs: -L\${libdir} -ltensorflow +Cflags: -I\${includedir} +EOF diff --git a/tensorflow/c/version_script.lds b/tensorflow/c/version_script.lds new file mode 100644 index 00000000000..455bd7362bb --- /dev/null +++ b/tensorflow/c/version_script.lds @@ -0,0 +1,9 @@ +VERS_1.0 { + # Export symbols in c_api.h. + global: + TF_*; + + # Hide everything else. + local: + *; +}; diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD index 054278bbf77..f89cc6384b3 100644 --- a/tensorflow/cc/BUILD +++ b/tensorflow/cc/BUILD @@ -8,8 +8,6 @@ package( licenses(["notice"]) # Apache 2.0 -exports_files(["LICENSE"]) - load( "//tensorflow:tensorflow.bzl", "tf_cc_test", @@ -36,6 +34,7 @@ cc_library( tf_cc_test( name = "framework_gradients_test", + size = "small", srcs = ["framework/gradients_test.cc"], deps = [ ":cc_ops", @@ -44,8 +43,8 @@ tf_cc_test( ":gradients", ":testutil", "//tensorflow/core:all_kernels", - "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", + "//tensorflow/core:framework_internal", "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core:testlib", @@ -59,7 +58,6 @@ cc_library( deps = [ ":cc_ops", ":client_session", - ":grad_op_registry", ":gradients", ":ops", ":scope", @@ -72,6 +70,7 @@ cc_library( tf_cc_test( name = "framework_gradient_checker_test", + size = "small", srcs = ["framework/gradient_checker_test.cc"], deps = [ ":cc_ops", @@ -80,8 +79,8 @@ tf_cc_test( ":gradient_checker", ":testutil", "//tensorflow/core:all_kernels", - "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", + "//tensorflow/core:framework_internal", "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core:testlib", @@ -93,6 +92,7 @@ cc_library( deps = [ ":array_grad", ":math_grad", + ":nn_grad", ], ) @@ -124,7 +124,10 @@ cc_library_with_android_deps( cc_library_with_android_deps( name = "scope", - srcs = ["framework/scope.cc"], + srcs = [ + "framework/scope.cc", + "framework/scope_internal.h", + ], hdrs = ["framework/scope.h"], android_deps = ["//tensorflow/core:android_tensorflow_lib"], common_deps = [ @@ -138,8 +141,18 @@ cc_library_with_android_deps( ], ) +cc_library_with_android_deps( + name = "scope_internal", + hdrs = ["framework/scope_internal.h"], + common_deps = [ + ":scope", + ], + deps = [], +) + tf_cc_test( name = "framework_scope_test", + size = "small", srcs = ["framework/scope_test.cc"], deps = [ ":ops", @@ -169,6 +182,7 @@ cc_library_with_android_deps( tf_cc_test( name = "client_client_session_test", + size = "small", srcs = ["client/client_session_test.cc"], deps = [ ":cc_ops", @@ -203,6 +217,7 @@ cc_library_with_android_deps( tf_cc_test( name = "ops_const_op_test", + size = "small", srcs = ["ops/const_op_test.cc"], deps = [ ":const_op", @@ -231,11 
+246,13 @@ cc_library( ":cc_ops_internal", ":grad_op_registry", ":gradients", + "//tensorflow/core:lib_proto_parsing", ], ) tf_cc_test( name = "gradients_array_grad_test", + size = "small", srcs = ["gradients/array_grad_test.cc"], deps = [ ":array_grad", @@ -266,6 +283,7 @@ cc_library( tf_cc_test( name = "gradients_math_grad_test", + size = "small", srcs = ["gradients/math_grad_test.cc"], deps = [ ":cc_ops", @@ -296,6 +314,7 @@ cc_library( tf_cc_test( name = "gradients_nn_grad_test", + size = "small", srcs = ["gradients/nn_grad_test.cc"], deps = [ ":cc_ops", @@ -315,6 +334,7 @@ tf_gen_op_wrappers_cc( name = "cc_ops", op_lib_names = [ "array_ops", + "audio_ops", "candidate_sampling_ops", "control_flow_ops", "data_flow_ops", @@ -343,6 +363,7 @@ tf_gen_op_wrappers_cc( tf_cc_test( name = "framework_cc_ops_test", + size = "small", srcs = ["framework/cc_ops_test.cc"], deps = [ ":cc_ops", @@ -376,6 +397,34 @@ tf_gen_op_wrappers_cc( visibility = ["//tensorflow:internal"], ) +tf_gen_op_wrappers_cc( + name = "functional_ops", + include_internal_ops = 1, + op_lib_names = [ + "functional_ops", + ], + pkg = "//tensorflow/core", + visibility = ["//tensorflow:internal"], +) + +tf_gen_op_wrappers_cc( + name = "resource_variable_ops", + include_internal_ops = 1, + op_lib_names = [ + "resource_variable_ops", + ], + pkg = "//tensorflow/core", + visibility = ["//tensorflow:internal"], +) + +tf_gen_op_wrappers_cc( + name = "remote_fused_graph_ops", + op_lib_names = [ + "remote_fused_graph_ops", + ], + pkg = "//tensorflow/core", +) + cc_library_with_android_deps( name = "cc_op_gen_main", srcs = [ @@ -414,7 +463,6 @@ cc_library( ":client_session", ":ops", ":scope", - "//tensorflow/core:all_kernels", "//tensorflow/core:core_cpu", "//tensorflow/core:lib_internal", "//tensorflow/core:tensorflow", @@ -433,13 +481,25 @@ cc_binary( name = "tutorials_example_trainer", srcs = ["tutorials/example_trainer.cc"], copts = tf_copts(), - linkopts = [ - "-lpthread", - "-lm", - ], + linkopts = select({ + "//tensorflow:windows": [], + "//tensorflow:windows_msvc": [], + "//tensorflow:darwin": [ + "-lm", + "-lpthread", + ], + "//tensorflow:ios": [ + "-lm", + "-lpthread", + ], + "//conditions:default": [ + "-lm", + "-lpthread", + "-lrt", + ], + }), deps = [ ":cc_ops", - "//tensorflow/core:all_kernels", "//tensorflow/core:core_cpu", "//tensorflow/core:framework", "//tensorflow/core:lib", @@ -471,7 +531,6 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", - "//tensorflow/core:tensorflow", "//tensorflow/core/kernels:ops_util", ], ) @@ -512,6 +571,7 @@ cc_library( tf_cc_test( name = "coordinator_test", + size = "small", srcs = ["training/coordinator_test.cc"], deps = [ ":cc_ops", diff --git a/tensorflow/cc/client/client_session.cc b/tensorflow/cc/client/client_session.cc index 644409203c1..ba056a8f3a8 100644 --- a/tensorflow/cc/client/client_session.cc +++ b/tensorflow/cc/client/client_session.cc @@ -16,32 +16,55 @@ limitations under the License. 
#include "tensorflow/cc/client/client_session.h" #include +#include #include #include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/protobuf/config.pb.h" #include "tensorflow/core/public/session.h" #include "tensorflow/core/public/session_options.h" namespace tensorflow { +class ClientSession::Impl { + private: + friend class ClientSession; + + Impl(Session* session, std::shared_ptr graph) + : session_(session), graph_(std::move(graph)) {} + + static SessionOptions MakeDefaultSessionOptions(const string& target); + Status MaybeExtendGraph() const; + + std::unique_ptr session_; + std::shared_ptr graph_; + + mutable mutex mu_; + mutable int last_num_graph_nodes_ GUARDED_BY(mu_) = 0; +}; + ClientSession::ClientSession(const Scope& scope, const string& target) - : ClientSession(scope, MakeDefaultSessionOptions(target)) {} + : ClientSession(scope, Impl::MakeDefaultSessionOptions(target)) {} ClientSession::ClientSession(const Scope& scope) : ClientSession(scope, "") {} ClientSession::ClientSession(const Scope& scope, - const SessionOptions& session_options) - : graph_(scope.graph_as_shared_ptr()) { + const SessionOptions& session_options) { Session* new_session; Status status = NewSession(session_options, &new_session); TF_CHECK_OK(status) << status; - session_.reset(new_session); - CHECK_NOTNULL(session_.get()); + impl_.reset(new Impl(new_session, scope.graph_as_shared_ptr())); + CHECK_NOTNULL(impl()->session_.get()); } -SessionOptions ClientSession::MakeDefaultSessionOptions( - const string& target) const { +// Define destructor here so we can forward declare `Impl` in client_session.h. +// If we define a dtor in the header file or use the default dtor, +// unique_ptr needs the complete type. +ClientSession::~ClientSession() {} + +SessionOptions ClientSession::Impl::MakeDefaultSessionOptions( + const string& target) { SessionOptions options; options.env = Env::Default(); options.target = target; @@ -67,7 +90,7 @@ Status ClientSession::Run(const FeedType& inputs, nullptr); } -Status ClientSession::MaybeExtendGraph() const { +Status ClientSession::Impl::MaybeExtendGraph() const { mutex_lock l(mu_); int num_nodes = graph_->num_node_ids(); if (num_nodes > last_num_graph_nodes_) { @@ -90,16 +113,18 @@ Status ClientSession::Run(const RunOptions& run_options, const FeedType& inputs, feeds.emplace_back(feed.first.name(), feed.second.tensor); } std::vector output_tensor_names; + output_tensor_names.reserve(fetch_outputs.size()); for (auto const& output : fetch_outputs) { output_tensor_names.push_back(output.name()); } std::vector target_node_names; + target_node_names.reserve(run_outputs.size()); for (auto const& output : run_outputs) { target_node_names.push_back(output.node()->name()); } - TF_RETURN_IF_ERROR(MaybeExtendGraph()); - return session_->Run(run_options, feeds, output_tensor_names, - target_node_names, outputs, run_metadata); + TF_RETURN_IF_ERROR(impl()->MaybeExtendGraph()); + return impl()->session_->Run(run_options, feeds, output_tensor_names, + target_node_names, outputs, run_metadata); } } // end namespace tensorflow diff --git a/tensorflow/cc/client/client_session.h b/tensorflow/cc/client/client_session.h index 28ff3ec9641..5fb4109f7d1 100644 --- a/tensorflow/cc/client/client_session.h +++ b/tensorflow/cc/client/client_session.h @@ -23,14 +23,13 @@ limitations under the License. 
#include "tensorflow/cc/framework/ops.h" #include "tensorflow/cc/framework/scope.h" -#include "tensorflow/core/platform/macros.h" -#include "tensorflow/core/platform/mutex.h" -#include "tensorflow/core/protobuf/config.pb.h" -#include "tensorflow/core/public/session.h" #include "tensorflow/core/public/session_options.h" namespace tensorflow { +/// @addtogroup core +/// @{ + /// A `ClientSession` object lets the caller drive the evaluation of the /// TensorFlow graph constructed with the C++ API. /// @@ -64,6 +63,8 @@ class ClientSession { /// Create a new session, configuring it with `session_options`. ClientSession(const Scope& scope, const SessionOptions& session_options); + ~ClientSession(); + /// Evaluate the tensors in `fetch_outputs`. The values are returned as /// `Tensor` objects in `outputs`. The number and order of `outputs` will /// match `fetch_outputs`. @@ -89,18 +90,14 @@ class ClientSession { // TODO(keveman): Add support for partial run. private: - SessionOptions MakeDefaultSessionOptions(const string& target) const; - Status MaybeExtendGraph() const; - - std::unique_ptr session_; - std::shared_ptr graph_; - - mutable mutex mu_; - mutable int last_num_graph_nodes_ GUARDED_BY(mu_) = 0; - - TF_DISALLOW_COPY_AND_ASSIGN(ClientSession); + class Impl; + std::unique_ptr impl_; + Impl* impl() { return impl_.get(); } + const Impl* impl() const { return impl_.get(); } }; +/// @} + } // end namespace tensorflow #endif // TENSORFLOW_CC_CLIENT_CLIENT_SESSION_H_ diff --git a/tensorflow/cc/client/client_session_test.cc b/tensorflow/cc/client/client_session_test.cc index 9c0f00f2b12..dfbac9788e1 100644 --- a/tensorflow/cc/client/client_session_test.cc +++ b/tensorflow/cc/client/client_session_test.cc @@ -49,7 +49,7 @@ TEST(ClientSessionTest, Feed) { TEST(ClientSessionTest, Extend) { Scope root = Scope::NewRootScope(); - auto a = Placeholder(root, DT_INT32); + auto a = Placeholder(root, DT_INT32, Placeholder::Shape({2})); auto c = Add(root, a, {2, 2}); ClientSession session(root); std::vector outputs; diff --git a/tensorflow/cc/framework/cc_op_gen.cc b/tensorflow/cc/framework/cc_op_gen.cc index 5f85d8c5edf..71aa986f918 100644 --- a/tensorflow/cc/framework/cc_op_gen.cc +++ b/tensorflow/cc/framework/cc_op_gen.cc @@ -57,6 +57,16 @@ string GetPath(const string& dot_h_fname) { return result; } +// Converts: some/path/to/file.xx +// to: file +// (note that suffix is removed) +string GetFilename(const string& path) { + size_t slash_pos = path.rfind('/'); + if (slash_pos == path.npos) slash_pos = -1; + size_t dot_pos = path.rfind('.'); + return path.substr(slash_pos + 1, dot_pos - (slash_pos + 1)); +} + // Converts: // cc/ops/gen_foo_ops.h // to: @@ -77,6 +87,17 @@ string ToGuard(const string& path) { return guard; } +// Converts: some_name_xyz +// to: Some Name Xyz +string ToTitle(const string& name) { + string title = name; + for (int i = 0; i < title.size(); ++i) { + if (title[i] == '_') title[i] = ' '; + } + str_util::TitlecaseString(&title, " "); + return title; +} + // Change: Into: // ABC /// ABC // /// @@ -105,7 +126,11 @@ string PrintString(const string& str) { return strings::StrCat("\"", str_util::CEscape(str), "\""); } -string PrintTensorShape(const TensorShape& shape) { +string PrintTensorShape(const TensorShapeProto& shape_proto) { + PartialTensorShape shape(shape_proto); + if (shape.IsIdenticalTo(PartialTensorShape())) { + return "::tensorflow::PartialTensorShape() /* unknown */"; + } string ret = "{"; for (int d = 0; d < shape.dims(); ++d) { if (d > 0) strings::StrAppend(&ret, ", "); 
@@ -167,7 +192,13 @@ string PrintTensor(const TensorProto& tensor_proto) { } } -string PrintAttrValue(string op, const AttrValue& attr_value) { +string PrintTensorProto(const TensorProto& proto) { + return strings::StrCat("Input::Initializer(", "{", PrintTensor(proto), "}, ", + PrintTensorShape(proto.tensor_shape()), + ").AsTensorProto()"); +} + +string PrintAttrValue(const string& op, const AttrValue& attr_value) { switch (attr_value.value_case()) { case AttrValue::kS: return PrintString(attr_value.s()); @@ -182,12 +213,9 @@ string PrintAttrValue(string op, const AttrValue& attr_value) { case AttrValue::kType: return EnumName_DataType(attr_value.type()); case AttrValue::kShape: - return PrintTensorShape(TensorShape(attr_value.shape())); + return PrintTensorShape(attr_value.shape()); case AttrValue::kTensor: - return strings::StrCat( - "Input::Initializer(", "{", PrintTensor(attr_value.tensor()), "}, ", - PrintTensorShape(TensorShape(attr_value.tensor().tensor_shape())), - ").AsTensorProto()"); + return PrintTensorProto(attr_value.tensor()); case AttrValue::kList: { string ret = "{"; if (attr_value.list().s_size() > 0) { @@ -220,8 +248,14 @@ string PrintAttrValue(string op, const AttrValue& attr_value) { } else if (attr_value.list().shape_size() > 0) { for (int i = 0; i < attr_value.list().shape_size(); ++i) { if (i > 0) strings::StrAppend(&ret, ", "); - strings::StrAppend( - &ret, PrintTensorShape(TensorShape(attr_value.list().shape(i)))); + strings::StrAppend(&ret, + PrintTensorShape(attr_value.list().shape(i))); + } + } else if (attr_value.list().tensor_size() > 0) { + for (int i = 0; i < attr_value.list().tensor_size(); ++i) { + if (i > 0) strings::StrAppend(&ret, ", "); + strings::StrAppend(&ret, + PrintTensorProto(attr_value.list().tensor(i))); } } strings::StrAppend(&ret, "}"); @@ -271,8 +305,8 @@ std::pair<StringPiece, bool> AttrTypeName(StringPiece attr_type) { {"list(bool)", {"gtl::ArraySlice<bool>", true}}, {"type", {"DataType", false}}, {"list(type)", {"DataTypeSlice", true}}, - {"shape", {"TensorShape", false}}, - {"list(shape)", {"gtl::ArraySlice<TensorShape>", true}}, + {"shape", {"PartialTensorShape", false}}, + {"list(shape)", {"gtl::ArraySlice<PartialTensorShape>", true}}, {"tensor", {"TensorProto", true}}, {"list(tensor)", {"gtl::ArraySlice<TensorProto>", true}}, {"func", {"NameAttrList", true}}, @@ -416,6 +450,7 @@ OpInfo::OpInfo(const OpDef& g_op_def, const OpDef& i_op_def, } strings::StrAppend(&comment, "\nArguments:\n* scope: A Scope object\n"); + // Process inputs for (int i = 0; i < op_def.input_arg_size(); ++i) { const auto& arg(op_def.input_arg(i)); arg_types.push_back(strings::StrCat( @@ -430,30 +465,45 @@ OpInfo::OpInfo(const OpDef& g_op_def, const OpDef& i_op_def, arg.description(), "\n"); } } + + // Process attrs + string required_attrs_comment; + string optional_attrs_comment; for (int i = 0; i < op_def.attr_size(); ++i) { const auto& attr(op_def.attr(i)); - // If the attr is going to be inferred or is optional, don't add it as a - // required argument. - if ((inferred_input_attrs.find(attr.name()) != - inferred_input_attrs.end()) || - attr.has_default_value()) { - continue; - } + // Skip inferred arguments + if (inferred_input_attrs.count(attr.name()) > 0) continue; + const auto entry = AttrTypeName(attr.type()); const auto attr_type_name = entry.first; const bool use_const = entry.second; + string attr_name = AvoidCPPKeywords(attr.name()); - arg_types.push_back(strings::StrCat(use_const ? "const " : "", - attr_type_name, use_const ?
"&" : "")); - arg_names.push_back(AvoidCPPKeywords(attr.name())); + string attr_comment; if (!attr.description().empty()) { - strings::StrAppend(&comment, "* ", AvoidCPPKeywords(attr.name()), ":\n"); // TODO(keveman): Word wrap and indent this, to handle multi-line // descriptions. - strings::StrAppend(&comment, " ", attr.description(), "\n"); + strings::StrAppend(&attr_comment, "* ", attr_name, ": ", + attr.description(), "\n"); + } + if (attr.has_default_value()) { + strings::StrAppend(&optional_attrs_comment, attr_comment); + } else { + strings::StrAppend(&required_attrs_comment, attr_comment); + arg_types.push_back(strings::StrCat( + use_const ? "const " : "", attr_type_name, use_const ? "&" : "")); + arg_names.push_back(attr_name); } } + strings::StrAppend(&comment, required_attrs_comment); + + if (!optional_attrs_comment.empty()) { + strings::StrAppend(&comment, "\nOptional attributes (see `Attrs`):\n"); + strings::StrAppend(&comment, optional_attrs_comment); + } + + // Process outputs for (int i = 0; i < op_def.output_arg_size(); ++i) { const auto& arg = op_def.output_arg(i); bool is_list = ArgIsList(arg); @@ -509,8 +559,6 @@ OpInfo::OpInfo(const OpDef& g_op_def, const OpDef& i_op_def, string OpInfo::GetOpAttrStruct() const { string struct_fields; string setters; - string attrs_comment = - strings::StrCat("Optional attribute setters for ", op_name, " :\n\n"); for (int i = 0; i < op_def.attr_size(); ++i) { const auto& attr(op_def.attr(i)); @@ -531,13 +579,15 @@ string OpInfo::GetOpAttrStruct() const { strings::StrCat(camel_case_name, suffix, "(", use_const ? "const " : "", attr_type_name, use_const ? "&" : ""); - strings::StrAppend(&attrs_comment, attr_func_def, "): Defaults to ", - SummarizeAttrValue(attr.default_value()), "\n"); + string attr_comment; if (!attr.description().empty()) { - // TODO(keveman): Word wrap and indent this to handle multi-line - // description. 
- strings::StrAppend(&attrs_comment, " ", attr.description(), "\n"); + strings::StrAppend(&attr_comment, attr.description(), "\n\n"); } + strings::StrAppend(&attr_comment, "Defaults to ", + SummarizeAttrValue(attr.default_value()), "\n"); + attr_comment = MakeComment(attr_comment, " "); + + strings::StrAppend(&setters, attr_comment); strings::StrAppend(&setters, " Attrs ", attr_func_def, " x) {\n"); strings::StrAppend(&setters, " Attrs ret = *this;\n"); strings::StrAppend(&setters, " ret.", attr.name(), "_ = x;\n"); @@ -552,6 +602,8 @@ string OpInfo::GetOpAttrStruct() const { return ""; } + string attrs_comment = + strings::StrCat("Optional attribute setters for ", op_name, "\n"); string struct_decl = MakeComment(attrs_comment, " "); strings::StrAppend(&struct_decl, " struct Attrs {\n"); strings::StrAppend(&struct_decl, setters, struct_fields); @@ -678,7 +730,7 @@ void OpInfo::GetOutput(string* out) const { // One output, no need for NameRangeMap if (is_list_output[0]) { strings::StrAppend(out, - " for (int64 i = 0; i < ret->num_outputs(); ++i)\n"); + " for (int32 i = 0; i < ret->num_outputs(); ++i)\n"); strings::StrAppend(out, " this->", output_names[0], ".push_back(Output(ret, i));\n"); } else { @@ -688,11 +740,10 @@ void OpInfo::GetOutput(string* out) const { return; } strings::StrAppend(out, " ::tensorflow::NameRangeMap _outputs_range;\n"); - strings::StrAppend( - out, - " ::tensorflow::Status _status_ = " - "::tensorflow::NameRangesForNode(ret->def(), ret->op_def(), " - "nullptr, &_outputs_range);\n"); + strings::StrAppend(out, + " ::tensorflow::Status _status_ = " + "::tensorflow::NameRangesForNode(*ret, ret->op_def(), " + "nullptr, &_outputs_range);\n"); strings::StrAppend(out, " if (!_status_.ok()) {\n", " ", scope_str, ".UpdateStatus(_status_);\n", " return;\n"); strings::StrAppend(out, " }\n\n"); @@ -701,7 +752,7 @@ void OpInfo::GetOutput(string* out) const { const string arg_range = strings::StrCat( "_outputs_range[\"", graph_op_def.output_arg(i).name(), "\"]"); if (is_list_output[i]) { - strings::StrAppend(out, " for (int64 i = ", arg_range, ".first; i < ", + strings::StrAppend(out, " for (int32 i = ", arg_range, ".first; i < ", arg_range, ".second; ++i)\n"); strings::StrAppend(out, " this->", output_names[i], ".push_back(Output(ret, i));\n"); @@ -841,6 +892,10 @@ namespace ops { )include", "#include \"", op_header, "\"\n", namespace_begin); + const string filename = GetFilename(dot_h_fname); + const string doxygen = strings::StrCat("/// @defgroup ", filename, " ", + ToTitle(filename), "\n", "/// @{\n\n"); + TF_CHECK_OK(h->Append( strings::StrCat("// This file is MACHINE GENERATED! Do not edit.\n\n" "#ifndef ", @@ -850,6 +905,7 @@ namespace ops { *op_header_guard, "\n\n"))); TF_CHECK_OK(h->Append(header)); TF_CHECK_OK(h->Append(namespace_begin)); + TF_CHECK_OK(h->Append(doxygen)); TF_CHECK_OK(cc->Append(cc_header)); } @@ -860,7 +916,9 @@ void FinishFiles(bool internal, WritableFile* h, WritableFile* cc, } // namespace tensorflow )footer" : - R"footer(} // namespace ops + R"footer(/// @} + +} // namespace ops } // namespace tensorflow )footer"; @@ -892,7 +950,7 @@ void WriteCCOps(const OpList& ops, const string& dot_h_fname, // Load the override map. OpGenOverrideMap override_map; if (!overrides_fnames.empty()) { - override_map.LoadFileList(env, overrides_fnames); + TF_CHECK_OK(override_map.LoadFileList(env, overrides_fnames)); } // Write the initial boilerplate to the .h and .cc files. 
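The user-visible effect of these cc_op_gen changes shows up in how generated wrappers are called: required attributes stay positional, while defaulted ones travel through the documented `Attrs` setters. A minimal client-side sketch, assuming the generated headers and build wiring are in place (`MatMul::TransposeB` appears in the tests later in this diff):

```c++
#include "tensorflow/cc/client/client_session.h"
#include "tensorflow/cc/ops/standard_ops.h"

int main() {
  using namespace tensorflow;       // NOLINT(build/namespaces)
  using namespace tensorflow::ops;  // NOLINT(build/namespaces)

  Scope root = Scope::NewRootScope();
  auto x = Const(root, {{1.0f, 2.0f}, {3.0f, 4.0f}});
  auto y = Const(root, {{1.0f, 0.0f}, {0.0f, 1.0f}});
  // transpose_b has a default value, so it is passed via the generated
  // Attrs struct rather than as a positional argument.
  auto z = MatMul(root, x, y, MatMul::TransposeB(true));

  ClientSession session(root);
  std::vector<Tensor> outputs;
  TF_CHECK_OK(session.Run({z}, &outputs));
  return 0;
}
```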
diff --git a/tensorflow/cc/framework/cc_ops_test.cc b/tensorflow/cc/framework/cc_ops_test.cc index 6dc0d84c16d..5da23036eaa 100644 --- a/tensorflow/cc/framework/cc_ops_test.cc +++ b/tensorflow/cc/framework/cc_ops_test.cc @@ -32,10 +32,11 @@ Output Linear(const Scope& scope, Input x, Input w, Input b) { return BiasAdd(cop_scopes.last, m, b); } -void GetColocationConstraints(Output tensor, std::vector<string>* constraints) { +void GetColocationConstraints(const Output& tensor, + std::vector<string>* constraints) { constraints->clear(); - TF_EXPECT_OK( - GetNodeAttr(tensor.op().node()->def(), kColocationAttrName, constraints)); + TF_EXPECT_OK(GetNodeAttr(tensor.op().node()->attrs(), kColocationAttrName, + constraints)); } } // namespace @@ -158,11 +159,11 @@ TEST(CCOpTest, KernelLabel) { Scope root = Scope::NewRootScope(); auto add = Add(root.WithKernelLabel("AddWithKernelLabel"), 1.0f, 2.0f); TF_EXPECT_OK(root.status()); - const auto& attrs = add.z.op().node()->def().attr(); - ASSERT_TRUE(attrs.find("_kernel") != attrs.end()); - auto kernel_attr = attrs.find("_kernel")->second; - TF_EXPECT_OK(AttrValueHasType(kernel_attr, "string")); - EXPECT_EQ(kernel_attr.s(), "AddWithKernelLabel"); + AttrSlice attrs = add.z.op().node()->attrs(); + const auto* kernel_attr = attrs.Find("_kernel"); + ASSERT_TRUE(kernel_attr); + TF_EXPECT_OK(AttrValueHasType(*kernel_attr, "string")); + EXPECT_EQ(kernel_attr->s(), "AddWithKernelLabel"); } TEST(CCOpTest, ColocateWith) { @@ -189,8 +190,7 @@ TEST(CCOpTest, ColocateWith) { Scope with_colocate = root.ColocateWith(c3).ColocateWith(c4); auto c6 = Const(with_colocate.WithOpName("c6").ClearColocation(), 7); - const auto& attrs = c6.op().node()->def().attr(); - EXPECT_TRUE(attrs.find("_class") == attrs.end()); + EXPECT_FALSE(c6.op().node()->attrs().Find("_class")); } TEST(CCOpTest, TemplatedConst) { diff --git a/tensorflow/cc/framework/grad_op_registry.cc b/tensorflow/cc/framework/grad_op_registry.cc index 0d6a377b507..254705736e7 100644 --- a/tensorflow/cc/framework/grad_op_registry.cc +++ b/tensorflow/cc/framework/grad_op_registry.cc @@ -32,7 +32,13 @@ bool GradOpRegistry::Register(const string& op, GradFunc func) { Status GradOpRegistry::Lookup(const string& op, GradFunc* func) const { auto iter = registry_.find(op); if (iter == registry_.end()) { - return errors::NotFound("No gradient defined for op: ", op); + const string error_msg = + "No gradient defined for op: " + op + + ". Please see " + "https://www.tensorflow.org/code/" + "tensorflow/cc/gradients/README.md" + " for instructions on how to add C++ gradients."; + return errors::NotFound(error_msg); } *func = iter->second; return Status::OK(); diff --git a/tensorflow/cc/framework/gradient_checker.cc b/tensorflow/cc/framework/gradient_checker.cc index 849a8eed6f2..f3a7c138c4e 100644 --- a/tensorflow/cc/framework/gradient_checker.cc +++ b/tensorflow/cc/framework/gradient_checker.cc @@ -22,8 +22,6 @@ limitations under the License.
#include "tensorflow/core/lib/core/errors.h" namespace tensorflow { -using namespace ops; // NOLINT(build/namespaces) - namespace { // TODO(andydavis) Support returning relative error (as opposed to max error) @@ -39,14 +37,16 @@ Status ComputeTheoreticalJacobianTranspose( const std::vector& x_shapes, const std::vector& x_datas, const OutputList& ys, const std::vector& y_shapes, - std::vector& jacobian_ts) { - int y_num = y_shapes.size(); - int x_num = x_shapes.size(); + std::vector* jacobian_ts) { + size_t y_num = y_shapes.size(); + size_t x_num = x_shapes.size(); // Call AddSymbolicGradients to get 'dxs' (we will feed 'dys'). OutputList dys; + dys.reserve(y_shapes.size()); for (const auto& y_shape : y_shapes) { // TODO(suharshs): This currently assumes that all x's are the same type. - dys.push_back(Cast(scope, Const(scope, 1.0, y_shape), xs[0].type())); + dys.push_back( + ops::Cast(scope, ops::Const(scope, 1.0, y_shape), xs[0].type())); } OutputList dxs; TF_RETURN_IF_ERROR(AddSymbolicGradients(scope, ys, xs, dys, &dxs)); @@ -84,7 +84,7 @@ Status ComputeTheoreticalJacobianTranspose( for (int x_idx = 0; x_idx < x_num; x_idx++) { const int64 x_size = x_shapes[x_idx].num_elements(); - auto jacobian = jacobian_ts[x_idx * y_num + y_idx].matrix(); + auto jacobian = (*jacobian_ts)[x_idx * y_num + y_idx].matrix(); auto dx_flat = dxout[x_idx].flat(); for (int r = 0; r < x_size; ++r) { jacobian(r, c) = dx_flat(r); @@ -97,20 +97,20 @@ Status ComputeTheoreticalJacobianTranspose( return Status::OK(); } -Status EvaluateGraph(ClientSession& session, const OutputList& xs, - const OutputList& ys, std::vector& x_datas, +Status EvaluateGraph(ClientSession* session, const OutputList& xs, + const OutputList& ys, std::vector* x_datas, std::vector* y_datas) { // Create the feed list. ClientSession::FeedType feed_list; - for (int i = 0; i < x_datas.size(); i++) { - feed_list.insert({xs[i], x_datas[i]}); + for (int i = 0; i < x_datas->size(); i++) { + feed_list.insert({xs[i], (*x_datas)[i]}); } - TF_RETURN_IF_ERROR(session.Run(feed_list, ys, y_datas)); + TF_RETURN_IF_ERROR(session->Run(feed_list, ys, y_datas)); for (int y_idx = 0; y_idx < y_datas->size(); y_idx++) { - for (int x_idx = 0; x_idx < x_datas.size(); x_idx++) { + for (int x_idx = 0; x_idx < x_datas->size(); x_idx++) { Tensor y_data = (*y_datas)[y_idx]; - if (y_data.SharesBufferWith(x_datas[x_idx])) { + if (y_data.SharesBufferWith((*x_datas)[x_idx])) { // Create copies of outputs that share a buffer with any inputs since // the underlying buffer of the input Tensors are not copied for some // operations (i.e. Identity), which can lead to incorrect results for @@ -128,14 +128,14 @@ Status ComputeNumericJacobianTranspose(const Scope& scope, const OutputList& xs, const OutputList& ys, const std::vector& y_shapes, const T delta, - std::vector& x_datas, - std::vector& jacobian_ts) { - int y_num = y_shapes.size(); - int x_num = x_shapes.size(); + std::vector* x_datas, + std::vector* jacobian_ts) { + size_t y_num = y_shapes.size(); + size_t x_num = x_shapes.size(); ClientSession session(scope); for (int x_idx = 0; x_idx < x_num; x_idx++) { - auto x_data_flat = x_datas[x_idx].flat(); + auto x_data_flat = (*x_datas)[x_idx].flat(); const int64 x_size = x_shapes[x_idx].num_elements(); // Compute the numeric Jacobian one column at a time by perturbing each @@ -147,11 +147,11 @@ Status ComputeNumericJacobianTranspose(const Scope& scope, const OutputList& xs, // Evaluate at positive delta. 
x_data_flat(r) = v + delta; std::vector<Tensor> y_pos; - TF_RETURN_IF_ERROR(EvaluateGraph(session, xs, ys, x_datas, &y_pos)); + TF_RETURN_IF_ERROR(EvaluateGraph(&session, xs, ys, x_datas, &y_pos)); // Evaluate at negative delta. x_data_flat(r) = v - delta; std::vector<Tensor> y_neg; - TF_RETURN_IF_ERROR(EvaluateGraph(session, xs, ys, x_datas, &y_neg)); + TF_RETURN_IF_ERROR(EvaluateGraph(&session, xs, ys, x_datas, &y_neg)); for (int y_idx = 0; y_idx < y_num; y_idx++) { // Compute element-wise centered difference and store in each Jacobian. @@ -159,7 +159,7 @@ Status ComputeNumericJacobianTranspose(const Scope& scope, const OutputList& xs, auto y_neg_flat = y_neg[y_idx].flat<T>(); const int64 y_size = y_shapes[y_idx].num_elements(); const T scale = 2 * delta; - auto jacobian = jacobian_ts[x_idx * y_num + y_idx].matrix<T>(); + auto jacobian = (*jacobian_ts)[x_idx * y_num + y_idx].matrix<T>(); for (int c = 0; c < y_size; ++c) { jacobian(r, c) = (y_pos_flat(c) - y_neg_flat(c)) / scale; } @@ -175,11 +175,11 @@ template <typename T> void InitJacobians(const OutputList& xs, const std::vector<TensorShape>& x_shapes, const std::vector<TensorShape>& y_shapes, - std::vector<Tensor>& jacobians) { - int y_num = y_shapes.size(); - int x_num = x_shapes.size(); + std::vector<Tensor>* jacobians) { + size_t y_num = y_shapes.size(); + size_t x_num = x_shapes.size(); - jacobians.resize(y_num * x_num); + jacobians->resize(y_num * x_num); for (int x_idx = 0; x_idx < x_num; x_idx++) { const int64 x_size = x_shapes[x_idx].num_elements(); for (int y_idx = 0; y_idx < y_num; y_idx++) { @@ -187,7 +187,7 @@ void InitJacobians(const OutputList& xs, Tensor jacobian_t(xs[x_idx].type(), {x_size, y_size}); auto jacobian_t_flat = jacobian_t.flat<T>(); jacobian_t_flat.setZero(); - jacobians[x_idx * y_num + y_idx] = std::move(jacobian_t); + (*jacobians)[x_idx * y_num + y_idx] = std::move(jacobian_t); } } } @@ -197,23 +197,23 @@ Status ComputeGradientErrorInternal(const Scope& scope, const OutputList& xs, const std::vector<TensorShape>& x_shapes, const OutputList& ys, const std::vector<TensorShape>& y_shapes, - std::vector<Tensor>& x_datas, + std::vector<Tensor>* x_datas, T* max_error) { // Initialize theoretical Jacobians to zeros. std::vector<Tensor> jacobian_ts; - InitJacobians(xs, x_shapes, y_shapes, jacobian_ts); + InitJacobians(xs, x_shapes, y_shapes, &jacobian_ts); // Compute theoretical Jacobian. TF_RETURN_IF_ERROR(ComputeTheoreticalJacobianTranspose( - scope, xs, x_shapes, x_datas, ys, y_shapes, jacobian_ts)); + scope, xs, x_shapes, *x_datas, ys, y_shapes, &jacobian_ts)); // Initialize numeric Jacobian to zeros. std::vector<Tensor> jacobian_ns; - InitJacobians(xs, x_shapes, y_shapes, jacobian_ns); + InitJacobians(xs, x_shapes, y_shapes, &jacobian_ns); // Compute numeric Jacobian. TF_RETURN_IF_ERROR(ComputeNumericJacobianTranspose( - scope, xs, x_shapes, ys, y_shapes, 1e-3, x_datas, jacobian_ns)); + scope, xs, x_shapes, ys, y_shapes, 1e-3, x_datas, &jacobian_ns)); for (int i = 0; i < jacobian_ts.size(); i++) { // Compute the maximum error between theoretical and numeric Jacobians. @@ -256,7 +256,7 @@ Status ComputeGradientError(const Scope& scope, const OutputList& xs, } // Compute gradient error. return ComputeGradientErrorInternal(scope, xs, x_shapes, ys, y_shapes, - x_datas, max_error); + &x_datas, max_error); } template <typename T> @@ -267,7 +267,7 @@ Status ComputeGradientError(const Scope& scope, const Output& x, std::vector<Tensor> x_datas(1, Tensor(x_init_value)); // Compute gradient error.
return ComputeGradientErrorInternal(scope, {x}, {x_datas[0].shape()}, {y}, - {y_shape}, x_datas, max_error); + {y_shape}, &x_datas, max_error); } #define INSTANTIATE_GRAD_ERR_TYPE(T) \ diff --git a/tensorflow/cc/framework/gradient_checker_test.cc b/tensorflow/cc/framework/gradient_checker_test.cc index 998e9fe07dc..c5bddc50fcc 100644 --- a/tensorflow/cc/framework/gradient_checker_test.cc +++ b/tensorflow/cc/framework/gradient_checker_test.cc @@ -19,9 +19,9 @@ limitations under the License. #include "tensorflow/cc/ops/standard_ops.h" #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/tensor_testutil.h" -#include "tensorflow/core/graph/equal_graph_def.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/util/equal_graph_def.h" namespace tensorflow { using namespace ops; // NOLINT(build/namespaces) diff --git a/tensorflow/cc/framework/gradients.cc b/tensorflow/cc/framework/gradients.cc index 2c60f947a55..8c00a6f7049 100644 --- a/tensorflow/cc/framework/gradients.cc +++ b/tensorflow/cc/framework/gradients.cc @@ -210,8 +210,8 @@ Status SymbolicGradientBuilder::Initialize() { { // Initialize backprop with `grad_inputs_`. - const int num_dy = grad_inputs_.size(); - for (int i = 0; i < num_dy; ++i) { + const size_t num_dy = grad_inputs_.size(); + for (size_t i = 0; i < num_dy; ++i) { TF_RETURN_IF_ERROR(BackpropAlongEdge(grad_inputs_[i], outputs_[i])); } } @@ -308,7 +308,7 @@ Status SymbolicGradientBuilder::AddGradients() { continue; } - const int num_no_grad = no_grad_dy_indices.size(); + const size_t num_no_grad = no_grad_dy_indices.size(); if (IsPrimitiveOpWithNoGrad(n->type_string()) || num_no_grad == num_y) { // No grad defined for this op, or all outputs returned 'NoGradient': // Backprop 'NoGradient' along the in edges. @@ -367,6 +367,19 @@ Status AddSymbolicGradients(const Scope& scope, return builder.AddGradients(); } +Status AddSymbolicGradients(const Scope& scope, + const std::vector<Output>& outputs, + const std::vector<Output>& inputs, + std::vector<Output>* grad_outputs) { + std::vector<Output> grad_inputs; + grad_inputs.reserve(outputs.size()); + for (const Output& output : outputs) { + grad_inputs.emplace_back(ops::OnesLike(scope, output)); + } + return AddSymbolicGradients(scope, outputs, inputs, grad_inputs, + grad_outputs); +} + Output NoGradient() { return SymbolicGradientBuilder::NoGradient(); } } // end namespace tensorflow diff --git a/tensorflow/cc/framework/gradients.h b/tensorflow/cc/framework/gradients.h index d076bc43b4f..717f6f0636d 100644 --- a/tensorflow/cc/framework/gradients.h +++ b/tensorflow/cc/framework/gradients.h @@ -27,16 +27,19 @@ namespace tensorflow { /// derivatives of some loss function 'L' w.r.t 'outputs'), adds gradient nodes /// to the graph associated with 'scope', which compute (and return in /// 'grad_outputs') the symbolic partial derivatives of 'L' w.r.t 'inputs'. /// - -// TODO(andydavis) Add overload of this function with no 'grad_inputs' arg. -// Implementation will fill in 'OnesLike' for all shapes in 'outputs'. Status AddSymbolicGradients(const Scope& scope, const std::vector<Output>& outputs, const std::vector<Output>& inputs, const std::vector<Output>& grad_inputs, std::vector<Output>* grad_outputs); +// Same as above, but uses 'OnesLike' for all shapes in +// 'outputs' as grad_inputs. +Status AddSymbolicGradients(const Scope& scope, + const std::vector<Output>& outputs, + const std::vector<Output>& inputs, + std::vector<Output>* grad_outputs); + /// Returns a sentinel Output that represents 'no gradient' (i.e.
no gradient /// flows along some graph edge during backpropagation). /// Can be returned in 'grad_outputs' by an invocation of 'AddSymbolicGradients' diff --git a/tensorflow/cc/framework/gradients_test.cc b/tensorflow/cc/framework/gradients_test.cc index 8c6e5de4259..6a249825812 100644 --- a/tensorflow/cc/framework/gradients_test.cc +++ b/tensorflow/cc/framework/gradients_test.cc @@ -19,9 +19,9 @@ limitations under the License. #include "tensorflow/cc/ops/standard_ops.h" #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/tensor_testutil.h" -#include "tensorflow/core/graph/equal_graph_def.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/util/equal_graph_def.h" namespace tensorflow { using namespace ops; // NOLINT(build/namespaces) @@ -40,7 +40,7 @@ class GradientsTest : public ::testing::Test { TF_ASSERT_OK(scope_test_.ToGraphDef(&gdef_test)); GraphDef gdef_exp; TF_ASSERT_OK(scope_expected_.ToGraphDef(&gdef_exp)); - TF_EXPECT_GRAPH_EQ(gdef_test, gdef_exp); + TF_EXPECT_GRAPH_EQ(gdef_exp, gdef_test); } Scope scope_expected_; @@ -98,6 +98,32 @@ TEST_F(GradientsTest, OneMatMul) { CompareTestAndExpectedGraphs(); } +TEST_F(GradientsTest, OneMatMul_InferGradInputs) { + for (const bool expected : {false, true}) { + const Scope& scope = expected ? scope_expected_ : scope_test_; + // Construct forward graph. + auto x = Const(scope, {{1.0, 2.0}, {3.0, 4.0}}); + auto y = Const(scope, {{1.0, 0.0}, {0.0, 1.0}}); + auto z = MatMul(scope, x, y); + TF_ASSERT_OK(scope.status()); + CHECK_NOTNULL(z.node()); + + if (expected) { + // Construct backward graph. + // The gradients function adds a OnesLike to create a dz of ones with the + // shape of z. + auto dz = OnesLike(scope, z); + auto dx = MatMul(scope, dz, y, MatMul::TransposeB(true)); + auto dy = MatMul(scope, x, dz, MatMul::TransposeA(true)); + } else { + // Call AddSymbolicGradients. + std::vector<Output> grad_outputs; + TF_ASSERT_OK(AddSymbolicGradients(scope, {z}, {x, y}, &grad_outputs)); + } + } + CompareTestAndExpectedGraphs(); +} + TEST_F(GradientsTest, TwoMatMuls_Chained) { for (const bool expected : {false, true}) { const Scope& scope = expected ? scope_expected_ : scope_test_; @@ -234,7 +260,7 @@ TEST_F(GradientsTest, StackUnstack_StopBackprop) { } TEST_F(GradientsTest, DependentGradOutputs) { - // Tests that dependant gradients (in this case the gradients w.r.t to the + // Tests that dependent gradients (in this case the gradients w.r.t to the // output and one input of MatMul) are computed properly. // Create two chained MatMul ops.
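The OneMatMul_InferGradInputs test above drives the new AddSymbolicGradients overload end to end; outside the fixture the call reduces to this minimal sketch (assuming the usual `tensorflow` and `tensorflow::ops` namespaces are available):

```c++
Scope scope = Scope::NewRootScope();
auto x = ops::Const(scope, {{1.0, 2.0}, {3.0, 4.0}});
auto y = ops::Const(scope, {{1.0, 0.0}, {0.0, 1.0}});
auto z = ops::MatMul(scope, x, y);

std::vector<Output> grad_outputs;
// No grad_inputs argument: backprop is seeded with OnesLike(z) internally.
TF_CHECK_OK(AddSymbolicGradients(scope, {z}, {x, y}, &grad_outputs));
// grad_outputs[0] holds dz/dx and grad_outputs[1] holds dz/dy.
```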
diff --git a/tensorflow/cc/framework/ops.cc b/tensorflow/cc/framework/ops.cc index 50df891a4c4..920a8e79556 100644 --- a/tensorflow/cc/framework/ops.cc +++ b/tensorflow/cc/framework/ops.cc @@ -20,7 +20,7 @@ namespace tensorflow { Operation::Operation(Node* n) : inputs_(GetInputs(n)), node_(n) {} -Output Operation::input(int i) const { +Output Operation::input(int32 i) const { CHECK_NOTNULL(node_); CHECK_GE(i, 0); CHECK_LT(i, node_->num_inputs()); @@ -37,14 +37,14 @@ Output Operation::input(int i) const { return Output(inputs_[i].first, inputs_[i].second); } -Output Operation::output(int i) const { +Output Operation::output(int32 i) const { CHECK_NOTNULL(node_); CHECK_GE(i, 0); CHECK_LT(i, node_->num_outputs()); return Output(node_, i); } -uint64 Operation::hash(int64 index) const { +uint64 Operation::hash(int32 index) const { return ::tensorflow::Hash64(reinterpret_cast<const char*>(&node_), sizeof(Node*), index); } diff --git a/tensorflow/cc/framework/ops.h b/tensorflow/cc/framework/ops.h index d4f1079c3b2..8d4154220c4 100644 --- a/tensorflow/cc/framework/ops.h +++ b/tensorflow/cc/framework/ops.h @@ -26,30 +26,35 @@ limitations under the License. namespace tensorflow { +/// @defgroup core Core Tensorflow API + class Output; +/// @addtogroup core +/// @{ + /// Represents a node in the computation graph. class Operation { public: Operation() : node_(nullptr) {} explicit Operation(Node* n); - int num_inputs() const { return node_->num_inputs(); } - DataType input_type(int o) const { return node_->input_type(o); } - Output input(int i) const; + int32 num_inputs() const { return node_->num_inputs(); } + DataType input_type(int32 o) const { return node_->input_type(o); } + Output input(int32 i) const; - int num_outputs() const { return node_->num_outputs(); } - DataType output_type(int o) const { return node_->output_type(o); } - Output output(int i) const; + int32 num_outputs() const { return node_->num_outputs(); } + DataType output_type(int32 o) const { return node_->output_type(o); } + Output output(int32 i) const; Node* node() const { return node_; } - uint64 hash(int64 index) const; + uint64 hash(int32 index) const; bool operator==(const Operation& other) const { return node_ == other.node_; } private: - typedef std::vector<std::pair<Node*, int64>> Inputs; + typedef std::vector<std::pair<Node*, int32>> Inputs; static Inputs GetInputs(Node* node); Inputs inputs_; @@ -61,12 +66,12 @@ class Output { public: Output() = default; explicit Output(Node* n) : op_(n) {} - Output(Node* n, int64 index) : op_(n), index_(index) {} - Output(const Operation& op, int64 index) : op_(op), index_(index) {} + Output(Node* n, int32 index) : op_(n), index_(index) {} + Output(const Operation& op, int32 index) : op_(op), index_(index) {} Operation op() const { return op_; } Node* node() const { return op().node(); } - int64 index() const { return index_; } + int32 index() const { return index_; } DataType type() const { return op_.output_type(index_); } string name() const { return strings::StrCat(node()->name(), ":", index()); } bool operator==(const Output& other) const { @@ -77,13 +82,14 @@ class Output { private: Operation op_ = Operation(nullptr); - int64 index_ = 0; + int32 index_ = 0; }; +/// Hash class that can be used for e.g. storing Outputs in an unordered_map struct OutputHash { std::size_t operator()(const Output& output) const { return Hash64Combine(std::hash<Node*>()(output.node()), - std::hash<int64>()(output.index())); + std::hash<int32>()(output.index())); } }; @@ -161,6 +167,7 @@ class Input { /// initializer list is indeed a valid multi-dimensional tensor.
Initializer(const std::initializer_list<Initializer>& v); + // START_SKIP_DOXYGEN template <typename T, bool = std::is_convertible<T, string>::value> struct RealType { typedef string type; }; @@ -170,6 +177,7 @@ class Input { struct RealType<T, false> { typedef T type; }; + // END_SKIP_DOXYGEN TensorProto AsTensorProto() { TensorProto tensor_proto; @@ -222,12 +230,12 @@ class Input { /// Constructor specifying a node name, index and datatype. This should only /// be used for specifying a backward edge, needed by control flow. - Input(const string& name, int i, DataType dt) + Input(const string& name, int32 i, DataType dt) : node_name_(name), index_(i), data_type_(dt) {} Node* node() const { return output_.node(); } string node_name() const { return node_name_; } - int index() const { return node_name_.empty() ? output_.index() : index_; } + int32 index() const { return node_name_.empty() ? output_.index() : index_; } DataType data_type() const { return data_type_; } Status status() const { return status_; } const Tensor& tensor() const { return tensor_; } @@ -237,7 +245,7 @@ class Input { Output output_ = Output(Operation(nullptr), 0); Tensor tensor_; const string node_name_ = ""; - int index_ = 0; + int32 index_ = 0; DataType data_type_ = DT_INVALID; }; @@ -284,6 +292,8 @@ class InputList { std::vector<Input> inputs_; }; +/// @} + } // namespace tensorflow #endif // THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_OPS_H_ diff --git a/tensorflow/cc/framework/scope.cc b/tensorflow/cc/framework/scope.cc index e1af5b36e8c..32c0822de69 100644 --- a/tensorflow/cc/framework/scope.cc +++ b/tensorflow/cc/framework/scope.cc @@ -16,15 +16,116 @@ limitations under the License. #include <algorithm> #include <vector> -#include "tensorflow/cc/framework/scope.h" +#include "tensorflow/cc/framework/scope_internal.h" +#include "tensorflow/core/common_runtime/shape_refiner.h" #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/graph/node_builder.h" namespace tensorflow { -Scope::Scope(Graph* graph, Status* status, Scope::NameMap* name_map, - ShapeRefiner* refiner) +class Scope::Impl { + public: + // A NameMap is used to keep track of suffixes for names used in a scope. A + // name that has not been used so far in a scope will get no suffix. Later + // uses of the same name will get suffixes _1, _2, _3, etc. Multiple scopes + // can share the same NameMap. For instance, a new scope created using + // WithControlDependencies() would share the same NameMap with the + // parent. + typedef std::unordered_map<string, int> NameMap; + + Impl(const std::shared_ptr<Graph>& graph, + const std::shared_ptr<Status>& status, + const std::shared_ptr<NameMap>& name_map, + const std::shared_ptr<ShapeRefiner>& refiner); + + private: + friend class Scope; + + // Tag types to choose the constructor to dispatch.
+ struct Tags { + enum class ScopeName; + enum class OpName; + enum class ControlDeps; + enum class Device; + enum class SingleUseScope; + enum class ExitOnError; + enum class KernelLabel; + enum class Colocate; + }; + + Impl(Graph* graph, Status* status, NameMap* name_map, ShapeRefiner* refiner); + Impl(const Scope& other, Tags::ScopeName, const string& name, + bool copy_names); + Impl(const Scope& other, Tags::OpName, const string& name, + const string& op_name); + Impl(const Scope& other, Tags::ControlDeps, + std::vector<Operation> control_deps, bool clear_control_deps); + Impl(const Scope& other, Tags::Device, const string& device); + Impl(const Scope& other, Tags::SingleUseScope, const string& op_name); + Impl(const Scope& other, Tags::ExitOnError); + Impl(const Scope& other, Tags::KernelLabel, const string& kernel_label); + Impl(const Scope& other, Tags::Colocate, const Operation& colocate_with_op, + bool clear_colocations); + + std::unordered_set<string> GetColocationConstraints( + const Operation& colocate_with_op) const; + + // Helper functions to get unique names. + string GetUniqueName(const string& prefix, bool check_single_use) const; + string GetNameForOp(const string& default_name) const; + + bool single_use_scope() const { return scope_used_ != nullptr; } + + // The graph, status, and name maps are shared by all child scopes + // created from a single 'root' scope. A root scope is created by calling the + // Scope::NewRootScope function, which creates a new graph, a new status and + // the name maps. + std::shared_ptr<Graph> graph_ = nullptr; + std::shared_ptr<Status> status_ = nullptr; + std::shared_ptr<NameMap> name_map_ = nullptr; + std::shared_ptr<ShapeRefiner> refiner_ = nullptr; + + // If scope_used_ is not nullptr, op_name_ should be empty and + // GetUniqueNameForOp can only be called once on this scope. More calls to + // GetUniqueNameForOp will cause an error status to be set on this scope.
+ std::shared_ptr<bool> scope_used_ = nullptr; + + const std::vector<Operation> control_deps_; + + const string name_ = ""; + const string op_name_ = ""; + const bool exit_on_error_ = false; + const string kernel_label_ = ""; + const string device_ = ""; + const std::unordered_set<string> colocation_constraints_; +}; + +Scope::Scope(Impl* impl) : impl_(impl) {} + +Scope::Scope(const Scope& other) : impl_(new Impl(*other.impl())) {} + +Scope::~Scope() {} + +Scope& Scope::operator=(const Scope& other) { + // We can't copy Impls because of the const members, use copy ctor instead + impl_.reset(new Impl(*other.impl_)); + return *this; +} + +Scope::Impl::Impl(Graph* graph, Status* status, NameMap* name_map, + ShapeRefiner* refiner) + : graph_(graph), + status_(status), + name_map_(name_map), + refiner_(refiner), + scope_used_(nullptr), + colocation_constraints_() {} + +Scope::Impl::Impl(const std::shared_ptr<Graph>& graph, + const std::shared_ptr<Status>& status, + const std::shared_ptr<NameMap>& name_map, + const std::shared_ptr<ShapeRefiner>& refiner) : graph_(graph), status_(status), name_map_(name_map), @@ -34,143 +135,145 @@ Scope::Scope(Graph* graph, Status* status, Scope::NameMap* name_map, Scope Scope::NewRootScope() { Graph* graph = new Graph(OpRegistry::Global()); - ShapeRefiner* refiner = new ShapeRefiner(graph->op_registry()); - return Scope(graph, new Status, new Scope::NameMap, refiner); + ShapeRefiner* refiner = + new ShapeRefiner(graph->versions().producer(), graph->op_registry()); + return Scope(new Impl(graph, new Status, new Impl::NameMap, refiner)); } -Scope::Scope(const Scope& other, Scope::Tags::ScopeName, const string& name, - bool copy_names) - : graph_(other.graph_), - status_(other.status_), - name_map_(copy_names ? other.name_map_ +Scope::Impl::Impl(const Scope& other, Tags::ScopeName, const string& name, + bool copy_names) + : graph_(other.impl()->graph_), + status_(other.impl()->status_), + name_map_(copy_names ?
                           : std::shared_ptr<NameMap>(new NameMap)),
-      refiner_(other.refiner_),
+      refiner_(other.impl()->refiner_),
       scope_used_(nullptr),
-      control_deps_(other.control_deps_),
+      control_deps_(other.impl()->control_deps_),
       name_(name),
       op_name_(""),
-      exit_on_error_(other.exit_on_error_),
-      kernel_label_(other.kernel_label_),
-      device_(other.device_),
-      colocation_constraints_(other.colocation_constraints_) {}
+      exit_on_error_(other.impl()->exit_on_error_),
+      kernel_label_(other.impl()->kernel_label_),
+      device_(other.impl()->device_),
+      colocation_constraints_(other.impl()->colocation_constraints_) {}
 
-Scope::Scope(const Scope& other, Scope::Tags::OpName, const string& name,
-             const string& op_name)
-    : graph_(other.graph_),
-      status_(other.status_),
-      name_map_(other.name_map_),
-      refiner_(other.refiner_),
-      scope_used_(other.scope_used_),
-      control_deps_(other.control_deps_),
+Scope::Impl::Impl(const Scope& other, Tags::OpName, const string& name,
+                  const string& op_name)
+    : graph_(other.impl()->graph_),
+      status_(other.impl()->status_),
+      name_map_(other.impl()->name_map_),
+      refiner_(other.impl()->refiner_),
+      scope_used_(other.impl()->scope_used_),
+      control_deps_(other.impl()->control_deps_),
       name_(name),
       op_name_(op_name),
-      exit_on_error_(other.exit_on_error_),
-      kernel_label_(other.kernel_label_),
-      device_(other.device_),
-      colocation_constraints_(other.colocation_constraints_) {}
+      exit_on_error_(other.impl()->exit_on_error_),
+      kernel_label_(other.impl()->kernel_label_),
+      device_(other.impl()->device_),
+      colocation_constraints_(other.impl()->colocation_constraints_) {}
 
-Scope::Scope(const Scope& other, Scope::Tags::ControlDeps,
-             std::vector<Operation> control_deps, bool clear_control_deps)
-    : graph_(other.graph_),
-      status_(other.status_),
-      name_map_(other.name_map_),
-      refiner_(other.refiner_),
-      scope_used_(other.scope_used_),
-      control_deps_(clear_control_deps
-                        ? std::vector<Operation>()
-                        : (control_deps.insert(control_deps.begin(),
-                                               other.control_deps_.begin(),
-                                               other.control_deps_.end()),
-                           control_deps)),
-      name_(other.name_),
-      op_name_(other.op_name_),
-      exit_on_error_(other.exit_on_error_),
-      kernel_label_(other.kernel_label_),
-      device_(other.device_),
-      colocation_constraints_(other.colocation_constraints_) {}
+Scope::Impl::Impl(const Scope& other, Tags::ControlDeps,
+                  std::vector<Operation> control_deps, bool clear_control_deps)
+    : graph_(other.impl()->graph_),
+      status_(other.impl()->status_),
+      name_map_(other.impl()->name_map_),
+      refiner_(other.impl()->refiner_),
+      scope_used_(other.impl()->scope_used_),
+      control_deps_(
+          clear_control_deps
+              ? std::vector<Operation>()
+              : (control_deps.insert(control_deps.begin(),
+                                     other.impl()->control_deps_.begin(),
+                                     other.impl()->control_deps_.end()),
+                 control_deps)),
+      name_(other.impl()->name_),
+      op_name_(other.impl()->op_name_),
+      exit_on_error_(other.impl()->exit_on_error_),
+      kernel_label_(other.impl()->kernel_label_),
+      device_(other.impl()->device_),
+      colocation_constraints_(other.impl()->colocation_constraints_) {}
 
-Scope::Scope(const Scope& other, Scope::Tags::Device, const string& device)
-    : graph_(other.graph_),
-      status_(other.status_),
-      name_map_(other.name_map_),
-      refiner_(other.refiner_),
-      scope_used_(other.scope_used_),
-      control_deps_(other.control_deps_),
-      name_(other.name_),
-      op_name_(other.op_name_),
-      exit_on_error_(other.exit_on_error_),
-      kernel_label_(other.kernel_label_),
+Scope::Impl::Impl(const Scope& other, Tags::Device, const string& device)
+    : graph_(other.impl()->graph_),
+      status_(other.impl()->status_),
+      name_map_(other.impl()->name_map_),
+      refiner_(other.impl()->refiner_),
+      scope_used_(other.impl()->scope_used_),
+      control_deps_(other.impl()->control_deps_),
+      name_(other.impl()->name_),
+      op_name_(other.impl()->op_name_),
+      exit_on_error_(other.impl()->exit_on_error_),
+      kernel_label_(other.impl()->kernel_label_),
       device_(device),
-      colocation_constraints_(other.colocation_constraints_) {}
+      colocation_constraints_(other.impl()->colocation_constraints_) {}
 
-Scope::Scope(const Scope& other, Scope::Tags::SingleUseScope,
-             const string& op_name)
-    : graph_(other.graph_),
-      status_(other.status_),
-      name_map_(other.name_map_),
-      refiner_(other.refiner_),
+Scope::Impl::Impl(const Scope& other, Tags::SingleUseScope,
+                  const string& op_name)
+    : graph_(other.impl()->graph_),
+      status_(other.impl()->status_),
+      name_map_(other.impl()->name_map_),
+      refiner_(other.impl()->refiner_),
       scope_used_(new bool(false)),
-      control_deps_(other.control_deps_),
-      name_(other.name_),
+      control_deps_(other.impl()->control_deps_),
+      name_(other.impl()->name_),
       op_name_(op_name),
-      exit_on_error_(other.exit_on_error_),
-      kernel_label_(other.kernel_label_),
-      device_(other.device_),
-      colocation_constraints_(other.colocation_constraints_) {}
+      exit_on_error_(other.impl()->exit_on_error_),
+      kernel_label_(other.impl()->kernel_label_),
+      device_(other.impl()->device_),
+      colocation_constraints_(other.impl()->colocation_constraints_) {}
 
-Scope::Scope(const Scope& other, Scope::Tags::ExitOnError)
-    : graph_(other.graph_),
-      status_(other.status_),
-      name_map_(other.name_map_),
-      refiner_(other.refiner_),
-      scope_used_(other.scope_used_),
-      control_deps_(other.control_deps_),
-      name_(other.name_),
-      op_name_(other.op_name_),
+Scope::Impl::Impl(const Scope& other, Tags::ExitOnError)
+    : graph_(other.impl()->graph_),
+      status_(other.impl()->status_),
+      name_map_(other.impl()->name_map_),
+      refiner_(other.impl()->refiner_),
+      scope_used_(other.impl()->scope_used_),
+      control_deps_(other.impl()->control_deps_),
+      name_(other.impl()->name_),
+      op_name_(other.impl()->op_name_),
       exit_on_error_(true),
-      kernel_label_(other.kernel_label_),
-      device_(other.device_),
-      colocation_constraints_(other.colocation_constraints_) {}
+      kernel_label_(other.impl()->kernel_label_),
+      device_(other.impl()->device_),
+      colocation_constraints_(other.impl()->colocation_constraints_) {}
 
-Scope::Scope(const Scope& other, Scope::Tags::KernelLabel,
-             const string& kernel_label)
-    : graph_(other.graph_),
-      status_(other.status_),
-      name_map_(other.name_map_),
-      refiner_(other.refiner_),
-      scope_used_(other.scope_used_),
-      control_deps_(other.control_deps_),
-      name_(other.name_),
-      op_name_(other.op_name_),
-      exit_on_error_(other.exit_on_error_),
+Scope::Impl::Impl(const Scope& other, Tags::KernelLabel,
+                  const string& kernel_label)
+    : graph_(other.impl()->graph_),
+      status_(other.impl()->status_),
+      name_map_(other.impl()->name_map_),
+      refiner_(other.impl()->refiner_),
+      scope_used_(other.impl()->scope_used_),
+      control_deps_(other.impl()->control_deps_),
+      name_(other.impl()->name_),
+      op_name_(other.impl()->op_name_),
+      exit_on_error_(other.impl()->exit_on_error_),
       kernel_label_(kernel_label),
-      device_(other.device_),
-      colocation_constraints_(other.colocation_constraints_) {}
+      device_(other.impl()->device_),
+      colocation_constraints_(other.impl()->colocation_constraints_) {}
 
-Scope::Scope(const Scope& other, Scope::Tags::Colocate,
-             const Operation& colocate_with_op, bool clear_colocations)
-    : graph_(other.graph_),
-      status_(other.status_),
-      name_map_(other.name_map_),
-      refiner_(other.refiner_),
-      scope_used_(other.scope_used_),
-      control_deps_(other.control_deps_),
-      name_(other.name_),
-      op_name_(other.op_name_),
-      exit_on_error_(other.exit_on_error_),
-      kernel_label_(other.kernel_label_),
-      device_(other.device_),
+Scope::Impl::Impl(const Scope& other, Tags::Colocate,
+                  const Operation& colocate_with_op, bool clear_colocations)
+    : graph_(other.impl()->graph_),
+      status_(other.impl()->status_),
+      name_map_(other.impl()->name_map_),
+      refiner_(other.impl()->refiner_),
+      scope_used_(other.impl()->scope_used_),
+      control_deps_(other.impl()->control_deps_),
+      name_(other.impl()->name_),
+      op_name_(other.impl()->op_name_),
+      exit_on_error_(other.impl()->exit_on_error_),
+      kernel_label_(other.impl()->kernel_label_),
+      device_(other.impl()->device_),
       colocation_constraints_(
           clear_colocations
              ? std::unordered_set<string>()
-              : other.GetColocationConstraints(colocate_with_op)) {}
+              : other.impl()->GetColocationConstraints(colocate_with_op)) {}
 
-std::unordered_set<string> Scope::GetColocationConstraints(
+std::unordered_set<string> Scope::Impl::GetColocationConstraints(
     const Operation& colocate_with_op) const {
   std::unordered_set<string> current_constraints(colocation_constraints_);
-  const NodeDef& node_def = colocate_with_op.node()->def();
+  const AttrSlice attrs = colocate_with_op.node()->attrs();
   std::vector<string> node_constraints;
-  if (GetNodeAttr(node_def, kColocationAttrName, &node_constraints).ok()) {
+  if (GetNodeAttr(attrs, kColocationAttrName, &node_constraints).ok()) {
     for (const string& entry : node_constraints) {
       StringPiece s(entry);
       if (s.Consume(kColocationGroupPrefix)) {
@@ -183,45 +286,59 @@ std::unordered_set<string> Scope::GetColocationConstraints(
   return current_constraints;
 }
 
+bool Scope::ok() const { return impl()->status_->ok(); }
+
+Graph* Scope::graph() const { return impl()->graph_.get(); }
+
+std::shared_ptr<Graph> Scope::graph_as_shared_ptr() const {
+  return impl()->graph_;
+}
+
+Status Scope::status() const { return *impl()->status_; }
+
+const std::vector<Operation>& Scope::control_deps() const {
+  return impl()->control_deps_;
+}
+
 void Scope::UpdateStatus(const Status s) const {
-  status_->Update(s);
-  if (exit_on_error_ && !status_->ok()) {
-    LOG(FATAL) << *status_;
+  impl()->status_->Update(s);
+  if (impl()->exit_on_error_ && !ok()) {
+    LOG(FATAL) << *impl()->status_;
   }
 }
 
 Status Scope::ToGraphDef(GraphDef* gdef) const {
-  if (!status_->ok()) {
-    return *status_;
+  if (!ok()) {
+    return *impl()->status_;
   }
   graph()->ToGraphDef(gdef);
   return Status::OK();
 }
 
 Status Scope::ToGraph(Graph* g) const {
-  if (status_->ok()) {
+  if (ok()) {
     GraphDef graph_def;
     graph()->ToGraphDef(&graph_def);
     GraphConstructorOptions opts;
     UpdateStatus(ConvertGraphDefToGraph(opts, graph_def, g));
   }
-  return *status_;
+  return *impl()->status_;
 }
 
 void Scope::UpdateBuilder(NodeBuilder* builder) const {
   std::vector<Node*> control_inputs;
-  for (const auto& op : control_deps_) {
+  for (const auto& op : impl()->control_deps_) {
     control_inputs.push_back(op.node());
   }
   builder->ControlInputs(control_inputs);
 
-  if (!kernel_label_.empty()) {
-    builder->Attr("_kernel", kernel_label_);
+  if (!impl()->kernel_label_.empty()) {
+    builder->Attr("_kernel", impl()->kernel_label_);
   }
 
-  if (!colocation_constraints_.empty()) {
-    std::vector<string> constraints(colocation_constraints_.begin(),
-                                    colocation_constraints_.end());
+  if (!impl()->colocation_constraints_.empty()) {
+    std::vector<string> constraints(impl()->colocation_constraints_.begin(),
+                                    impl()->colocation_constraints_.end());
     // Sort the set.
    std::sort(constraints.begin(), constraints.end());
     // Add loc:@ prefix
@@ -231,12 +348,13 @@ void Scope::UpdateBuilder(NodeBuilder* builder) const {
     });
     builder->Attr(kColocationAttrName, constraints);
   }
-  if (!device_.empty()) {
-    builder->Device(device_);
+  if (!impl()->device_.empty()) {
+    builder->Device(impl()->device_);
   }
 }
 
-string Scope::GetUniqueName(const string& prefix, bool check_single_use) const {
+string Scope::Impl::GetUniqueName(const string& prefix,
+                                  bool check_single_use) const {
   if (check_single_use && single_use_scope()) {
     if (*scope_used_) {
       *status_ =
@@ -256,7 +374,7 @@ string Scope::GetUniqueName(const string& prefix, bool check_single_use) const {
   return unique_name;
 }
 
-string Scope::GetNameForOp(const string& default_name) const {
+string Scope::Impl::GetNameForOp(const string& default_name) const {
   const string unique_name =
       GetUniqueName(default_name, true /* check_single_use */);
   const string sep = name_.empty() || unique_name.empty() ? "" : "/";
@@ -264,96 +382,125 @@ string Scope::GetNameForOp(const string& default_name) const {
 }
 
 string Scope::GetUniqueNameForOp(const string& default_name) const {
-  if (single_use_scope()) {
-    if (op_name_.empty() || *scope_used_) {
-      *status_ =
+  if (impl()->single_use_scope()) {
+    if (impl()->op_name_.empty() || *impl()->scope_used_) {
+      *impl()->status_ =
           errors::InvalidArgument("Cannot get a unique name in this scope");
       return "";
     }
-    *scope_used_ = true;
-    return op_name_;
+    *impl()->scope_used_ = true;
+    return impl()->op_name_;
  }
-  return op_name_.empty() ? GetNameForOp(default_name) : GetNameForOp(op_name_);
+  return impl()->op_name_.empty() ? impl()->GetNameForOp(default_name)
+                                  : impl()->GetNameForOp(impl()->op_name_);
 }
 
 Scope Scope::NewSubScope(const string& child_scope_name) const {
   if (child_scope_name.empty()) {
-    return Scope(*this, Scope::Tags::ScopeName(), name_, true /* copy_names */);
+    return Scope(new Impl(*this, Impl::Tags::ScopeName(), impl()->name_,
+                          true /* copy_names */));
   }
   const string unique_name =
-      GetUniqueName(child_scope_name, false /* check_single_use */);
-  const string sep = name_.empty() || unique_name.empty() ? "" : "/";
-  return Scope(*this, Scope::Tags::ScopeName(),
-               strings::StrCat(name_, sep, unique_name),
-               false /* copy_names */);
+      impl()->GetUniqueName(child_scope_name, false /* check_single_use */);
"" : "/"; + return Scope(new Impl(*this, Impl::Tags::ScopeName(), + strings::StrCat(impl()->name_, sep, unique_name), + false /* copy_names */)); } Scope Scope::WithOpName(const string& op_name) const { - if (single_use_scope()) { + if (impl()->single_use_scope()) { UpdateStatus(errors::InvalidArgument("Cannot set op name ", op_name, " on this scope")); return *this; } - return Scope(*this, Scope::Tags::OpName(), name_, op_name); + return Scope(new Impl(*this, Impl::Tags::OpName(), impl()->name_, op_name)); } Scope Scope::WithControlDependencies( const gtl::ArraySlice& control_deps) const { - return Scope(*this, Scope::Tags::ControlDeps(), + return Scope( + new Impl(*this, Impl::Tags::ControlDeps(), std::vector(control_deps.begin(), control_deps.end()), - /* clear_control_deps */ false); + /* clear_control_deps */ false)); } Scope Scope::WithControlDependencies(const Output& control_dep) const { - return Scope(*this, Scope::Tags::ControlDeps(), - std::vector(1, control_dep.op()), - /* clear_control_deps */ false); + return Scope(new Impl(*this, Impl::Tags::ControlDeps(), + std::vector(1, control_dep.op()), + /* clear_control_deps */ false)); } Scope Scope::WithNoControlDependencies() const { - return Scope(*this, Scope::Tags::ControlDeps(), std::vector(), - /* clear_control_deps */ true); + return Scope(new Impl(*this, Impl::Tags::ControlDeps(), + std::vector(), + /* clear_control_deps */ true)); } Scope Scope::WithDevice(const string& device) const { - return Scope(*this, Scope::Tags::Device(), device); + return Scope(new Impl(*this, Impl::Tags::Device(), device)); } Scope Scope::ColocateWith(const Operation& op) const { - return Scope(*this, Scope::Tags::Colocate(), op, - /* clear_colocations */ false); + return Scope(new Impl(*this, Impl::Tags::Colocate(), op, + /* clear_colocations */ false)); } Scope Scope::ClearColocation() const { - return Scope(*this, Scope::Tags::Colocate(), Operation(), - /* clear_colocations */ true); + return Scope(new Impl(*this, Impl::Tags::Colocate(), Operation(), + /* clear_colocations */ true)); } Scope Scope::ExitOnError() const { - return Scope(*this, Scope::Tags::ExitOnError()); + return Scope(new Impl(*this, Impl::Tags::ExitOnError())); } Scope Scope::WithKernelLabel(const string& kernel_label) const { - return Scope(*this, Scope::Tags::KernelLabel(), kernel_label); + return Scope(new Impl(*this, Impl::Tags::KernelLabel(), kernel_label)); } CompositeOpScopes Scope::GetCompositeOpScopes( const string& composite_op_name) const { - if (op_name_.empty() && composite_op_name.empty()) { + if (impl()->op_name_.empty() && composite_op_name.empty()) { UpdateStatus(errors::InvalidArgument( "Cannot create composite op scopes with empty name")); return {*this, *this}; } - if (!single_use_scope()) { - Scope child = NewSubScope(op_name_.empty() ? composite_op_name : op_name_); - const string child_op_sep = name_.empty() ? "" : "_"; - return {child, Scope(child, Scope::Tags::SingleUseScope(), - strings::StrCat(name_, child_op_sep, child.name_))}; + if (!impl()->single_use_scope()) { + Scope child = NewSubScope(impl()->op_name_.empty() ? composite_op_name + : impl()->op_name_); + const string child_op_sep = impl()->name_.empty() ? 
"" : "_"; + const string child_name = + strings::StrCat(impl()->name_, child_op_sep, child.impl()->name_); + return {child, + Scope(new Impl(child, Impl::Tags::SingleUseScope(), child_name))}; } else { - return { - Scope(*this, Scope::Tags::ScopeName(), op_name_, true /* copy_names */), - *this}; + return {Scope(new Impl(*this, Impl::Tags::ScopeName(), impl()->op_name_, + true /* copy_names */)), + *this}; } } +class InternalScope { + public: + // NewScope doesn't take ownership of the inputs. + static Scope NewScope(Graph* graph, Status* status, ShapeRefiner* refiner) { + Scope::Impl::NameMap* name_map = new Scope::Impl::NameMap; + for (const Node* node : graph->nodes()) { + (*name_map)[node->name()] = 0; + } + // We provide null destructors for these shared ptrs (except for name_map) + // since the caller owns them and doesn't want the scope to destroy them. + return Scope(new Scope::Impl( + std::shared_ptr(graph, [](Graph*) {}), + std::shared_ptr(status, [](Status*) {}), + std::shared_ptr(name_map), + std::shared_ptr(refiner, [](ShapeRefiner*) {}))); + } +}; + +Scope NewInternalScope(Graph* graph, Status* status, ShapeRefiner* refiner) { + return InternalScope::NewScope(graph, status, refiner); +} + } // namespace tensorflow diff --git a/tensorflow/cc/framework/scope.h b/tensorflow/cc/framework/scope.h index 47d1026bb23..ec3543772d8 100644 --- a/tensorflow/cc/framework/scope.h +++ b/tensorflow/cc/framework/scope.h @@ -23,16 +23,19 @@ limitations under the License. #include #include "tensorflow/cc/framework/ops.h" -#include "tensorflow/core/common_runtime/shape_refiner.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/gtl/array_slice.h" namespace tensorflow { +class Graph; class GraphDef; class NodeBuilder; struct CompositeOpScopes; +/// @addtogroup core +/// @{ + /// A `Scope` object represents a set of related TensorFlow ops that have the /// same properties such as a common name prefix. /// @@ -91,6 +94,10 @@ struct CompositeOpScopes; /// op-constructor functions on the same `Scope` object. class Scope { public: + Scope(const Scope& other); + ~Scope(); + Scope& operator=(const Scope& other); + // The following functions are for users making graphs. They return brand new // scopes, or scopes derived from an existing scope object. @@ -161,20 +168,21 @@ class Scope { // START_SKIP_DOXYGEN /// Update the builder with properties accumulated in this scope. + // TODO(skyewm): NodeBuilder is not part of public API void UpdateBuilder(NodeBuilder* builder) const; // END_SKIP_DOXYGEN CompositeOpScopes GetCompositeOpScopes(const string& composite_op_name) const; - bool ok() const { return status_->ok(); } + bool ok() const; - Graph* graph() const { return graph_.get(); } + // TODO(skyewm): Graph is not part of public API + Graph* graph() const; - ShapeRefiner* refiner() const { return refiner_.get(); } + // TODO(skyewm): Graph is not part of public API + std::shared_ptr graph_as_shared_ptr() const; - std::shared_ptr graph_as_shared_ptr() const { return graph_; } - - Status status() const { return *status_; } + Status status() const; /// If status() is Status::OK(), convert the Graph object stored in this scope /// to a GraphDef proto and return Status::OK(). Otherwise, return the error @@ -193,74 +201,15 @@ class Scope { Status ToGraph(Graph* g) const; // END_SKIP_DOXYGEN - const std::vector& control_deps() const { return control_deps_; } + const std::vector& control_deps() const; private: - // Tag types to choose the constructor to dispatch. 
diff --git a/tensorflow/cc/framework/scope.h b/tensorflow/cc/framework/scope.h
index 47d1026bb23..ec3543772d8 100644
--- a/tensorflow/cc/framework/scope.h
+++ b/tensorflow/cc/framework/scope.h
@@ -23,16 +23,19 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/cc/framework/ops.h"
-#include "tensorflow/core/common_runtime/shape_refiner.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 
 namespace tensorflow {
 
+class Graph;
 class GraphDef;
 class NodeBuilder;
 struct CompositeOpScopes;
 
+/// @addtogroup core
+/// @{
+
 /// A `Scope` object represents a set of related TensorFlow ops that have the
 /// same properties such as a common name prefix.
 ///
@@ -91,6 +94,10 @@ struct CompositeOpScopes;
 /// op-constructor functions on the same `Scope` object.
 class Scope {
  public:
+  Scope(const Scope& other);
+  ~Scope();
+  Scope& operator=(const Scope& other);
+
   // The following functions are for users making graphs. They return brand new
   // scopes, or scopes derived from an existing scope object.
 
@@ -161,20 +168,21 @@ class Scope {
 
   // START_SKIP_DOXYGEN
   /// Update the builder with properties accumulated in this scope.
+  // TODO(skyewm): NodeBuilder is not part of public API
   void UpdateBuilder(NodeBuilder* builder) const;
   // END_SKIP_DOXYGEN
 
   CompositeOpScopes GetCompositeOpScopes(const string& composite_op_name) const;
 
-  bool ok() const { return status_->ok(); }
+  bool ok() const;
 
-  Graph* graph() const { return graph_.get(); }
+  // TODO(skyewm): Graph is not part of public API
+  Graph* graph() const;
 
-  ShapeRefiner* refiner() const { return refiner_.get(); }
+  // TODO(skyewm): Graph is not part of public API
+  std::shared_ptr<Graph> graph_as_shared_ptr() const;
 
-  std::shared_ptr<Graph> graph_as_shared_ptr() const { return graph_; }
-
-  Status status() const { return *status_; }
+  Status status() const;
 
   /// If status() is Status::OK(), convert the Graph object stored in this scope
   /// to a GraphDef proto and return Status::OK(). Otherwise, return the error
@@ -193,74 +201,15 @@ class Scope {
   Status ToGraph(Graph* g) const;
   // END_SKIP_DOXYGEN
 
-  const std::vector<Operation>& control_deps() const { return control_deps_; }
+  const std::vector<Operation>& control_deps() const;
 
  private:
-  // Tag types to choose the constructor to dispatch.
-  struct Tags {
-    enum class ScopeName;
-    enum class OpName;
-    enum class ControlDeps;
-    enum class Device;
-    enum class SingleUseScope;
-    enum class ExitOnError;
-    enum class KernelLabel;
-    enum class Colocate;
-  };
-
-  // A NameMap is used to keep track of suffixes for names used in a scope. A
-  // name that has not been used so far in a scope will get no suffix. Later
-  // uses of the same name will get suffixes _1, _2, _3, etc. Multiple scopes
-  // can share the same NameMap. For instance, a new scope created using
-  // WithControlDependencies() would share the same NameMap with the parent.
-  typedef std::unordered_map<string, int> NameMap;
-
-  Scope(Graph* graph, Status* status, NameMap* name_map, ShapeRefiner* refiner);
-  Scope(const Scope& other, Tags::ScopeName, const string& name,
-        bool copy_names);
-  Scope(const Scope& other, Tags::OpName, const string& name,
-        const string& op_name);
-  Scope(const Scope& other, Tags::ControlDeps,
-        std::vector<Operation> control_deps, bool clear_control_deps);
-  Scope(const Scope& other, Tags::Device, const string& device);
-  Scope(const Scope& other, Tags::SingleUseScope, const string& op_name);
-  Scope(const Scope& other, Tags::ExitOnError);
-  Scope(const Scope& other, Tags::KernelLabel, const string& kernel_label);
-  Scope(const Scope& other, Tags::Colocate, const Operation& colocate_with_op,
-        bool clear_colocations);
-
-  std::unordered_set<string> GetColocationConstraints(
-      const Operation& colocate_with_op) const;
-
-  // Helper functions to get unique names.
-  string GetUniqueName(const string& prefix, bool check_single_use) const;
-  string GetNameForOp(const string& default_name) const;
-
-  bool single_use_scope() const { return scope_used_ != nullptr; }
-
-  // The graph, status, and name maps are shared by all child scopes
-  // created from a single 'root' scope. A root scope is created by calling the
-  // Scope::NewRootScope function, which creates a new graph, a new status and
-  // the name maps.
-  std::shared_ptr<Graph> graph_ = nullptr;
-  std::shared_ptr<Status> status_ = nullptr;
-  std::shared_ptr<NameMap> name_map_ = nullptr;
-  std::shared_ptr<ShapeRefiner> refiner_ = nullptr;
-
-  // If scope_used_ is not nullptr, op_name_ should be empty and
-  // GetUniqueNameForOp can only be called once on this scope. More calls to
-  // GetUniqueNameForOp will cause an error status to be set on this scope.
-  std::shared_ptr<bool> scope_used_ = nullptr;
-
-  const std::vector<Operation> control_deps_;
-
-  const string name_ = "";
-  const string op_name_ = "";
-  const bool exit_on_error_ = false;
-  const string kernel_label_ = "";
-  const string device_ = "";
-  const std::unordered_set<string> colocation_constraints_;
+  friend class InternalScope;
+  class Impl;
+  std::unique_ptr<Impl> impl_;
+  Impl* impl() { return impl_.get(); }
+  const Impl* impl() const { return impl_.get(); }
+  explicit Scope(Impl*);
 };
 
 /// A helper struct to hold the scopes that would be used by a function
@@ -273,6 +222,8 @@ struct CompositeOpScopes {
   Scope last;
 };
 
+/// @}
+
 }  // namespace tensorflow
 
 #endif  // THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_SCOPE_H_
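The header rewrite above is a conventional pimpl conversion. A condensed,
self-contained sketch of the idiom (hypothetical `Foo` class, assuming a
copyable `Impl`, mirroring the copy constructor / assignment / out-of-line
destructor pattern that `Scope` now uses):

```cpp
#include <memory>

// In the header: only the public surface; Impl stays an incomplete type.
class Foo {
 public:
  Foo();
  Foo(const Foo& other);
  Foo& operator=(const Foo& other);
  ~Foo();  // must be defined where Impl is a complete type
  int value() const;

 private:
  class Impl;
  std::unique_ptr<Impl> impl_;
};

// In the .cc file: the implementation details.
class Foo::Impl {
 public:
  int value = 0;
};

Foo::Foo() : impl_(new Impl) {}
Foo::Foo(const Foo& other) : impl_(new Impl(*other.impl_)) {}
Foo& Foo::operator=(const Foo& other) {
  impl_.reset(new Impl(*other.impl_));  // deep copy, as Scope does
  return *this;
}
Foo::~Foo() {}
int Foo::value() const { return impl_->value; }
```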
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_SCOPE_INTERNAL_H_
+#define THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_SCOPE_INTERNAL_H_
+
+#include "tensorflow/cc/framework/scope.h"
+
+namespace tensorflow {
+
+class ShapeRefiner;
+
+// NewInternalScope returns a new scope which doesn't take ownership of
+// graph, status, name_map, and refiner.
+// This is intended to enable the C API (which is used by other language
+// bindings) to create a Scope and access C++ functionality (i.e. gradients).
+Scope NewInternalScope(Graph* graph, Status* status, ShapeRefiner* refiner);
+
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_SCOPE_INTERNAL_H_
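A usage sketch of `NewInternalScope` (editor's illustration, not part of the
patch; the variable names are invented, and the `ShapeRefiner` construction
mirrors `Scope::NewRootScope` above):

```cpp
// The caller retains ownership of all three objects and must keep them
// alive for as long as the returned Scope (and any scopes derived from
// it) are in use.
Graph graph(OpRegistry::Global());
Status status;
ShapeRefiner refiner(graph.versions().producer(), graph.op_registry());
Scope scope = NewInternalScope(&graph, &status, &refiner);
// Ops built through `scope` record failures in `status`; destroying the
// scope does not destroy graph, status, or refiner.
```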
diff --git a/tensorflow/cc/framework/testutil.cc b/tensorflow/cc/framework/testutil.cc
index b0746913a16..ca78f31db51 100644
--- a/tensorflow/cc/framework/testutil.cc
+++ b/tensorflow/cc/framework/testutil.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/cc/framework/testutil.h"
 
+#include <utility>
+
 #include "tensorflow/cc/client/client_session.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/graph/default_device.h"
@@ -30,7 +32,7 @@ void GetTensors(const Scope& scope, OutputList tensors,
 
 void GetTensor(const Scope& scope, Output tensor, Tensor* out) {
   std::vector<Tensor> outputs;
-  GetTensors(scope, {tensor}, &outputs);
+  GetTensors(scope, {std::move(tensor)}, &outputs);
   *out = outputs[0];
 }
diff --git a/tensorflow/cc/gradients/README.md b/tensorflow/cc/gradients/README.md
new file mode 100644
index 00000000000..3253163cc73
--- /dev/null
+++ b/tensorflow/cc/gradients/README.md
@@ -0,0 +1,52 @@
+# C++ gradients
+
+Gradients are currently being ported from
+[python](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/python/ops)
+to C++ (in this directory).
+
+Contributions are welcome and much appreciated; please follow the instructions
+below.
+
+1.  Create the op gradient function in `foo_grad.cc` corresponding to the
+    `foo_grad.py` file where the op originated (e.g., `array_grad.py` op
+    gradients should be written in `array_grad.cc`).
+
+2.  Write the op gradient with the following naming scheme:
+
+        Status OpNameGrad(const Scope& scope, const Operation& op,
+                          const std::vector<Output>& grad_inputs,
+                          std::vector<Output>* grad_outputs) {
+          ...
+          return scope.status();
+        }
+        REGISTER_GRADIENT_OP("OpName", OpNameGrad);
+
+3.  Op gradients are implemented using the [C++
+    API](https://www.tensorflow.org/api_docs/cc/).
+
+4.  Tests should be included in `foo_grad_test.cc`. Please see
+    [`array_grad_test.cc`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/cc/gradients/array_grad_test.cc)
+    for many examples. Tests are as simple as creating a placeholder input
+    for the op's inputs and calling `RunTest` (`RunTest` uses a [gradient
+    checker](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/cc/framework/gradient_checker.cc)
+    to verify that the theoretical gradient matches the numeric gradient). For
+    example:
+
+        TEST_F(ArrayGradTest, IdentityGrad) {
+          TensorShape shape({5, 2});
+          auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
+          auto y = Identity(scope_, x);
+          RunTest(x, shape, y, shape);
+        }
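+A complete registered gradient, mirroring the existing `Neg` gradient in
+[`math_grad.cc`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/cc/gradients/math_grad.cc),
+ties the steps above together:
+
+    Status NegGrad(const Scope& scope, const Operation& op,
+                   const std::vector<Output>& grad_inputs,
+                   std::vector<Output>* grad_outputs) {
+      // dy/dx = -1, so grad(x) = -grad(y).
+      grad_outputs->push_back(Neg(scope, grad_inputs[0]));
+      return scope.status();
+    }
+    REGISTER_GRADIENT_OP("Neg", NegGrad);
+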
", + message); + grad_outputs->push_back(CheckNumerics(scope, grad_inputs[0], err_msg)); return scope.status(); } REGISTER_GRADIENT_OP("CheckNumerics", CheckNumericsGrad); @@ -201,9 +215,9 @@ Status ReverseSequenceGrad(const Scope& scope, const Operation& op, std::vector* grad_outputs) { auto seq_lengths = op.input(1); int batch_dim; - TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->def(), "batch_dim", &batch_dim)); + TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "batch_dim", &batch_dim)); int seq_dim; - TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->def(), "seq_dim", &seq_dim)); + TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "seq_dim", &seq_dim)); grad_outputs->push_back( ReverseSequence(scope, grad_inputs[0], seq_lengths, seq_dim, ReverseSequence::BatchDim(batch_dim))); @@ -253,7 +267,8 @@ Status SpaceToBatchGrad(const Scope& scope, const Operation& op, const std::vector& grad_inputs, std::vector* grad_outputs) { int block_size; - TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->def(), "block_size", &block_size)); + TF_RETURN_IF_ERROR( + GetNodeAttr(op.node()->attrs(), "block_size", &block_size)); grad_outputs->push_back( BatchToSpace(scope, grad_inputs[0], op.input(1), block_size)); grad_outputs->push_back(NoGradient()); @@ -276,7 +291,8 @@ Status BatchToSpaceGrad(const Scope& scope, const Operation& op, const std::vector& grad_inputs, std::vector* grad_outputs) { int block_size; - TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->def(), "block_size", &block_size)); + TF_RETURN_IF_ERROR( + GetNodeAttr(op.node()->attrs(), "block_size", &block_size)); grad_outputs->push_back( SpaceToBatch(scope, grad_inputs[0], op.input(1), block_size)); grad_outputs->push_back(NoGradient()); @@ -299,7 +315,8 @@ Status SpaceToDepthGrad(const Scope& scope, const Operation& op, const std::vector& grad_inputs, std::vector* grad_outputs) { int block_size; - TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->def(), "block_size", &block_size)); + TF_RETURN_IF_ERROR( + GetNodeAttr(op.node()->attrs(), "block_size", &block_size)); grad_outputs->push_back(DepthToSpace(scope, grad_inputs[0], block_size)); return scope.status(); } @@ -309,7 +326,8 @@ Status DepthToSpaceGrad(const Scope& scope, const Operation& op, const std::vector& grad_inputs, std::vector* grad_outputs) { int block_size; - TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->def(), "block_size", &block_size)); + TF_RETURN_IF_ERROR( + GetNodeAttr(op.node()->attrs(), "block_size", &block_size)); grad_outputs->push_back(SpaceToDepth(scope, grad_inputs[0], block_size)); return scope.status(); } @@ -319,7 +337,7 @@ Status MirrorPadGrad(const Scope& scope, const Operation& op, const std::vector& grad_inputs, std::vector* grad_outputs) { string mode; - TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->def(), "mode", &mode)); + TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "mode", &mode)); grad_outputs->push_back(tensorflow::ops::internal::MirrorPadGrad( scope, grad_inputs[0], op.input(1), mode)); grad_outputs->push_back(NoGradient()); @@ -332,7 +350,7 @@ Status MirrorPadGradGrad(const Scope& scope, const Operation& op, const std::vector& grad_inputs, std::vector* grad_outputs) { string mode; - TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->def(), "mode", &mode)); + TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "mode", &mode)); grad_outputs->push_back(MirrorPad(scope, grad_inputs[0], op.input(1), mode)); grad_outputs->push_back(NoGradient()); return scope.status(); diff --git a/tensorflow/cc/gradients/math_grad.cc b/tensorflow/cc/gradients/math_grad.cc index aff06531395..71d9a8ed7be 100644 --- 
diff --git a/tensorflow/cc/gradients/math_grad.cc b/tensorflow/cc/gradients/math_grad.cc
index aff06531395..71d9a8ed7be 100644
--- a/tensorflow/cc/gradients/math_grad.cc
+++ b/tensorflow/cc/gradients/math_grad.cc
@@ -21,6 +21,17 @@ limitations under the License.
 namespace tensorflow {
 namespace ops {
 namespace {
 
+// Returns the conjugate of `out` if it is complex valued; otherwise returns
+// `out` unchanged.
+Output ConjugateHelper(const Scope& scope, const Output& out) {
+  DataType dtype = out.type();
+  if (dtype == DT_COMPLEX64 || dtype == DT_COMPLEX128) {
+    return Conj(scope, out);
+  } else {
+    return out;
+  }
+}
+
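+// Gradient convention note (editor's commentary, restating the comments used
+// throughout this file): for y = f(x), each gradient function computes
+//   grad(x) = grad(y) * conj(dy/dx)
+// Conj is the identity for real dtypes, so ConjugateHelper only inserts a
+// Conj op for DT_COMPLEX64/DT_COMPLEX128 and this reduces to the ordinary
+// chain rule everywhere else.
+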
 // TODO(andydavis) Add control dependencies to gradient functions (as needed).
 
 Status AbsGrad(const Scope& scope, const Operation& op,
@@ -44,9 +55,11 @@ REGISTER_GRADIENT_OP("Neg", NegGrad);
 Status InvGrad(const Scope& scope, const Operation& op,
                const std::vector<Output>& grad_inputs,
                std::vector<Output>* grad_outputs) {
-  // dx = dy * (-1 * (y * y))
+  // dy/dx = -1/x^2 = -y^2
+  auto dydx = Neg(scope, Square(scope, op.output(0)));
+  // grad(x) = grad(y) * conj(dy/dx)
   grad_outputs->push_back(
-      Mul(scope, grad_inputs[0], Neg(scope, Square(scope, op.output(0)))));
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
   return scope.status();
 }
 REGISTER_GRADIENT_OP("Inv", InvGrad);
@@ -55,10 +68,12 @@ REGISTER_GRADIENT_OP("Reciprocal", InvGrad);
 Status SquareGrad(const Scope& scope, const Operation& op,
                   const std::vector<Output>& grad_inputs,
                   std::vector<Output>* grad_outputs) {
-  // dx = dy * (2 * x)
+  // dy/dx = (2 * x)
   auto two = Cast(scope, Const(scope, 2), op.input(0).type());
+  auto dydx = Mul(scope, two, op.input(0));
+  // grad(x) = grad(y) * conj(dy/dx)
   grad_outputs->push_back(
-      Mul(scope, grad_inputs[0], Mul(scope, two, op.input(0))));
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
   return scope.status();
 }
 REGISTER_GRADIENT_OP("Square", SquareGrad);
@@ -68,11 +83,12 @@ Status SqrtGrad(const Scope& scope, const Operation& op,
                 std::vector<Output>* grad_outputs) {
   // y = sqrt(x)
   // dy/dx = 0.5 * (1 / sqrt(x)) = 0.5 * (1 / y)
-  // dx = dy * (0.5 * (1 / y))
   auto y_inv = Reciprocal(scope, op.output(0));
   auto half = Cast(scope, Const(scope, 0.5), op.input(0).type());
-  auto dx = Mul(scope, grad_inputs[0], Mul(scope, half, y_inv));
-  grad_outputs->push_back(dx);
+  auto dydx = Mul(scope, half, y_inv);
+  // grad(x) = grad(y) * conj(dy/dx)
+  grad_outputs->push_back(
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
   return scope.status();
 }
 REGISTER_GRADIENT_OP("Sqrt", SqrtGrad);
@@ -82,14 +98,14 @@ Status RsqrtGrad(const Scope& scope, const Operation& op,
                  std::vector<Output>* grad_outputs) {
   // y = 1/x^1/2 = x^-1/2
   // dy/dx = -1/2 * x^-3/2 = -1/2 * x^-1/2 * x^-1 = -1/2 * y * x^-1
-  // dx = dy * (-1/2 * y * x^-1)
   auto x_inv = Reciprocal(scope, op.input(0));
   auto y = op.output(0);
   auto neghalf = Cast(scope, Const(scope, -0.5), op.input(0).type());
   auto a = Mul(scope, neghalf, x_inv);
-  auto b = Mul(scope, a, y);
-  auto dx = Mul(scope, grad_inputs[0], b);
-  grad_outputs->push_back(dx);
+  auto dydx = Mul(scope, a, y);
+  // grad(x) = grad(y) * conj(dy/dx)
+  grad_outputs->push_back(
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
   return scope.status();
 }
 REGISTER_GRADIENT_OP("Rsqrt", RsqrtGrad);
@@ -97,10 +113,11 @@ REGISTER_GRADIENT_OP("Rsqrt", RsqrtGrad);
 Status ExpGrad(const Scope& scope, const Operation& op,
                const std::vector<Output>& grad_inputs,
                std::vector<Output>* grad_outputs) {
-  // y = exp(x)
-  // dy/dx = exp(x)
-  // dx = dy * y
-  grad_outputs->push_back(Mul(scope, grad_inputs[0], op.output(0)));
+  // dy/dx = exp(x) = y
+  // grad(x) = grad(y) * conj(dy/dx)
+  //         = grad(y) * conj(y)
+  grad_outputs->push_back(
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, op.output(0))));
   return scope.status();
 }
 REGISTER_GRADIENT_OP("Exp", ExpGrad);
@@ -108,10 +125,12 @@ REGISTER_GRADIENT_OP("Exp", ExpGrad);
 Status Expm1Grad(const Scope& scope, const Operation& op,
                  const std::vector<Output>& grad_inputs,
                  std::vector<Output>* grad_outputs) {
-  // f(x) = expm1(x)
-  // df/dx = exp(x)
-  // dx = dy * exp(x)
-  grad_outputs->push_back(Mul(scope, grad_inputs[0], Exp(scope, op.input(0))));
+  // y = expm1(x)
+  // dy/dx = exp(x)
+  auto dydx = Exp(scope, op.input(0));
+  // grad(x) = grad(y) * conj(dy/dx)
+  grad_outputs->push_back(
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
   return scope.status();
 }
 REGISTER_GRADIENT_OP("Expm1", Expm1Grad);
@@ -119,11 +138,12 @@ REGISTER_GRADIENT_OP("Expm1", Expm1Grad);
 Status LogGrad(const Scope& scope, const Operation& op,
                const std::vector<Output>& grad_inputs,
                std::vector<Output>* grad_outputs) {
-  // f(x) = log(x) = y
-  // df/dx = 1 / x
-  // dx = dy * (1 / x)
+  // y = log(x)
+  // dy/dx = 1 / x
+  auto dydx = Reciprocal(scope, op.input(0));
+  // grad(x) = grad(y) * conj(dy/dx)
   grad_outputs->push_back(
-      Mul(scope, grad_inputs[0], Reciprocal(scope, op.input(0))));
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
   return scope.status();
 }
 REGISTER_GRADIENT_OP("Log", LogGrad);
@@ -131,26 +151,54 @@ REGISTER_GRADIENT_OP("Log", LogGrad);
 Status Log1pGrad(const Scope& scope, const Operation& op,
                  const std::vector<Output>& grad_inputs,
                  std::vector<Output>* grad_outputs) {
-  // f(x) = log1p(x) = y
-  // df/dx = 1 / (1 + x)
-  // dx = dy * (1 / (1 + x))
+  // y = log1p(x)
+  // dy/dx = 1 / (1 + x)
   auto one = Cast(scope, Const(scope, 1.0), op.input(0).type());
+  auto dydx = Reciprocal(scope, Add(scope, one, op.input(0)));
+  // grad(x) = grad(y) * conj(dy/dx)
   grad_outputs->push_back(
-      Div(scope, grad_inputs[0], Add(scope, one, op.input(0))));
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
   return scope.status();
 }
 REGISTER_GRADIENT_OP("Log1p", Log1pGrad);
 
+Status SinhGrad(const Scope& scope, const Operation& op,
+                const std::vector<Output>& grad_inputs,
+                std::vector<Output>* grad_outputs) {
+  // y = sinh(x)
+  // dy/dx = cosh(x)
+  auto dydx = Cosh(scope, op.input(0));
+  // grad(x) = grad(y) * conj(dy/dx)
+  grad_outputs->push_back(
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
+  return scope.status();
+}
+REGISTER_GRADIENT_OP("Sinh", SinhGrad);
+
+Status CoshGrad(const Scope& scope, const Operation& op,
+                const std::vector<Output>& grad_inputs,
+                std::vector<Output>* grad_outputs) {
+  // y = cosh(x)
+  // dy/dx = sinh(x)
+  auto dydx = Sinh(scope, op.input(0));
+  // grad(x) = grad(y) * conj(dy/dx)
+  grad_outputs->push_back(
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
+  return scope.status();
+}
+REGISTER_GRADIENT_OP("Cosh", CoshGrad);
+
 Status TanhGrad(const Scope& scope, const Operation& op,
                 const std::vector<Output>& grad_inputs,
                 std::vector<Output>* grad_outputs) {
   // y = tanh(x)
   // dy/dx = 1 - (tanh(x))^2 = 1 - y^2
-  // dx = dy * (1 - y^2)
   auto y2 = Square(scope, op.output(0));
   auto one = Cast(scope, Const(scope, 1.0), op.input(0).type());
-  auto dx = Mul(scope, grad_inputs[0], Sub(scope, one, y2));
-  grad_outputs->push_back(dx);
+  auto dydx = Sub(scope, one, y2);
+  // grad(x) = grad(y) * conj(dy/dx)
+  grad_outputs->push_back(
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
   return scope.status();
 }
 REGISTER_GRADIENT_OP("Tanh", TanhGrad);
@@ -160,11 +208,13 @@ Status SigmoidGrad(const Scope& scope, const Operation& op,
                    std::vector<Output>* grad_outputs) {
   // y = 1 / (1 + exp(-x))
   // dy/dx = y * (1 - y)
-  // dx = dy * y * (1 - y)
   auto y = op.output(0);
   auto one = Cast(scope, Const(scope, 1.0), op.input(0).type());
-  auto dx = Mul(scope, grad_inputs[0], Mul(scope, y, Sub(scope, one, y)));
-  grad_outputs->push_back(dx);
+  auto dydx = Mul(scope, y, Sub(scope, one, y));
+  // grad(x) = grad(y) * conj(dy/dx)
+  grad_outputs->push_back(
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
   return scope.status();
 }
 REGISTER_GRADIENT_OP("Sigmoid", SigmoidGrad);
@@ -185,9 +235,10 @@ Status SinGrad(const Scope& scope, const Operation& op,
                std::vector<Output>* grad_outputs) {
   // y = sin(x)
   // dy/dx = cos(x)
-  // dx = dy * cos(x)
-  auto dx = Mul(scope, grad_inputs[0], Cos(scope, op.input(0)));
-  grad_outputs->push_back(dx);
+  auto dydx = Cos(scope, op.input(0));
+  // grad(x) = grad(y) * conj(dy/dx)
+  grad_outputs->push_back(
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
   return scope.status();
 }
 REGISTER_GRADIENT_OP("Sin", SinGrad);
@@ -197,9 +248,10 @@ Status CosGrad(const Scope& scope, const Operation& op,
                std::vector<Output>* grad_outputs) {
   // y = cos(x)
   // dy/dx = -sin(x)
-  // dx = dy * -sin(x)
-  auto dx = Mul(scope, grad_inputs[0], Neg(scope, Sin(scope, op.input(0))));
-  grad_outputs->push_back(dx);
+  auto dydx = Neg(scope, Sin(scope, op.input(0)));
+  // grad(x) = grad(y) * conj(dy/dx)
+  grad_outputs->push_back(
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
   return scope.status();
 }
 REGISTER_GRADIENT_OP("Cos", CosGrad);
@@ -208,12 +260,12 @@ Status AsinGrad(const Scope& scope, const Operation& op,
                 const std::vector<Output>& grad_inputs,
                 std::vector<Output>* grad_outputs) {
   // y = asin(x)
-  // dy/dx = 1 / (1 - x * x)^1/2
-  // dx = dy * (1 / (1 - x * x)^1/2)
+  // dy/dx = 1 / sqrt(1 - x^2)
   auto x2 = Square(scope, op.input(0));
   auto one = Cast(scope, Const(scope, 1.0), op.input(0).type());
   auto dydx = Reciprocal(scope, Sqrt(scope, Sub(scope, one, x2)));
-  auto dx = Mul(scope, grad_inputs[0], dydx);
+  // grad(x) = grad(y) * conj(dy/dx)
+  auto dx = Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx));
   grad_outputs->push_back(dx);
   return scope.status();
 }
@@ -239,9 +291,9 @@ Status TanGrad(const Scope& scope, const Operation& op,
                std::vector<Output>* grad_outputs) {
   // y = tan(x)
   // dy/dx = sec(x)^2 = 1 / cos(x)^2
-  // dx = dy * (1 / cos(x)^2)
   auto dydx = Square(scope, Reciprocal(scope, Cos(scope, op.input(0))));
-  auto dx = Mul(scope, grad_inputs[0], dydx);
+  // grad(x) = grad(y) * conj(dy/dx)
+  auto dx = Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx));
   grad_outputs->push_back(dx);
   return scope.status();
 }
@@ -324,7 +376,7 @@ Status MatMulGradCommon(const Scope& scope, const Operation& op,
                         const string& attr_adj_x, const string& attr_adj_y,
                         std::vector<Output>* grad_outputs) {
   DataType dtype;
-  TF_RETURN_IF_ERROR(GetNodeAttr(op.output(0).node()->def(), "T", &dtype));
+  TF_RETURN_IF_ERROR(GetNodeAttr(op.output(0).node()->attrs(), "T", &dtype));
   if (dtype == DT_COMPLEX64 || dtype == DT_COMPLEX128) {
     return errors::Unimplemented(
         "MatMul gradient for complex data type is not supported yet.");
@@ -332,8 +384,10 @@ Status MatMulGradCommon(const Scope& scope, const Operation& op,
   bool ta;
   bool tb;
-  TF_RETURN_IF_ERROR(GetNodeAttr(op.output(0).node()->def(), attr_adj_x, &ta));
-  TF_RETURN_IF_ERROR(GetNodeAttr(op.output(0).node()->def(), attr_adj_y, &tb));
+  TF_RETURN_IF_ERROR(
+      GetNodeAttr(op.output(0).node()->attrs(), attr_adj_x, &ta));
+  TF_RETURN_IF_ERROR(
+      GetNodeAttr(op.output(0).node()->attrs(), attr_adj_y, &tb));
 
   if (!ta && !tb) {
     return MatMulGradHelper(scope, is_batch, grad_inputs[0], false, op.input(1),
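The `dy/dx` comments in the gradient functions above can be spot-checked
numerically; a standalone sketch (plain C++, no TensorFlow dependency,
editor's illustration) for the derivative used in `RsqrtGrad`:

```cpp
#include <cmath>
#include <cstdio>

int main() {
  // y = x^(-1/2); analytically dy/dx = -0.5 * x^(-3/2).
  const double x = 2.0;
  const double h = 1e-6;
  const double numeric =
      (1.0 / std::sqrt(x + h) - 1.0 / std::sqrt(x - h)) / (2.0 * h);
  const double analytic = -0.5 * std::pow(x, -1.5);
  std::printf("numeric=%.9f analytic=%.9f\n", numeric, analytic);
  return 0;
}
```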
diff --git a/tensorflow/cc/gradients/math_grad_test.cc b/tensorflow/cc/gradients/math_grad_test.cc
index d7278929d46..1653b04378f 100644
--- a/tensorflow/cc/gradients/math_grad_test.cc
+++ b/tensorflow/cc/gradients/math_grad_test.cc
@@ -45,6 +45,8 @@ class CWiseUnaryGradTest : public ::testing::Test {
     EXPM1,
     LOG,
     LOG1P,
+    SINH,
+    COSH,
     TANH,
     SIGMOID,
     SIGN,
@@ -56,23 +58,25 @@ class CWiseUnaryGradTest : public ::testing::Test {
     ATAN
   };
 
-  void TestCWiseGrad(UnaryOpType op_type, std::function<float(int)> x_fn,
-                     std::function<float(float)> dy_fn,
-                     std::function<float(float, float)> dx_fn) {
-    Tensor x(DT_FLOAT, {2, 3, 2});
-    auto x_flat = x.flat<float>();
+  template <typename T>
+  void TestCWiseGrad(UnaryOpType op_type, const std::function<T(int)>& x_fn,
+                     const std::function<T(const T&)>& dy_fn,
+                     const std::function<T(const T&, const T&)>& dx_fn) {
+    DataType dtype = DataTypeToEnum<T>::v();
+    Tensor x(dtype, {2, 3, 2});
+    auto x_flat = x.flat<T>();
     for (int i = 0; i < x_flat.size(); ++i) {
       x_flat(i) = x_fn(i);
     }
-    Tensor dy(DT_FLOAT, {2, 3, 2});
-    auto dy_flat = dy.flat<float>();
+    Tensor dy(dtype, {2, 3, 2});
+    auto dy_flat = dy.flat<T>();
     for (int i = 0; i < dy_flat.size(); ++i) {
       dy_flat(i) = dy_fn(x_flat(i));
     }
-    Tensor dx(DT_FLOAT, {2, 3, 2});
-    auto dx_flat = dx.flat<float>();
+    Tensor dx(dtype, {2, 3, 2});
+    auto dx_flat = dx.flat<T>();
     for (int i = 0; i < dx_flat.size(); ++i) {
       dx_flat(i) = dx_fn(x_flat(i), dy_flat(i));
     }
@@ -109,6 +113,12 @@ class CWiseUnaryGradTest : public ::testing::Test {
       case LOG1P:
         y = Log1p(scope_, x);
         break;
+      case SINH:
+        y = Sinh(scope_, x);
+        break;
+      case COSH:
+        y = Cosh(scope_, x);
+        break;
       case TANH:
         y = Tanh(scope_, x);
         break;
@@ -146,7 +156,19 @@ class CWiseUnaryGradTest : public ::testing::Test {
     test::ExpectClose(output, dx);
   }
 
-  float RV(std::vector<float> v) { return v[random::New64() % v.size()]; }
+  float RV(const std::vector<float>& v) {
+    return v[random::New64() % v.size()];
+  }
+
+  complex64 CRV(const std::vector<complex64>& v) {
+    return v[random::New64() % v.size()];
+  }
+
+  complex64 conjugate(const complex64& val) {
+    return complex64(val.real(), -val.imag());
+  }
+
+  const complex64 one_{1.0, 0};
 
   Scope scope_;
 };
@@ -155,14 +177,14 @@ TEST_F(CWiseUnaryGradTest, Abs) {
   auto x_fn = [this](const int i) { return RV({-1, 0, 1}); };
   auto dy_fn = [this](const float x) { return x + RV({-2, 2, -3, 3, -4, 4}); };
   auto dx_fn = [this](const float x, const float dy) { return x * dy; };
-  TestCWiseGrad(ABS, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(ABS, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Neg) {
   auto x_fn = [this](const int i) { return RV({-1, 0, 1}); };
   auto dy_fn = [this](const float x) { return x + RV({-2, 2, -3, 3, -4, 4}); };
   auto dx_fn = [this](const float x, const float dy) { return -dy; };
-  TestCWiseGrad(NEG, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(NEG, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Reciprocal) {
@@ -171,14 +193,36 @@ TEST_F(CWiseUnaryGradTest, Reciprocal) {
   auto dx_fn = [this](const float x, const float dy) {
     return -(1 / (x * x)) * dy;
   };
-  TestCWiseGrad(INV, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(INV, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Reciprocal_Complex) {
+  auto x_fn = [this](const int i) { return CRV({{-1, 0}, {1, 0}, {2, -1}}); };
+  auto dy_fn = [this](const complex64 x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64 x, const complex64 dy) {
+    return -conjugate(one_ / (x * x)) * dy;
+  };
+  TestCWiseGrad<complex64>(INV, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Square) {
   auto x_fn = [this](const int i) { return RV({0, -1, 1, -2, 2, -3, 3}); };
   auto dy_fn = [this](const float x) { return RV({0, -7, 7, -8, 8, -9, 9}); };
   auto dx_fn = [this](const float x, const float dy) { return 2 * x * dy; };
-  TestCWiseGrad(SQUARE, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(SQUARE, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Square_Complex) {
+  auto x_fn = [this](const int i) { return CRV({{-1, 0}, {1, 0}, {2, -1}}); };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    return conjugate(complex64(2, 0) * x) * dy;
+  };
+  TestCWiseGrad<complex64>(SQUARE, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Sqrt) {
@@ -187,7 +231,18 @@ TEST_F(CWiseUnaryGradTest, Sqrt) {
   auto dx_fn = [this](const float x, const float dy) {
     return dy * 0.5 * (1.0 / std::sqrt(x));
   };
-  TestCWiseGrad(SQRT, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(SQRT, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Sqrt_Complex) {
+  auto x_fn = [this](const int i) { return CRV({{-1, 0}, {1, 0}, {2, -1}}); };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    return conjugate(complex64(0.5, 0) / std::sqrt(x)) * dy;
+  };
+  TestCWiseGrad<complex64>(SQRT, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Rsqrt) {
@@ -196,7 +251,18 @@ TEST_F(CWiseUnaryGradTest, Rsqrt) {
   auto dx_fn = [this](const float x, const float dy) {
     return dy * -0.5 * (1 / std::sqrt(x)) * (1 / x);
   };
-  TestCWiseGrad(RSQRT, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(RSQRT, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Rsqrt_Complex) {
+  auto x_fn = [this](const int i) { return CRV({{-1, 0}, {1, 0}, {2, -1}}); };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    return conjugate(complex64(-0.5, 0) / std::sqrt(x) / x) * dy;
+  };
+  TestCWiseGrad<complex64>(RSQRT, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Exp) {
@@ -205,7 +271,18 @@ TEST_F(CWiseUnaryGradTest, Exp) {
   auto dx_fn = [this](const float x, const float dy) {
     return dy * std::exp(x);
   };
-  TestCWiseGrad(EXP, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(EXP, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Exp_Complex) {
+  auto x_fn = [this](const int i) { return CRV({{-1, 0}, {1, 0}, {2, -1}}); };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    return dy * conjugate(std::exp(x));
+  };
+  TestCWiseGrad<complex64>(EXP, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Expm1) {
@@ -214,14 +291,36 @@ TEST_F(CWiseUnaryGradTest, Expm1) {
   auto dx_fn = [this](const float x, const float dy) {
     return dy * std::exp(x);
   };
-  TestCWiseGrad(EXPM1, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(EXPM1, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Expm1_Complex) {
+  auto x_fn = [this](const int i) { return CRV({{-1, 0}, {1, 0}, {2, -1}}); };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    return dy * conjugate(std::exp(x));
+  };
+  TestCWiseGrad<complex64>(EXPM1, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Log) {
   auto x_fn = [this](const int i) { return RV({-1, 1, -2, 2, -3, 3, -4, 4}); };
   auto dy_fn = [this](const float x) { return x + RV({-2, 2, -3, 3, -4, 4}); };
   auto dx_fn = [this](const float x, const float dy) { return dy * (1.0 / x); };
-  TestCWiseGrad(LOG, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(LOG, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Log_Complex) {
+  auto x_fn = [this](const int i) { return CRV({{-1, 0}, {1, 0}, {2, -1}}); };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    return dy * conjugate(one_ / x);
+  };
+  TestCWiseGrad<complex64>(LOG, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Log1p) {
@@ -230,7 +329,64 @@ TEST_F(CWiseUnaryGradTest, Log1p) {
   auto dx_fn = [this](const float x, const float dy) {
     return dy * (1.0 / (1.0 + x));
   };
-  TestCWiseGrad(LOG1P, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(LOG1P, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Log1p_Complex) {
+  auto x_fn = [this](const int i) {
+    return CRV({{0, 0}, {1e-6, 0}, {2, -1}, {1, 2}, {3, 4}});
+  };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    return dy / (one_ + conjugate(x));
+  };
+  TestCWiseGrad<complex64>(LOG1P, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Sinh) {
+  auto x_fn = [this](const int i) { return RV({0, -1, 1, -2, 2, -3, 3}); };
+  auto dy_fn = [this](const float x) { return x + RV({-2, 2, -3, 3, -4, 4}); };
+  auto dx_fn = [this](const float x, const float dy) {
+    return dy * std::cosh(x);
+  };
+  TestCWiseGrad<float>(SINH, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Sinh_Complex) {
+  auto x_fn = [this](const int i) {
+    return CRV({{1, 0}, {0, 1}, {2, -1}, {1, 2}, {3, 4}});
+  };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    return dy * conjugate(std::cosh(x));
+  };
+  TestCWiseGrad<complex64>(SINH, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Cosh) {
+  auto x_fn = [this](const int i) { return RV({0, -1, 1, -2, 2, -3, 3}); };
+  auto dy_fn = [this](const float x) { return x + RV({-2, 2, -3, 3, -4, 4}); };
+  auto dx_fn = [this](const float x, const float dy) {
+    return dy * std::sinh(x);
+  };
+  TestCWiseGrad<float>(COSH, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Cosh_Complex) {
+  auto x_fn = [this](const int i) {
+    return CRV({{1, 0}, {0, 1}, {2, -1}, {1, 2}, {3, 4}});
+  };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    return dy * conjugate(std::sinh(x));
+  };
+  TestCWiseGrad<complex64>(COSH, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Tanh) {
@@ -240,7 +396,21 @@ TEST_F(CWiseUnaryGradTest, Tanh) {
     const float y = std::tanh(x);
     return dy * (1.0 - y * y);
   };
-  TestCWiseGrad(TANH, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(TANH, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Tanh_Complex) {
+  auto x_fn = [this](const int i) {
+    return CRV({{1, 0}, {0, 1}, {2, -1}, {1, 2}, {3, 4}});
+  };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    const complex64 y = std::tanh(x);
+    return dy * conjugate((one_ - y * y));
+  };
+  TestCWiseGrad<complex64>(TANH, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Sigmoid) {
@@ -250,14 +420,28 @@ TEST_F(CWiseUnaryGradTest, Sigmoid) {
     const float y = 1.0 / (1.0 + std::exp(-x));
     return dy * y * (1.0 - y);
   };
-  TestCWiseGrad(SIGMOID, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(SIGMOID, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Sigmoid_Complex) {
+  auto x_fn = [this](const int i) {
+    return CRV({{1, 0}, {0, 0}, {2, -1}, {1, 2}, {3, 4}});
+  };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    const complex64 y = one_ / (one_ + std::exp(-x));
+    return dy * conjugate(y * (one_ - y));
+  };
+  TestCWiseGrad<complex64>(SIGMOID, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Sign) {
   auto x_fn = [this](const int i) { return RV({0, -1, 1, -2, 2, -3, 3}); };
   auto dy_fn = [this](const float x) { return x + RV({-2, 2, -3, 3, -4, 4}); };
   auto dx_fn = [this](const float x, const float dy) { return 0.0; };
-  TestCWiseGrad(SIGN, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(SIGN, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Sin) {
@@ -266,7 +450,20 @@ TEST_F(CWiseUnaryGradTest, Sin) {
   auto dx_fn = [this](const float x, const float dy) {
     return dy * std::cos(x);
   };
-  TestCWiseGrad(SIN, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(SIN, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Sin_Complex) {
+  auto x_fn = [this](const int i) {
+    return CRV({{1, 0}, {0, 1}, {2, -1}, {1, 2}, {3, 4}});
+  };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    return dy * conjugate(std::cos(x));
+  };
+  TestCWiseGrad<complex64>(SIN, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Cos) {
@@ -275,7 +472,20 @@ TEST_F(CWiseUnaryGradTest, Cos) {
   auto dx_fn = [this](const float x, const float dy) {
     return dy * -1.0 * std::sin(x);
  };
-  TestCWiseGrad(COS, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(COS, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Cos_Complex) {
+  auto x_fn = [this](const int i) {
+    return CRV({{1, 0}, {0, 1}, {2, -1}, {1, 2}, {3, 4}});
+  };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    return dy * conjugate(-std::sin(x));
+  };
+  TestCWiseGrad<complex64>(COS, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Asin) {
@@ -284,7 +494,24 @@ TEST_F(CWiseUnaryGradTest, Asin) {
   auto dx_fn = [this](const float x, const float dy) {
     return dy * (1.0 / std::sqrt(1.0 - x * x));
   };
-  TestCWiseGrad(ASIN, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(ASIN, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Asin_Complex) {
+  auto x_fn = [this](const int i) {
+    return CRV({{1, 0}, {0, 1}, {2, -1}, {1, 2}, {3, 4}});
+  };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    return dy / conjugate(std::sqrt(one_ - x * x));
+  };
+  // TODO(kbsriram)
+  // Enable test when the asin kernel supports complex numbers
+  if (false) {
+    TestCWiseGrad<complex64>(ASIN, x_fn, dy_fn, dx_fn);
+  }
 }
 
 TEST_F(CWiseUnaryGradTest, Acos) {
@@ -293,7 +520,24 @@ TEST_F(CWiseUnaryGradTest, Acos) {
   auto dx_fn = [this](const float x, const float dy) {
     return dy * (-1.0 / std::sqrt(1.0 - x * x));
   };
-  TestCWiseGrad(ACOS, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(ACOS, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Acos_Complex) {
+  auto x_fn = [this](const int i) {
+    return CRV({{1, 0}, {0, 1}, {2, -1}, {1, 2}, {3, 4}});
+  };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    return dy / -conjugate(std::sqrt(one_ - x * x));
+  };
+  // TODO(kbsriram)
+  // Add test when the acos kernel supports complex numbers
+  if (false) {
+    TestCWiseGrad<complex64>(ACOS, x_fn, dy_fn, dx_fn);
+  }
 }
TEST_F(CWiseUnaryGradTest, Tan) { @@ -303,7 +547,25 @@ TEST_F(CWiseUnaryGradTest, Tan) { const float cosx = std::cos(x); return dy * (1 / (cosx * cosx)); }; - TestCWiseGrad(TAN, x_fn, dy_fn, dx_fn); + TestCWiseGrad(TAN, x_fn, dy_fn, dx_fn); +} + +TEST_F(CWiseUnaryGradTest, Tan_Complex) { + auto x_fn = [this](const int i) { + return CRV({{1, 0}, {0, 1}, {2, -1}, {1, 2}, {3, 4}}); + }; + auto dy_fn = [this](const complex64& x) { + return x + CRV({{-2, 2}, {-3, 3}, {1, -4}}); + }; + auto dx_fn = [this](const complex64& x, const complex64& dy) { + const complex64 cosx = std::cos(x); + return dy / conjugate(cosx * cosx); + }; + // TODO(kbsriram) + // Enable when tan kernel supports complex inputs + if (false) { + TestCWiseGrad(TAN, x_fn, dy_fn, dx_fn); + } } TEST_F(CWiseUnaryGradTest, Atan) { @@ -312,7 +574,24 @@ TEST_F(CWiseUnaryGradTest, Atan) { auto dx_fn = [this](const float x, const float dy) { return dy * (1 / (1 + x * x)); }; - TestCWiseGrad(ATAN, x_fn, dy_fn, dx_fn); + TestCWiseGrad(ATAN, x_fn, dy_fn, dx_fn); +} + +TEST_F(CWiseUnaryGradTest, Atan_Complex) { + auto x_fn = [this](const int i) { + return CRV({{1, 0}, {0, 1}, {2, -1}, {1, 2}, {3, 4}}); + }; + auto dy_fn = [this](const complex64& x) { + return x + CRV({{-2, 2}, {-3, 3}, {1, -4}}); + }; + auto dx_fn = [this](const complex64& x, const complex64& dy) { + return dy / (one_ + x * x); + }; + // TODO(kbsriram) + // Add test when the atan kernel supports complex numbers + if (false) { + TestCWiseGrad(ATAN, x_fn, dy_fn, dx_fn); + } } class CWiseUnaryComplexGradTest : public ::testing::Test { diff --git a/tensorflow/cc/ops/const_op.h b/tensorflow/cc/ops/const_op.h index 8976a24edc6..e8cb6cf1dd1 100644 --- a/tensorflow/cc/ops/const_op.h +++ b/tensorflow/cc/ops/const_op.h @@ -23,6 +23,9 @@ limitations under the License. 
namespace tensorflow { namespace ops { +/// @defgroup const_op Const Op +/// @{ + Output Const(const Scope& scope, const Input::Initializer& val); NodeBuilder::NodeOut AsNodeOut(const Scope& scope, const Input& inp); @@ -70,6 +73,8 @@ Output Const(const Scope& scope, const std::initializer_list<T>& v, std::vector<NodeBuilder::NodeOut> AsNodeOutList(const Scope& scope, const InputList& inp); +/// @} + } // namespace ops } // namespace tensorflow diff --git a/tensorflow/cc/ops/const_op_test.cc b/tensorflow/cc/ops/const_op_test.cc index 5a4770f879f..3184edeb330 100644 --- a/tensorflow/cc/ops/const_op_test.cc +++ b/tensorflow/cc/ops/const_op_test.cc @@ -28,9 +28,9 @@ void ExpectNodeEqual(const Node* n, gtl::ArraySlice<T> values, TensorShape shape) { EXPECT_TRUE(n->IsConstant()); Tensor tensor; - TF_EXPECT_OK(GetNodeAttr(n->def(), "value", &tensor)); + TF_EXPECT_OK(GetNodeAttr(n->attrs(), "value", &tensor)); DataType dtype; - TF_EXPECT_OK(GetNodeAttr(n->def(), "dtype", &dtype)); + TF_EXPECT_OK(GetNodeAttr(n->attrs(), "dtype", &dtype)); EXPECT_EQ(tensor.dtype(), dtype); test::ExpectTensorEqual<T>(tensor, test::AsTensor(values, shape)); } @@ -39,9 +39,9 @@ void ExpectTypeAndShape(const Node* n, DataType expected_dtype, TensorShape expected_shape) { EXPECT_TRUE(n->IsConstant()); Tensor tensor; - TF_EXPECT_OK(GetNodeAttr(n->def(), "value", &tensor)); + TF_EXPECT_OK(GetNodeAttr(n->attrs(), "value", &tensor)); DataType dtype; - TF_EXPECT_OK(GetNodeAttr(n->def(), "dtype", &dtype)); + TF_EXPECT_OK(GetNodeAttr(n->attrs(), "dtype", &dtype)); EXPECT_EQ(dtype, expected_dtype); EXPECT_EQ(expected_shape, TensorShape(tensor.shape())); } diff --git a/tensorflow/cc/ops/op_gen_overrides.pbtxt b/tensorflow/cc/ops/op_gen_overrides.pbtxt index 9e85e67cf5d..1dffb10c033 100644 --- a/tensorflow/cc/ops/op_gen_overrides.pbtxt +++ b/tensorflow/cc/ops/op_gen_overrides.pbtxt @@ -22,7 +22,7 @@ op { name: "Where" input_rename: { from: "input" to: "condition" } } op { name: "ThreadUnsafeUnigramCandidateSampler", skip: true } # control_flow_ops -# TODO(josh11b): Hide Switch and Merge once we write and migrate users to +# TODO(joshl): Hide Switch and Merge once we write and migrate users to # a Cond() API. #op { name: "Switch" hide: true } #op { name: "Merge" hide: true } @@ -150,6 +150,12 @@ op { name: "Neg" rename_to: "Negate" alias: "Neg" } op { name: "Prod" alias: "ReduceProd" input_rename: { from: "reduction_indices" to: "axis" } } op { name: "Sub" rename_to: "Subtract" alias: "Sub" } op { name: "Sum" alias: "ReduceSum" input_rename: { from: "reduction_indices" to: "axis" } } +op { name: "SigmoidGrad" hide: true } +op { name: "TanhGrad" hide: true } +op { name: "InvGrad" hide: true } +op { name: "ReciprocalGrad" hide: true } +op { name: "SqrtGrad" hide: true } +op { name: "RsqrtGrad" hide: true } # *Grad ops get hidden, only for use by the gradient code.
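As an aside on the `const_op.h` hunk above: `@defgroup` / `@{` / `@}` is Doxygen member-group syntax, so every declaration between the braces is documented under a single "const_op" group. A minimal sketch of the pattern (the names here are illustrative, not TensorFlow's; note the closing marker is spelled `@}`):

```c++
/// @defgroup example_ops Example Ops
/// @{

int ExampleOp(int x);         // documented as part of example_ops
int AnotherExampleOp(int x);  // likewise

/// @}
```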
op { name: "SigmoidGrad" hide: true } diff --git a/tensorflow/cc/saved_model/BUILD b/tensorflow/cc/saved_model/BUILD index 36fec7a2f2e..1cc7cf3f202 100644 --- a/tensorflow/cc/saved_model/BUILD +++ b/tensorflow/cc/saved_model/BUILD @@ -9,7 +9,14 @@ licenses(["notice"]) # Apache 2.0 exports_files(["LICENSE"]) -load("//tensorflow:tensorflow.bzl", "tf_cc_test") +load( + "//tensorflow:tensorflow.bzl", + "if_android", + "if_ios", + "if_mobile", + "if_not_mobile", + "tf_cc_test", +) cc_library( name = "constants", @@ -28,17 +35,33 @@ cc_library( cc_library( name = "loader", + hdrs = ["loader.h"], + deps = [ + ":loader_lite", + ] + if_not_mobile([ + "//tensorflow/core:core_cpu", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:tensorflow", + ]) + if_android([ + "//tensorflow/core:android_tensorflow_lib", + ]), +) + +cc_library( + name = "loader_lite", srcs = ["loader.cc"], hdrs = ["loader.h"], deps = [ ":constants", + ] + if_not_mobile([ "//tensorflow/core:core_cpu", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", - "//tensorflow/core:tensorflow", "//tensorflow/core/util/tensor_bundle:naming", - ], + # mobile not supported yet + ]), ) tf_cc_test( @@ -66,6 +89,7 @@ filegroup( name = "saved_model_half_plus_two", srcs = glob([ "testdata/half_plus_two_pbtxt/**", + "testdata/half_plus_two_main_op/**", "testdata/half_plus_two/**", ]), ) diff --git a/tensorflow/cc/saved_model/constants.h b/tensorflow/cc/saved_model/constants.h index 7f2d5609780..94a3b3cf465 100644 --- a/tensorflow/cc/saved_model/constants.h +++ b/tensorflow/cc/saved_model/constants.h @@ -33,6 +33,9 @@ constexpr char kSavedModelFilenamePbTxt[] = "saved_model.pbtxt"; /// SavedModel legacy init op key. constexpr char kSavedModelLegacyInitOpKey[] = "legacy_init_op"; +/// SavedModel main op key. +constexpr char kSavedModelMainOpKey[] = "saved_model_main_op"; + /// Directory in which to save the SavedModel variables. constexpr char kSavedModelVariablesDirectory[] = "variables"; diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc index 2acf9bf777a..807f5904afc 100644 --- a/tensorflow/cc/saved_model/loader.cc +++ b/tensorflow/cc/saved_model/loader.cc @@ -23,6 +23,7 @@ limitations under the License. 
#include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/protobuf_internal.h" #include "tensorflow/core/protobuf/saved_model.pb.h" +#include "tensorflow/core/protobuf/saver.pb.h" #include "tensorflow/core/public/session.h" #include "tensorflow/core/public/session_options.h" #include "tensorflow/core/util/tensor_bundle/naming.h" @@ -36,7 +37,7 @@ auto* load_attempt_count = monitoring::Counter<2>::New( "status"); auto* load_latency = monitoring::Counter<1>::New( "/tensorflow/cc/saved_model/load_latency", - "Latency in microseconds for SavedModels that were succesfully loaded.", + "Latency in microseconds for SavedModels that were successfully loaded.", "model_path"); constexpr char kLoadAttemptFail[] = "fail"; constexpr char kLoadAttemptSuccess[] = "success"; @@ -106,6 +107,37 @@ void AddAssetsTensorsToInputs(const StringPiece export_dir, } } +bool HasMainOp(const MetaGraphDef& meta_graph_def) { + const auto& collection_def_map = meta_graph_def.collection_def(); + if (collection_def_map.find(kSavedModelMainOpKey) != + collection_def_map.end()) { + return true; + } + return false; +} + +Status RunMainOp(const RunOptions& run_options, const string& export_dir, + const MetaGraphDef& meta_graph_def, + const std::vector& asset_file_defs, + Session* session) { + LOG(INFO) << "Running MainOp on SavedModel bundle."; + const auto& collection_def_map = meta_graph_def.collection_def(); + const auto main_op_it = collection_def_map.find(kSavedModelMainOpKey); + if (main_op_it != collection_def_map.end()) { + if (main_op_it->second.node_list().value_size() != 1) { + return errors::FailedPrecondition( + strings::StrCat("Expected exactly one main op in : ", export_dir)); + } + std::vector> inputs; + AddAssetsTensorsToInputs(export_dir, asset_file_defs, &inputs); + RunMetadata run_metadata; + const StringPiece main_op_name = main_op_it->second.node_list().value(0); + return session->Run(run_options, inputs, {}, {main_op_name.ToString()}, + nullptr /* outputs */, &run_metadata); + } + return Status::OK(); +} + Status RunRestore(const RunOptions& run_options, const string& export_dir, const StringPiece restore_op_name, const StringPiece variable_filename_const_op_name, @@ -121,8 +153,9 @@ Status RunRestore(const RunOptions& run_options, const string& export_dir, const string variables_index_path = io::JoinPath( variables_directory, MetaFilename(kSavedModelVariablesFilename)); if (!Env::Default()->FileExists(variables_index_path).ok()) { - return errors::NotFound( - "Checkpoint index file not found in SavedModel directory."); + LOG(INFO) << "The specified SavedModel has no variables; no checkpoints " + "were restored."; + return Status::OK(); } const string variables_path = io::JoinPath(variables_directory, kSavedModelVariablesFilename); @@ -210,11 +243,15 @@ Status LoadSavedModelInternal(const SessionOptions& session_options, bundle->meta_graph_def.saver_def().restore_op_name(), bundle->meta_graph_def.saver_def().filename_tensor_name(), asset_file_defs, bundle->session.get())); - // TODO(sukritiramesh): Add support for a single main op to run upon load, - // which will supersede the legacy_init_op and separate RunRestore. 
- TF_RETURN_IF_ERROR(RunLegacyInitOp(run_options, export_dir, - bundle->meta_graph_def, asset_file_defs, - bundle->session.get())); + if (HasMainOp(bundle->meta_graph_def)) { + TF_RETURN_IF_ERROR(RunMainOp(run_options, export_dir, + bundle->meta_graph_def, asset_file_defs, + bundle->session.get())); + } else { + TF_RETURN_IF_ERROR(RunLegacyInitOp(run_options, export_dir, + bundle->meta_graph_def, asset_file_defs, + bundle->session.get())); + } return Status::OK(); } diff --git a/tensorflow/cc/saved_model/loader.h b/tensorflow/cc/saved_model/loader.h index 9b9abdbb1f4..3d634dd5154 100644 --- a/tensorflow/cc/saved_model/loader.h +++ b/tensorflow/cc/saved_model/loader.h @@ -36,7 +36,7 @@ struct SavedModelBundle { /// resource leaks, we explicitly call Close on Sessions that we create. ~SavedModelBundle() { if (session) { - session->Close(); + session->Close().IgnoreError(); } } diff --git a/tensorflow/cc/saved_model/loader_test.cc b/tensorflow/cc/saved_model/loader_test.cc index 2a8a7c5bff6..cef29e7b071 100644 --- a/tensorflow/cc/saved_model/loader_test.cc +++ b/tensorflow/cc/saved_model/loader_test.cc @@ -31,6 +31,8 @@ namespace { constexpr char kTestDataPbTxt[] = "cc/saved_model/testdata/half_plus_two_pbtxt/00000123"; +constexpr char kTestDataMainOp[] = + "cc/saved_model/testdata/half_plus_two_main_op/00000123"; constexpr char kTestDataSharded[] = "cc/saved_model/testdata/half_plus_two/00000123"; @@ -165,6 +167,18 @@ TEST_F(LoaderTest, PbtxtFormat) { CheckSavedModelBundle(export_dir, bundle); } +TEST_F(LoaderTest, MainOpFormat) { + SavedModelBundle bundle; + SessionOptions session_options; + RunOptions run_options; + + const string export_dir = + io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataMainOp); + TF_ASSERT_OK(LoadSavedModel(session_options, run_options, export_dir, + {kSavedModelTagServe}, &bundle)); + CheckSavedModelBundle(export_dir, bundle); +} + TEST_F(LoaderTest, InvalidExportPath) { SavedModelBundle bundle; RunOptions run_options; diff --git a/tensorflow/cc/saved_model/testdata/half_plus_two/00000123/saved_model.pb b/tensorflow/cc/saved_model/testdata/half_plus_two/00000123/saved_model.pb index eeac8b12063..4a4fd025d9d 100755 Binary files a/tensorflow/cc/saved_model/testdata/half_plus_two/00000123/saved_model.pb and b/tensorflow/cc/saved_model/testdata/half_plus_two/00000123/saved_model.pb differ diff --git a/tensorflow/cc/saved_model/testdata/half_plus_two_main_op/00000123/assets/foo.txt b/tensorflow/cc/saved_model/testdata/half_plus_two_main_op/00000123/assets/foo.txt new file mode 100644 index 00000000000..f9ff0366880 --- /dev/null +++ b/tensorflow/cc/saved_model/testdata/half_plus_two_main_op/00000123/assets/foo.txt @@ -0,0 +1 @@ +asset-file-contents \ No newline at end of file diff --git a/tensorflow/cc/saved_model/testdata/half_plus_two_main_op/00000123/saved_model.pb b/tensorflow/cc/saved_model/testdata/half_plus_two_main_op/00000123/saved_model.pb new file mode 100644 index 00000000000..daa272aead0 Binary files /dev/null and b/tensorflow/cc/saved_model/testdata/half_plus_two_main_op/00000123/saved_model.pb differ diff --git a/tensorflow/cc/saved_model/testdata/half_plus_two_main_op/00000123/variables/variables.data-00000-of-00001 b/tensorflow/cc/saved_model/testdata/half_plus_two_main_op/00000123/variables/variables.data-00000-of-00001 new file mode 100644 index 00000000000..15b75d6ef6b Binary files /dev/null and b/tensorflow/cc/saved_model/testdata/half_plus_two_main_op/00000123/variables/variables.data-00000-of-00001 differ diff --git 
a/tensorflow/cc/saved_model/testdata/half_plus_two_main_op/00000123/variables/variables.index b/tensorflow/cc/saved_model/testdata/half_plus_two_main_op/00000123/variables/variables.index new file mode 100644 index 00000000000..7ec9fb4fe2d Binary files /dev/null and b/tensorflow/cc/saved_model/testdata/half_plus_two_main_op/00000123/variables/variables.index differ diff --git a/tensorflow/cc/saved_model/testdata/half_plus_two_pbtxt/00000123/saved_model.pbtxt b/tensorflow/cc/saved_model/testdata/half_plus_two_pbtxt/00000123/saved_model.pbtxt index 356dbe6eca6..9d7813a0a16 100755 --- a/tensorflow/cc/saved_model/testdata/half_plus_two_pbtxt/00000123/saved_model.pbtxt +++ b/tensorflow/cc/saved_model/testdata/half_plus_two_pbtxt/00000123/saved_model.pbtxt @@ -284,6 +284,7 @@ meta_graphs { type: "shape" default_value { shape { + unknown_rank: true } } } @@ -447,7 +448,7 @@ meta_graphs { } } tags: "serve" - tensorflow_version: "0.12.head" + tensorflow_version: "1.1.0-rc2" tensorflow_git_version: "unknown" } graph_def { @@ -885,6 +886,7 @@ meta_graphs { key: "shape" value { shape { + unknown_rank: true } } } @@ -1714,7 +1716,7 @@ meta_graphs { dtype: DT_STRING tensor_shape { } - string_val: "_temp_aeab824a1fc94305a10a2504f5995de2/part" + string_val: "_temp_80e928f1e0c844239d136d1ea966099d/part" } } } @@ -2444,7 +2446,7 @@ meta_graphs { input: "^save/restore_shard" } versions { - producer: 21 + producer: 23 } } saver_def { @@ -2503,6 +2505,42 @@ meta_graphs { } } } + signature_def { + key: "classify_x2_to_y3" + value { + inputs { + key: "inputs" + value { + name: "x2:0" + dtype: DT_FLOAT + tensor_shape { + dim { + size: -1 + } + dim { + size: 1 + } + } + } + } + outputs { + key: "scores" + value { + name: "y3:0" + dtype: DT_FLOAT + tensor_shape { + dim { + size: -1 + } + dim { + size: 1 + } + } + } + } + method_name: "tensorflow/serving/classify" + } + } signature_def { key: "classify_x_to_y" value { diff --git a/tensorflow/cc/training/coordinator.cc b/tensorflow/cc/training/coordinator.cc index 53a566db950..4511d043206 100644 --- a/tensorflow/cc/training/coordinator.cc +++ b/tensorflow/cc/training/coordinator.cc @@ -31,8 +31,8 @@ Coordinator::Coordinator(const std::vector& clean_stop_errors) } Coordinator::~Coordinator() { - RequestStop(); - Join(); + RequestStop().IgnoreError(); + Join().IgnoreError(); } Status Coordinator::RegisterRunner(std::unique_ptr runner) { @@ -115,4 +115,15 @@ void Coordinator::WaitForStop() { } } -} // namespace +Status Coordinator::ExportCostGraph(CostGraphDef* cost_graph) const { + mutex_lock l(runners_lock_); + for (auto& t : runners_) { + Status s = t->ExportCostGraph(cost_graph); + if (!s.ok()) { + return s; + } + } + return Status::OK(); +} + +} // namespace tensorflow diff --git a/tensorflow/cc/training/coordinator.h b/tensorflow/cc/training/coordinator.h index dbcf0720150..0e01b19cd98 100644 --- a/tensorflow/cc/training/coordinator.h +++ b/tensorflow/cc/training/coordinator.h @@ -21,19 +21,24 @@ limitations under the License. #include #include +#include "tensorflow/core/framework/cost_graph.pb.h" #include "tensorflow/core/lib/core/error_codes.pb.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/protobuf/config.pb.h" namespace tensorflow { -/// The abstract interface for runners which must implement the Join function. +/// The abstract interface for runners which must implement the Join and the +/// IsRunning function. 
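A recurring change in this diff (the `SavedModelBundle` and `Coordinator` destructors above, and `QueueRunner` below) is appending `.IgnoreError()` to `Status` results: `Status` is a must-use type, a destructor has no way to propagate failure, and the call makes the discard explicit instead of silent. A tiny self-contained sketch of the idea, with `Status` stubbed out (not TensorFlow's real class):

```c++
// Minimal stand-in for tensorflow::Status, just to make the pattern concrete.
struct Status {
  bool ok = true;
  void IgnoreError() const {}  // explicitly drop a must-use result
};

struct Coordinator {
  Status RequestStop() { return Status(); }
  Status Join() { return Status(); }
  // Destructors cannot return or throw a Status, so failures are
  // deliberately discarded rather than triggering unused-result warnings.
  ~Coordinator() {
    RequestStop().IgnoreError();
    Join().IgnoreError();
  }
};

int main() { Coordinator coord; return 0; }
```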
class RunnerInterface { public: virtual ~RunnerInterface() {} virtual Status Join() = 0; - + virtual Status ExportCostGraph(CostGraphDef* cost_graph) const { + return Status(error::INVALID_ARGUMENT, "No cost model to export."); + } /// Returns true iff the runner is running, i.e. if it is trying to populate /// its queue. virtual bool IsRunning() const = 0; @@ -101,6 +106,9 @@ class Coordinator { /// RequestStop() is called. void WaitForStop(); + // Returns the cost graph from stored run metadata in registered runners. + Status ExportCostGraph(CostGraphDef* cost_graph) const; + private: std::unordered_set clean_stop_errors_; condition_variable wait_for_stop_; @@ -111,12 +119,10 @@ class Coordinator { mutex status_lock_; Status status_ GUARDED_BY(status_lock_); - mutex runners_lock_; + mutable mutex runners_lock_; std::vector> runners_ GUARDED_BY(runners_lock_); - std::atomic num_runners_to_cancel_; - TF_DISALLOW_COPY_AND_ASSIGN(Coordinator); }; diff --git a/tensorflow/cc/training/coordinator_test.cc b/tensorflow/cc/training/coordinator_test.cc index 5e4a6966901..48874033841 100644 --- a/tensorflow/cc/training/coordinator_test.cc +++ b/tensorflow/cc/training/coordinator_test.cc @@ -29,9 +29,10 @@ namespace { using error::Code; -void WaitForStopThread(Coordinator* coord, bool* stopped, Notification* done) { +void WaitForStopThread(Coordinator* coord, Notification* about_to_wait, + Notification* done) { + about_to_wait->Notify(); coord->WaitForStop(); - *stopped = true; done->Notify(); } @@ -39,22 +40,22 @@ TEST(CoordinatorTest, TestStopAndWaitOnStop) { Coordinator coord; EXPECT_EQ(coord.ShouldStop(), false); - bool stopped = false; + Notification about_to_wait; Notification done; Env::Default()->SchedClosure( - std::bind(&WaitForStopThread, &coord, &stopped, &done)); - Env::Default()->SleepForMicroseconds(10000000); - EXPECT_EQ(stopped, false); + std::bind(&WaitForStopThread, &coord, &about_to_wait, &done)); + about_to_wait.WaitForNotification(); + Env::Default()->SleepForMicroseconds(1000 * 1000); + EXPECT_FALSE(done.HasBeenNotified()); - coord.RequestStop(); + TF_EXPECT_OK(coord.RequestStop()); done.WaitForNotification(); - EXPECT_EQ(stopped, true); - EXPECT_EQ(coord.ShouldStop(), true); + EXPECT_TRUE(coord.ShouldStop()); } class MockQueueRunner : public RunnerInterface { public: - MockQueueRunner(Coordinator* coord) { + explicit MockQueueRunner(Coordinator* coord) { coord_ = coord; join_counter_ = nullptr; thread_pool_.reset(new thread::ThreadPool(Env::Default(), "test-pool", 10)); @@ -66,17 +67,19 @@ class MockQueueRunner : public RunnerInterface { join_counter_ = join_counter; } - void StartCounting(std::atomic* counter, int until) { + void StartCounting(std::atomic* counter, int until, + Notification* start = nullptr) { thread_pool_->Schedule( - std::bind(&MockQueueRunner::CountThread, this, counter, until)); + std::bind(&MockQueueRunner::CountThread, this, counter, until, start)); } - void StartSettingStatus(const Status& status, BlockingCounter* counter) { - thread_pool_->Schedule( - std::bind(&MockQueueRunner::SetStatusThread, this, status, counter)); + void StartSettingStatus(const Status& status, BlockingCounter* counter, + Notification* start) { + thread_pool_->Schedule(std::bind(&MockQueueRunner::SetStatusThread, this, + status, counter, start)); } - Status Join() { + Status Join() override { if (join_counter_ != nullptr) { (*join_counter_)++; } @@ -93,15 +96,17 @@ class MockQueueRunner : public RunnerInterface { void Stop() { stopped_ = true; } private: - void 
CountThread(std::atomic* counter, int until) { + void CountThread(std::atomic* counter, int until, Notification* start) { + if (start != nullptr) start->WaitForNotification(); while (!coord_->ShouldStop() && counter->load() < until) { (*counter)++; - Env::Default()->SleepForMicroseconds(100000); + Env::Default()->SleepForMicroseconds(10 * 1000); } - coord_->RequestStop(); + coord_->RequestStop().IgnoreError(); } - void SetStatusThread(const Status& status, BlockingCounter* counter) { - Env::Default()->SleepForMicroseconds(100000); + void SetStatusThread(const Status& status, BlockingCounter* counter, + Notification* start) { + start->WaitForNotification(); SetStatus(status); counter->DecrementCount(); } @@ -118,19 +123,19 @@ TEST(CoordinatorTest, TestRealStop) { std::unique_ptr qr1(new MockQueueRunner(&coord)); qr1->StartCounting(&counter, 100); - coord.RegisterRunner(std::move(qr1)); + TF_ASSERT_OK(coord.RegisterRunner(std::move(qr1))); std::unique_ptr qr2(new MockQueueRunner(&coord)); qr2->StartCounting(&counter, 100); - coord.RegisterRunner(std::move(qr2)); + TF_ASSERT_OK(coord.RegisterRunner(std::move(qr2))); // Wait until the counting has started while (counter.load() == 0) ; - coord.RequestStop(); + TF_EXPECT_OK(coord.RequestStop()); int temp_counter = counter.load(); - Env::Default()->SleepForMicroseconds(10000000); + Env::Default()->SleepForMicroseconds(1000 * 1000); EXPECT_EQ(temp_counter, counter.load()); TF_EXPECT_OK(coord.Join()); } @@ -138,12 +143,14 @@ TEST(CoordinatorTest, TestRealStop) { TEST(CoordinatorTest, TestRequestStop) { Coordinator coord; std::atomic counter(0); + Notification start; std::unique_ptr qr; for (int i = 0; i < 10; i++) { qr.reset(new MockQueueRunner(&coord)); - qr->StartCounting(&counter, 10); - coord.RegisterRunner(std::move(qr)); + qr->StartCounting(&counter, 10, &start); + TF_ASSERT_OK(coord.RegisterRunner(std::move(qr))); } + start.Notify(); coord.WaitForStop(); EXPECT_EQ(coord.ShouldStop(), true); @@ -156,41 +163,43 @@ TEST(CoordinatorTest, TestJoin) { int join_counter = 0; std::unique_ptr qr1( new MockQueueRunner(&coord, &join_counter)); - coord.RegisterRunner(std::move(qr1)); + TF_ASSERT_OK(coord.RegisterRunner(std::move(qr1))); std::unique_ptr qr2( new MockQueueRunner(&coord, &join_counter)); - coord.RegisterRunner(std::move(qr2)); + TF_ASSERT_OK(coord.RegisterRunner(std::move(qr2))); - coord.RequestStop(); + TF_EXPECT_OK(coord.RequestStop()); TF_EXPECT_OK(coord.Join()); EXPECT_EQ(join_counter, 2); } TEST(CoordinatorTest, StatusReporting) { Coordinator coord({Code::CANCELLED, Code::OUT_OF_RANGE}); + Notification start; BlockingCounter counter(3); std::unique_ptr qr1(new MockQueueRunner(&coord)); - qr1->StartSettingStatus(Status(Code::CANCELLED, ""), &counter); - coord.RegisterRunner(std::move(qr1)); + qr1->StartSettingStatus(Status(Code::CANCELLED, ""), &counter, &start); + TF_ASSERT_OK(coord.RegisterRunner(std::move(qr1))); std::unique_ptr qr2(new MockQueueRunner(&coord)); - qr2->StartSettingStatus(Status(Code::INVALID_ARGUMENT, ""), &counter); - coord.RegisterRunner(std::move(qr2)); + qr2->StartSettingStatus(Status(Code::INVALID_ARGUMENT, ""), &counter, &start); + TF_ASSERT_OK(coord.RegisterRunner(std::move(qr2))); std::unique_ptr qr3(new MockQueueRunner(&coord)); - qr3->StartSettingStatus(Status(Code::OUT_OF_RANGE, ""), &counter); - coord.RegisterRunner(std::move(qr3)); + qr3->StartSettingStatus(Status(Code::OUT_OF_RANGE, ""), &counter, &start); + TF_ASSERT_OK(coord.RegisterRunner(std::move(qr3))); + start.Notify(); counter.Wait(); - 
coord.RequestStop(); + TF_EXPECT_OK(coord.RequestStop()); EXPECT_EQ(coord.Join().code(), Code::INVALID_ARGUMENT); } TEST(CoordinatorTest, JoinWithoutStop) { Coordinator coord; std::unique_ptr<MockQueueRunner> qr(new MockQueueRunner(&coord)); - coord.RegisterRunner(std::move(qr)); + TF_ASSERT_OK(coord.RegisterRunner(std::move(qr))); EXPECT_EQ(coord.Join().code(), Code::FAILED_PRECONDITION); } @@ -198,7 +207,7 @@ TEST(CoordinatorTest, JoinWithoutStop) { TEST(CoordinatorTest, AllRunnersStopped) { Coordinator coord; MockQueueRunner* qr = new MockQueueRunner(&coord); - coord.RegisterRunner(std::unique_ptr<RunnerInterface>(qr)); + TF_ASSERT_OK(coord.RegisterRunner(std::unique_ptr<RunnerInterface>(qr))); EXPECT_FALSE(coord.AllRunnersStopped()); qr->Stop(); diff --git a/tensorflow/cc/training/queue_runner.cc b/tensorflow/cc/training/queue_runner.cc index cd6cc673275..5aaaa116cf0 100644 --- a/tensorflow/cc/training/queue_runner.cc +++ b/tensorflow/cc/training/queue_runner.cc @@ -49,7 +49,12 @@ Status QueueRunner::Init(const QueueRunnerDef& queue_runner_def) { enqueue_op_names_.insert(enqueue_op_names_.end(), queue_runner_def.enqueue_op_name().begin(), queue_runner_def.enqueue_op_name().end()); - runs_ = enqueue_op_names_.size(); + size_t op_names_size = enqueue_op_names_.size(); + if (op_names_size > kint32max) { + return Status(error::INVALID_ARGUMENT, + "Enqueue ops to run cannot exceed kint32max"); + } + runs_ = static_cast<int>(op_names_size); if (runs_ == 0) { return Status(error::INVALID_ARGUMENT, "Empty enqueue ops to run."); } @@ -77,11 +82,17 @@ Status QueueRunner::Init(const QueueRunnerDef& queue_runner_def) { QueueRunner::~QueueRunner() { // Cannot run Stop() here because the session might already be closed or // destroyed. - Join(); + Join().IgnoreError(); } Status QueueRunner::Start(Session* sess) { return Start(sess, 0); } +Status QueueRunner::StartAndCollectCostGraph(Session* sess, + const RunOptions* run_options) { + SetRunArgumentsAndCostGraph(run_options); + return Start(sess, 0); +} + Status QueueRunner::Start(Session* sess, int wait_for) { counter_.reset(new BlockingCounter(runs_)); for (const string& enqueue_op : enqueue_op_names_) { @@ -109,12 +120,18 @@ Status QueueRunner::Start(Session* sess, int wait_for) { return Status::OK(); } +Status QueueRunner::StartAndCollectCostGraph(Session* session, int wait_for_ms, + const RunOptions* run_options) { + SetRunArgumentsAndCostGraph(run_options); + return Start(session, wait_for_ms); +} + void QueueRunner::Stop(Session* sess) { if (coord_ != nullptr) { coord_->WaitForStop(); } if (!cancel_op_name_.empty()) { - UpdateStatus(sess->Run({}, {}, {cancel_op_name_}, nullptr)); + UpdateStatus(RealRun(sess, cancel_op_name_, false)); } stopped_ = true; } @@ -149,7 +166,7 @@ void QueueRunner::Run(Session* sess, const string& enqueue_op) { if (coord_ && coord_->ShouldStop()) { break; } - status = sess->Run({}, {}, {enqueue_op}, nullptr); + status = RealRun(sess, enqueue_op, true); if (first_iteration) { if (!status.ok()) { mutex_lock l(mu_); @@ -170,12 +187,14 @@ // will be run anyway in this case.
if (IsQueueClosed(status) && (!coord_ || !coord_->ShouldStop())) { if (last_run && !close_op_name_.empty()) { - UpdateStatus(sess->Run({}, {}, {close_op_name_}, nullptr)); + UpdateStatus(RealRun(sess, close_op_name_, false)); } } else if (!status.ok()) { + LOG(ERROR) << "Queue runner thread got a failure status: " + << status.ToString(); UpdateStatus(status); if (coord_) { - coord_->RequestStop(); + coord_->RequestStop().IgnoreError(); } } } @@ -185,4 +204,39 @@ Status QueueRunner::GetStatus() { return status_; } +Status QueueRunner::ExportCostGraph(CostGraphDef* cost_graph) const { + if (!cg_mu_) { + return Status(error::FAILED_PRECONDITION, + "This QueueRunner doesn't collect a cost graph."); + } + mutex_lock l(*cg_mu_); + cost_graph->MergeFrom(*cost_graph_); + return Status::OK(); +} + +void QueueRunner::SetRunArgumentsAndCostGraph(const RunOptions* run_options) { + cg_mu_.reset(new mutex()); + { + mutex_lock l(*cg_mu_); + cost_graph_.reset(new CostGraphDef()); + } + if (run_options) { + run_options_ = *run_options; + } +} + +Status QueueRunner::RealRun(Session* sess, const string& op, + bool update_costs) { + Status s; + if (update_costs && cg_mu_) { + RunMetadata metadata; + s = sess->Run(run_options_, {}, {}, {op}, nullptr, &metadata); + mutex_lock l(*cg_mu_); + cost_graph_->Swap(metadata.mutable_cost_graph()); + } else { + s = sess->Run({}, {}, {op}, nullptr); + } + return s; +} + } // namespace tensorflow diff --git a/tensorflow/cc/training/queue_runner.h b/tensorflow/cc/training/queue_runner.h index bfe6a305936..71ed44c9c60 100644 --- a/tensorflow/cc/training/queue_runner.h +++ b/tensorflow/cc/training/queue_runner.h @@ -27,6 +27,7 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/threadpool.h" #include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/protobuf/config.pb.h" #include "tensorflow/core/protobuf/queue_runner.pb.h" #include "tensorflow/core/public/session.h" @@ -58,9 +59,16 @@ class QueueRunner : public RunnerInterface { /// Starts the queue runner with the given session. Status Start(Session* sess); + /// Starts the queue runner with the given session and sets the run arguments + /// for sess->Run. It also collects and stores the cost model. + Status StartAndCollectCostGraph(Session* sess, + const RunOptions* run_options = nullptr); + /// Starts the queue runner with the given session, and wait for up to the /// specified time (in milliseconds) for the queues to start to fill up. Status Start(Session* sess, int wait_for_ms); + Status StartAndCollectCostGraph(Session* session, int wait_for_ms, + const RunOptions* run_options = nullptr); /// Requests to stop and runs the cancel op. It would be called in a separate /// thread when coordinator is set. If there is no coordinator it should be @@ -74,8 +82,11 @@ class QueueRunner : public RunnerInterface { /// Returns the latest status. Status GetStatus(); + // Returns the stored cost model. + Status ExportCostGraph(CostGraphDef* cost_graph) const override; + private: - QueueRunner() : coord_(nullptr), stopped_(false) {} + QueueRunner() : coord_(nullptr), stopped_(false), cg_mu_(nullptr) {} // Initializes the instance with the QueueRunnerDef proto. 
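The cost-graph plumbing above boils down to three steps: `StartAndCollectCostGraph` allocates storage under a dedicated mutex, `RealRun` swaps each run's freshly returned `CostGraphDef` into that storage, and `ExportCostGraph` merges it out for the caller (failing if collection was never enabled). A condensed standalone sketch of that locking pattern, with the protobuf stubbed out (illustrative names, not the real API):

```c++
#include <memory>
#include <mutex>
#include <string>

// Stand-in for the CostGraphDef protobuf message.
struct CostGraphDef {
  std::string nodes;
  void Swap(CostGraphDef* other) { nodes.swap(other->nodes); }
  void MergeFrom(const CostGraphDef& other) { nodes += other.nodes; }
};

class Runner {
 public:
  void EnableCostGraph() {               // cf. SetRunArgumentsAndCostGraph
    std::lock_guard<std::mutex> l(mu_);
    cost_graph_.reset(new CostGraphDef());
  }
  void RecordRun(CostGraphDef* fresh) {  // cf. RealRun with update_costs
    std::lock_guard<std::mutex> l(mu_);
    if (cost_graph_) cost_graph_->Swap(fresh);  // keep the latest snapshot
  }
  bool ExportCostGraph(CostGraphDef* out) const {
    std::lock_guard<std::mutex> l(mu_);
    if (!cost_graph_) return false;  // collection was never enabled
    out->MergeFrom(*cost_graph_);
    return true;
  }

 private:
  mutable std::mutex mu_;
  std::unique_ptr<CostGraphDef> cost_graph_;
};

int main() {
  Runner runner;
  runner.EnableCostGraph();
  CostGraphDef fresh{"node{}"};
  runner.RecordRun(&fresh);
  CostGraphDef out;
  return runner.ExportCostGraph(&out) ? 0 : 1;
}
```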
Status Init(const QueueRunnerDef& queue_runner_def); @@ -94,6 +105,10 @@ class QueueRunner : public RunnerInterface { bool IsRunning() const override { return !stopped_; } + void SetRunArgumentsAndCostGraph(const RunOptions* run_options); + + Status RealRun(Session* sess, const string& op, bool update_costs); + string queue_name_; std::vector enqueue_op_names_; string close_op_name_; @@ -114,6 +129,10 @@ class QueueRunner : public RunnerInterface { mutex cb_mu_; std::vector> callbacks_; + + mutable std::unique_ptr cg_mu_; + std::unique_ptr cost_graph_ GUARDED_BY(cg_mu_); + RunOptions run_options_; }; } // namespace tensorflow diff --git a/tensorflow/cc/training/queue_runner_test.cc b/tensorflow/cc/training/queue_runner_test.cc index 1661c5c91bb..da2fc03b6c0 100644 --- a/tensorflow/cc/training/queue_runner_test.cc +++ b/tensorflow/cc/training/queue_runner_test.cc @@ -44,6 +44,7 @@ using ops::FIFOQueue; using ops::QueueClose; using ops::QueueDequeue; using ops::QueueEnqueue; +using ops::RandomNormal; using ops::Square; using ops::Variable; @@ -84,7 +85,7 @@ QueueRunnerDef BuildQueueRunnerDef( const std::string& close_op, const std::string& cancel_op, const std::vector& queue_closed_error_codes) { QueueRunnerDef queue_runner_def; - *queue_runner_def.mutable_queue_name() = kQueueName; + *queue_runner_def.mutable_queue_name() = queue_name; for (const std::string& enqueue_op : enqueue_ops) { *queue_runner_def.mutable_enqueue_op_name()->Add() = enqueue_op; } @@ -293,7 +294,7 @@ TEST(QueueRunnerTest, StartTimeout) { // This will timeout since queue0 is not fed and queue1 is fetching data from // queue0. EXPECT_EQ(qr->Start(session.get(), 1).code(), Code::DEADLINE_EXCEEDED); - session->Close(); + TF_EXPECT_OK(session->Close()); } TEST(QueueRunnerTest, TestCoordinatorStop) { @@ -317,8 +318,8 @@ TEST(QueueRunnerTest, TestCoordinatorStop) { TF_EXPECT_OK(QueueRunner::New(queue_runner1, &coord, &qr1)); TF_CHECK_OK(qr1->Start(session.get())); - coord.RegisterRunner(std::move(qr0)); - coord.RegisterRunner(std::move(qr1)); + TF_EXPECT_OK(coord.RegisterRunner(std::move(qr0))); + TF_EXPECT_OK(coord.RegisterRunner(std::move(qr1))); std::vector dq; TF_EXPECT_OK(session->Run({}, {kDequeueOp1}, {}, &dq)); @@ -340,9 +341,70 @@ TEST(QueueRunnerTest, CallbackCalledOnError) { bool error_caught = false; qr->AddErrorCallback([&error_caught](const Status&) { error_caught = true; }); TF_EXPECT_OK(qr->Start(session.get())); - qr->Join(); + EXPECT_FALSE(qr->Join().ok()); EXPECT_TRUE(error_caught); } +TEST(QueueRunnerTest, RunMetaDataTest) { + Scope root = Scope::NewRootScope(); + auto q0 = FIFOQueue(root.WithOpName(kQueueName), {DataType::DT_FLOAT}); + Output rnd = RandomNormal(root.WithOpName("rnd"), {1, 1}, DataType::DT_FLOAT); + Output square = Square(root.WithOpName(kSquareOpName), rnd); + auto enqueue0 = QueueEnqueue(root.WithOpName(kEnqueueOp0), q0, {square}); + auto close0 = QueueClose(root.WithOpName(kCloseOp0), q0); + auto cancel0 = QueueClose(root.WithOpName(kCancelOp0), q0, + QueueClose::CancelPendingEnqueues(true)); + auto dequeue0 = + QueueDequeue(root.WithOpName(kDequeueOp0), q0, {DataType::DT_FLOAT}); + + GraphDef graph_def; + TF_EXPECT_OK(root.ToGraphDef(&graph_def)); + for (auto& node : *graph_def.mutable_node()) { + node.set_device("/cpu:0"); + } + SessionOptions sess_options; + sess_options.config.mutable_graph_options()->set_build_cost_model(1); + std::unique_ptr session(NewSession(sess_options)); + + TF_CHECK_OK(session->Create(graph_def)); + + QueueRunnerDef queue_runner_def = + 
BuildQueueRunnerDef(kQueueName, {kEnqueueOp0}, kCloseOp0, kCancelOp0, {}); + std::unique_ptr qr; + TF_EXPECT_OK(QueueRunner::New(queue_runner_def, &qr)); + RunOptions run_options; + TF_CHECK_OK(qr->StartAndCollectCostGraph(session.get(), &run_options)); + + // Make sure there was at least one element enqueued in q0: this prevents a + // race condition where we close the queue before it was populated. + std::vector dq0; + TF_EXPECT_OK(session->Run({}, {kDequeueOp0}, {}, &dq0)); + // Second call to run dequeue op is to make sure the cost graph has been + // stored. + TF_EXPECT_OK(session->Run({}, {kDequeueOp0}, {}, &dq0)); + + CostGraphDef cost_graph; + TF_CHECK_OK(qr->ExportCostGraph(&cost_graph)); + EXPECT_TRUE(cost_graph.node_size() > 0); + + qr->Stop(session.get()); +} + +TEST(QueueRunnerTest, NoRunMetaDataTest) { + GraphDef graph_def = BuildSimpleGraph(); + auto session = BuildSessionAndInitVariable(graph_def); + + QueueRunnerDef queue_runner_def = BuildQueueRunnerDef( + kQueueName, {kCountUpToOpName}, kSquareOpName, "", {}); + std::unique_ptr qr; + TF_EXPECT_OK(QueueRunner::New(queue_runner_def, &qr)); + TF_CHECK_OK(qr->Start(session.get())); + + TF_EXPECT_OK(qr->Join()); + CostGraphDef cost_graph; + EXPECT_EQ(qr->ExportCostGraph(&cost_graph).code(), + error::FAILED_PRECONDITION); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/cc/tutorials/example_trainer.cc b/tensorflow/cc/tutorials/example_trainer.cc index f2ecd2eddc2..49d3cca3a4e 100644 --- a/tensorflow/cc/tutorials/example_trainer.cc +++ b/tensorflow/cc/tutorials/example_trainer.cc @@ -227,7 +227,7 @@ int main(int argc, char* argv[]) { argv[dst++] = f; } argv[dst++] = nullptr; - argc = unknown_flags.size() + 1; + argc = static_cast(unknown_flags.size() + 1); tensorflow::port::InitMain(argv[0], &argc, &argv); tensorflow::example::ConcurrentSessions(opts); } diff --git a/tensorflow/compiler/aot/BUILD b/tensorflow/compiler/aot/BUILD index c52a56b6428..1f6fe28188c 100644 --- a/tensorflow/compiler/aot/BUILD +++ b/tensorflow/compiler/aot/BUILD @@ -20,6 +20,7 @@ cc_library( cc_test( name = "runtime_test", + size = "small", srcs = ["runtime_test.cc"], deps = [ ":runtime", @@ -73,7 +74,7 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:client_library", - "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client:compile_only_client", "//tensorflow/compiler/xla/service:compiler", "//tensorflow/compiler/xla/service/cpu:cpu_compiler", "//tensorflow/core:core_cpu", @@ -88,6 +89,7 @@ cc_library( cc_test( name = "codegen_test", + size = "small", srcs = ["codegen_test.cc"], data = ["codegen_test_h.golden"], deps = [ @@ -101,6 +103,7 @@ cc_test( cc_test( name = "tfcompile_util_test", + size = "small", srcs = ["tfcompile_util_test.cc"], deps = [ ":tfcompile_lib", @@ -123,9 +126,16 @@ cc_library( deps = [ ":tfcompile_lib", ":tfcompile_proto", + "//tensorflow/compiler/xla/legacy_flags:alias_analysis_flags", + "//tensorflow/compiler/xla/legacy_flags:buffer_assignment_flags", "//tensorflow/compiler/xla/legacy_flags:compiler_functor_flags", "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags", "//tensorflow/compiler/xla/legacy_flags:cpu_runtime_flags", + "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", + "//tensorflow/compiler/xla/legacy_flags:hlo_graph_dumper_flags", + "//tensorflow/compiler/xla/legacy_flags:llvm_util_flags", + "//tensorflow/compiler/xla/legacy_flags:service_flags", + 
"//tensorflow/compiler/xla/legacy_flags:util_flags", "//tensorflow/compiler/xla/service:compiler", "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_internal", diff --git a/tensorflow/compiler/aot/benchmark.cc b/tensorflow/compiler/aot/benchmark.cc index 0c5e2c103ea..ff720382812 100644 --- a/tensorflow/compiler/aot/benchmark.cc +++ b/tensorflow/compiler/aot/benchmark.cc @@ -40,7 +40,7 @@ namespace benchmark { // the implementation without pulling in all of the Env dependencies. static double NowMicros() { struct timeval tv; - gettimeofday(&tv, NULL); + gettimeofday(&tv, nullptr); return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; } diff --git a/tensorflow/compiler/aot/codegen.cc b/tensorflow/compiler/aot/codegen.cc index 042a72745a7..bbdb342a623 100644 --- a/tensorflow/compiler/aot/codegen.cc +++ b/tensorflow/compiler/aot/codegen.cc @@ -152,8 +152,7 @@ Status AddRewritesForShape(int i, const xla::Shape& shape, string RewriteWithName(const string& name, string code, const std::vector>& rewrites) { str_util::ReplaceAllPairs(&code, rewrites); - str_util::ReplaceAll(&code, "{{NAME}}", name); - return code; + return str_util::StringReplace(code, "{{NAME}}", name, /*replace_all=*/true); } // Generate methods for args (inputs). @@ -366,7 +365,7 @@ Status GenerateHeader(const HeaderOpts& opts, const Config& config, #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" -namespace Eigen { class ThreadPoolDevice; } +namespace Eigen { struct ThreadPoolDevice; } // (Implementation detail) Entry point to the function in the object file. extern "C" void {{ENTRY}}( diff --git a/tensorflow/compiler/aot/codegen_test_h.golden b/tensorflow/compiler/aot/codegen_test_h.golden index 46d7c03006a..01963c6df46 100644 --- a/tensorflow/compiler/aot/codegen_test_h.golden +++ b/tensorflow/compiler/aot/codegen_test_h.golden @@ -15,7 +15,7 @@ #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" -namespace Eigen { class ThreadPoolDevice; } +namespace Eigen { struct ThreadPoolDevice; } // (Implementation detail) Entry point to the function in the object file. extern "C" void entry_point( diff --git a/tensorflow/compiler/aot/compile.cc b/tensorflow/compiler/aot/compile.cc index 00c07932aca..ca17c5ab690 100644 --- a/tensorflow/compiler/aot/compile.cc +++ b/tensorflow/compiler/aot/compile.cc @@ -25,8 +25,9 @@ limitations under the License. 
#include "tensorflow/compiler/aot/flags.h" #include "tensorflow/compiler/aot/tfcompile_util.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/client_library.h" -#include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/compile_only_client.h" #include "tensorflow/compiler/xla/service/compiler.h" #include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -199,17 +200,17 @@ Status RewriteAndPruneGraph(Graph* graph, const Config& config, for (const Fetch& fetch : config.fetch()) { missing_fetches.insert(TensorIdToString(fetch.id())); } - for (const Node* n : graph->nodes()) { + for (const Node* n : graph->op_nodes()) { if (n->type_string() == kArgOp) { string feed_id; - TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), kFeedIdAttr, &feed_id)); + TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), kFeedIdAttr, &feed_id)); if (missing_feeds.erase(feed_id) == 0) { return errors::Aborted(kArgOp, " node found with unknown feed id: ", feed_id); } } else if (n->type_string() == kRetvalOp) { string fetch_id; - TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), kFetchIdAttr, &fetch_id)); + TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), kFetchIdAttr, &fetch_id)); if (missing_fetches.erase(fetch_id) == 0) { return errors::Aborted(kRetvalOp, " node found with unknown fetch id: ", fetch_id); @@ -233,7 +234,7 @@ Status CollectArgNodes(const Graph& graph, std::vector* arg_nodes) { for (Node* n : graph.nodes()) { if (n->type_string() == kArgOp) { int index; - TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), "index", &index)); + TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "index", &index)); auto insert_result = indexed_arg_nodes.insert({index, n}); if (!insert_result.second) { const Node* dup = insert_result.first->second; @@ -262,10 +263,10 @@ Status CreateXlaArgs(const Graph& graph, TF_RETURN_IF_ERROR(CollectArgNodes(graph, &arg_nodes)); for (const Node* node : arg_nodes) { XlaCompiler::Argument arg; - TF_RETURN_IF_ERROR(GetNodeAttr(node->def(), "T", &arg.type)); - TF_RETURN_IF_ERROR(GetNodeAttr(node->def(), "index", &arg.parameter)); - TF_RETURN_IF_ERROR(GetNodeAttr(node->def(), kShapeAttr, &arg.shape)); - TF_RETURN_IF_ERROR(GetNodeAttr(node->def(), kDebugNameAttr, &arg.name)); + arg.kind = XlaCompiler::Argument::kParameter; + TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), "T", &arg.type)); + TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), kShapeAttr, &arg.shape)); + TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), kDebugNameAttr, &arg.name)); xla_args->push_back(arg); } return Status::OK(); @@ -273,11 +274,11 @@ Status CreateXlaArgs(const Graph& graph, // Converts the TensorFlow graph into an XLA computation, by executing the // graph symbolically, with each op building up the XLA HLO. -Status ConvertGraphToXla(xla::LocalClient* client, std::unique_ptr graph, - const FunctionLibraryDefinition* flib_def, +Status ConvertGraphToXla(xla::CompileOnlyClient* client, + std::unique_ptr graph, xla::Computation* computation, bool* has_context_arg) { // Create a device and context to convert the graph into an XLA computation. - XlaOpRegistry::RegisterJitKernels(); + XlaOpRegistry::RegisterCompilationKernels(); // Populate the context with args from the graph. 
for (Node* node : graph->nodes()) { node->set_assigned_device_name(DEVICE_CPU_XLA_JIT); @@ -288,19 +289,19 @@ Status ConvertGraphToXla(xla::LocalClient* client, std::unique_ptr graph, // Compile the graph into an XLA computation. XlaCompiler::Options compiler_options; compiler_options.client = client; - compiler_options.device_type = DeviceType(DEVICE_CPU_XLA_JIT); + DeviceType device_type(DEVICE_CPU_XLA_JIT); + compiler_options.device_type = &device_type; + compiler_options.flib_def = &graph->flib_def(); + compiler_options.graph_def_version = graph->versions().producer(); compiler_options.allow_cpu_custom_calls = true; XlaCompiler compiler(compiler_options); - std::unique_ptr flib_run(NewFunctionLibraryRuntime( - compiler.device_mgr(), Env::Default(), compiler.device(), - graph->versions().producer(), flib_def, OptimizerOptions())); XlaCompiler::CompilationResult result; - TF_RETURN_IF_ERROR(compiler.CompileGraph("tfcompile", std::move(graph), - flib_run.get(), xla_args, - false /* use_tuple_arg */, &result)); + TF_RETURN_IF_ERROR(compiler.CompileGraph(XlaCompiler::CompileOptions(), + "tfcompile", std::move(graph), + xla_args, &result)); *has_context_arg = result.requires_runtime_context; - *computation = std::move(result.computation); + *computation = std::move(*result.computation); int num_const_results = 0; for (int i = 0; i < result.outputs.size(); ++i) { @@ -334,7 +335,8 @@ Status ConvertGraphToXla(xla::LocalClient* client, std::unique_ptr graph, } // Compiles the XLA computation into executable code. -Status CompileXla(xla::LocalClient* client, const xla::Computation& computation, +Status CompileXla(xla::CompileOnlyClient* client, + const xla::Computation& computation, const xla::cpu::CpuAotCompilationOptions& aot_opts, CompileResult* compile_result) { // Retrieves arg and result layouts from the computation. 
@@ -348,10 +350,11 @@ Status CompileXla(xla::LocalClient* client, const xla::Computation& computation, compile_result->program_shape = *pshape_or.ValueOrDie(); xla::ProgramShape* pshape = &compile_result->program_shape; std::vector arg_layouts; + arg_layouts.reserve(pshape->parameters_size()); for (int i = 0; i < pshape->parameters_size(); ++i) { arg_layouts.push_back(pshape->mutable_parameters(i)); } - xla::LocalClient::AheadOfTimeComputationInstance instance; + xla::CompileOnlyClient::AotComputationInstance instance; instance.computation = &computation; instance.argument_layouts = std::move(arg_layouts); instance.result_layout = &pshape->result(); @@ -366,17 +369,17 @@ Status CompileXla(xla::LocalClient* client, const xla::Computation& computation, std::move(aot_or.ValueOrDie().back())); compile_result->entry_point = aot_opts.entry_point_name(); compile_result->pointer_size = - xla::LocalClient::PointerSizeForTriple(aot_opts.triple()); + xla::CompileOnlyClient::PointerSizeForTriple(aot_opts.triple()); return Status::OK(); } } // namespace Status InitGraph(const GraphDef& graph_def, const Config& config, - const MainFlags& flags, const FunctionLibraryDefinition* flib, - std::unique_ptr* graph) { + const MainFlags& flags, std::unique_ptr* graph) { TF_RETURN_IF_ERROR(ValidateConfig(config)); - std::unique_ptr g(new Graph(flib)); + FunctionLibraryDefinition flib_def(OpRegistry::Global(), graph_def.library()); + std::unique_ptr g(new Graph(flib_def)); GraphDef copy_def(graph_def); TF_RETURN_IF_ERROR(AddDefaultAttrsToGraphDef(©_def, *g->op_registry(), 0 /*node_offset*/)); @@ -388,7 +391,6 @@ Status InitGraph(const GraphDef& graph_def, const Config& config, } Status CompileGraph(std::unique_ptr graph, const MainFlags& flags, - const FunctionLibraryDefinition* flib, CompileResult* compile_result) { // Converts the graph into an XLA computation, and compiles the // computation. @@ -396,11 +398,11 @@ Status CompileGraph(std::unique_ptr graph, const MainFlags& flags, namespace gpu = perftools::gputools; gpu::Platform* cpu_platform = gpu::MultiPlatformManager::PlatformWithName("Host").ValueOrDie(); - xla::LocalClient* client = - xla::ClientLibrary::GetOrCreateLocalClient(cpu_platform).ValueOrDie(); + xla::CompileOnlyClient* client = + xla::ClientLibrary::GetOrCreateCompileOnlyClient(cpu_platform) + .ValueOrDie(); xla::Computation computation; - TF_RETURN_IF_ERROR(ConvertGraphToXla(client, std::move(graph), flib, - &computation, + TF_RETURN_IF_ERROR(ConvertGraphToXla(client, std::move(graph), &computation, &compile_result->has_context_arg)); if (!flags.debug_dir.empty()) { TF_ASSIGN_OR_RETURN(std::unique_ptr module, diff --git a/tensorflow/compiler/aot/compile.h b/tensorflow/compiler/aot/compile.h index 8e9c64820ba..e929272b2e4 100644 --- a/tensorflow/compiler/aot/compile.h +++ b/tensorflow/compiler/aot/compile.h @@ -56,8 +56,7 @@ extern const char* const kDebugNameAttr; // compute the outputs. If dump_graphs is true, graph rewrites will be dumped // for debugging. Status InitGraph(const GraphDef& graph_def, const Config& config, - const MainFlags& flags, const FunctionLibraryDefinition* flib, - std::unique_ptr* graph); + const MainFlags& flags, std::unique_ptr* graph); // CompileResult describes the output of CompileGraph, where the object file // data and meta-information is available in aot. @@ -83,7 +82,6 @@ struct CompileResult { // // The XLA compilation options are specified in the flags. 
Status CompileGraph(std::unique_ptr graph, const MainFlags& flags, - const FunctionLibraryDefinition* flib, CompileResult* result); } // namespace tfcompile diff --git a/tensorflow/compiler/aot/runtime.cc b/tensorflow/compiler/aot/runtime.cc index 208de5498db..57727766661 100644 --- a/tensorflow/compiler/aot/runtime.cc +++ b/tensorflow/compiler/aot/runtime.cc @@ -31,6 +31,8 @@ namespace { inline void* aligned_malloc(size_t size, int minimum_alignment) { #if defined(__ANDROID__) || defined(OS_ANDROID) || defined(OS_CYGWIN) return memalign(minimum_alignment, size); +#elif defined(COMPILER_MSVC) + return _aligned_malloc(size, minimum_alignment); #else // !__ANDROID__ && !OS_ANDROID && !OS_CYGWIN void* ptr = nullptr; // posix_memalign requires that the requested alignment be at least @@ -45,7 +47,13 @@ inline void* aligned_malloc(size_t size, int minimum_alignment) { #endif } -inline void aligned_free(void* aligned_memory) { free(aligned_memory); } +inline void aligned_free(void* aligned_memory) { +#if defined(COMPILER_MSVC) + _aligned_free(aligned_memory); +#else + free(aligned_memory); +#endif +} size_t align_to(size_t n, size_t align) { return (((n - 1) / align) + 1) * align; diff --git a/tensorflow/compiler/aot/tests/BUILD b/tensorflow/compiler/aot/tests/BUILD index ecb071a416c..6bfdf37caad 100644 --- a/tensorflow/compiler/aot/tests/BUILD +++ b/tensorflow/compiler/aot/tests/BUILD @@ -43,14 +43,16 @@ genrule( testonly = 1, outs = [ "test_graph_tfadd.pb", - "test_graph_tfadd_with_ckpt.pb", "test_graph_tfadd_with_ckpt.ckpt", - "test_graph_tfadd_with_ckpt_saver.pb", + "test_graph_tfadd_with_ckpt.pb", "test_graph_tfadd_with_ckpt_saver.ckpt", + "test_graph_tfadd_with_ckpt_saver.pb", "test_graph_tfadd_with_ckpt_saver.saver", + "test_graph_tffunction.pb", "test_graph_tfgather.pb", "test_graph_tfmatmul.pb", "test_graph_tfmatmulandadd.pb", + "test_graph_tfsplits.pb", ], cmd = "$(location :make_test_graphs) --out_dir $(@D)", tags = ["manual"], @@ -114,6 +116,24 @@ tf_library( tags = ["manual"], ) +tf_library( + name = "test_graph_tffunction", + testonly = 1, + config = "test_graph_tffunction.config.pbtxt", + cpp_class = "FunctionComp", + graph = "test_graph_tffunction.pb", + tags = ["manual"], +) + +tf_library( + name = "test_graph_tfsplits", + testonly = 1, + config = "test_graph_tfsplits.config.pbtxt", + cpp_class = "SplitsComp", + graph = "test_graph_tfsplits.pb", + tags = ["manual"], +) + cc_test( name = "tfcompile_test", srcs = ["tfcompile_test.cc"], @@ -122,9 +142,11 @@ cc_test( ":test_graph_tfadd", ":test_graph_tfadd_with_ckpt", ":test_graph_tfadd_with_ckpt_saver", + ":test_graph_tffunction", ":test_graph_tfgather", ":test_graph_tfmatmul", ":test_graph_tfmatmulandadd", + ":test_graph_tfsplits", "//tensorflow/core:test", "//tensorflow/core:test_main", "//third_party/eigen3", diff --git a/tensorflow/compiler/aot/tests/make_test_graphs.py b/tensorflow/compiler/aot/tests/make_test_graphs.py index 2a2d13dc498..a898eab1d1a 100644 --- a/tensorflow/compiler/aot/tests/make_test_graphs.py +++ b/tensorflow/compiler/aot/tests/make_test_graphs.py @@ -25,6 +25,7 @@ from tensorflow.core.protobuf import saver_pb2 from tensorflow.python.client import session from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes +from tensorflow.python.framework import function from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops @@ -71,7 +72,7 @@ def tfadd_with_ckpt_saver(out_dir): saver.save(sess, 
ckpt_file) # Without the SaverDef, the restore op won't be named correctly. saver_file = '%s/test_graph_tfadd_with_ckpt_saver.saver' % out_dir - with open(saver_file, 'w') as f: + with open(saver_file, 'wb') as f: f.write(saver.as_saver_def().SerializeToString()) @@ -95,13 +96,41 @@ def tfmatmulandadd(_): math_ops.add(x, y, name='x_y_sum') +def tffunction(_): + + @function.Defun(dtypes.int32, dtypes.int32) + def test_func(a, b): + return a + b + + x = constant_op.constant([1], name='x_const') + y = constant_op.constant([2], name='y_const') + test_func(x, y, name='func_call') # pylint: disable=unexpected-keyword-arg + + +def tfsplits(_): + """A more complex graph, including splits.""" + x = array_ops.placeholder(dtypes.float32, shape=[2, 2], name='x') + y = array_ops.placeholder(dtypes.float32, shape=[2, 2], name='y') + for _ in range(3): + x0, x1 = array_ops.split(x, 2, 0) + y0, y1 = array_ops.split(y, 2, 0) + x0 += 1 + y0 += 1 + z = math_ops.matmul(x, y, name='x_y_prod') + a = array_ops.concat([x0, y1], axis=0, name='concat_x0_y1') + b = array_ops.concat([y0, x1], axis=0, name='concat_y0_x1') + x = math_ops.matmul(a, b, name='a_b') + y = math_ops.add(x, z) + array_ops.identity(y, name='result') + + def write_graph(build_graph, out_dir): """Build a graph using build_graph and write it out.""" g = ops.Graph() with g.as_default(): build_graph(out_dir) filename = '%s/test_graph_%s.pb' % (out_dir, build_graph.__name__) - with open(filename, 'w') as f: + with open(filename, 'wb') as f: f.write(g.as_graph_def().SerializeToString()) @@ -112,6 +141,8 @@ def main(_): write_graph(tfgather, FLAGS.out_dir) write_graph(tfmatmul, FLAGS.out_dir) write_graph(tfmatmulandadd, FLAGS.out_dir) + write_graph(tffunction, FLAGS.out_dir) + write_graph(tfsplits, FLAGS.out_dir) if __name__ == '__main__': @@ -121,7 +152,6 @@ if __name__ == '__main__': '--out_dir', type=str, default='', - help='Output directory for graphs, checkpoints and savers.' - ) + help='Output directory for graphs, checkpoints and savers.') FLAGS, unparsed = parser.parse_known_args() app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tensorflow/compiler/aot/tests/test_graph_tffunction.config.pbtxt b/tensorflow/compiler/aot/tests/test_graph_tffunction.config.pbtxt new file mode 100644 index 00000000000..eb9c1cacb7f --- /dev/null +++ b/tensorflow/compiler/aot/tests/test_graph_tffunction.config.pbtxt @@ -0,0 +1,16 @@ +# Text form of tensorflow.tfcompile.Config proto. +feed { + id { node_name: "x_const" } + shape { + dim { size: 1 } + } +} +feed { + id { node_name: "y_const" } + shape { + dim { size: 1 } + } +} +fetch { + id { node_name: "func_call" } +} diff --git a/tensorflow/compiler/aot/tests/test_graph_tfsplits.config.pbtxt b/tensorflow/compiler/aot/tests/test_graph_tfsplits.config.pbtxt new file mode 100644 index 00000000000..85fc7da4428 --- /dev/null +++ b/tensorflow/compiler/aot/tests/test_graph_tfsplits.config.pbtxt @@ -0,0 +1,18 @@ +# Text form of tensorflow.tfcompile.Config proto. +feed { + id { node_name: "x" } + shape { + dim { size: 2 } + dim { size: 2 } + } +} +feed { + id { node_name: "y" } + shape { + dim { size: 2 } + dim { size: 2 } + } +} +fetch { + id { node_name: "result" } +} diff --git a/tensorflow/compiler/aot/tests/tfcompile_test.cc b/tensorflow/compiler/aot/tests/tfcompile_test.cc index f57d2859dfa..07562e59c8d 100644 --- a/tensorflow/compiler/aot/tests/tfcompile_test.cc +++ b/tensorflow/compiler/aot/tests/tfcompile_test.cc @@ -20,9 +20,11 @@ limitations under the License. 
#include "tensorflow/compiler/aot/tests/test_graph_tfadd.h" #include "tensorflow/compiler/aot/tests/test_graph_tfadd_with_ckpt.h" #include "tensorflow/compiler/aot/tests/test_graph_tfadd_with_ckpt_saver.h" +#include "tensorflow/compiler/aot/tests/test_graph_tffunction.h" #include "tensorflow/compiler/aot/tests/test_graph_tfgather.h" #include "tensorflow/compiler/aot/tests/test_graph_tfmatmul.h" #include "tensorflow/compiler/aot/tests/test_graph_tfmatmulandadd.h" +#include "tensorflow/compiler/aot/tests/test_graph_tfsplits.h" #include "tensorflow/core/platform/test.h" namespace tensorflow { @@ -376,6 +378,49 @@ TEST(TFCompileTest, MatMulAndAdd1) { } } +TEST(TFCompileTest, Function) { + // The function is equivalent to an addition + FunctionComp add_fn; + EXPECT_EQ(add_fn.arg0_data(), add_fn.args()[0]); + EXPECT_EQ(add_fn.arg1_data(), add_fn.args()[1]); + + add_fn.arg0() = 1; + add_fn.arg1() = 2; + EXPECT_TRUE(add_fn.Run()); + EXPECT_EQ(add_fn.error_msg(), ""); + EXPECT_EQ(add_fn.result0(), 3); + EXPECT_EQ(add_fn.result0_data()[0], 3); + EXPECT_EQ(add_fn.result0_data(), add_fn.results()[0]); +} + +TEST(TFCompileTest, Splits) { + Eigen::ThreadPool tp(1); + Eigen::ThreadPoolDevice device(&tp, tp.NumThreads()); + + SplitsComp fn; + + fn.set_thread_pool(&device); + // x = [[1, 2], [3, 4]] + fn.arg0(0, 0) = 1; + fn.arg0(0, 1) = 2; + fn.arg0(1, 0) = 3; + fn.arg0(1, 1) = 4; + + // y = [[10, 20], [30, 40]] + fn.arg1(0, 0) = 10; + fn.arg1(0, 1) = 20; + fn.arg1(1, 0) = 30; + fn.arg1(1, 1) = 40; + EXPECT_TRUE(fn.Run()); + EXPECT_EQ(fn.error_msg(), ""); + const float expected[] = {7.86375557e+10, 1.34274679e+11, 1.92741717e+12, + 3.29964742e+12}; + EXPECT_NEAR(expected[0], fn.result0(0, 0), 1e4); + EXPECT_NEAR(expected[1], fn.result0(0, 1), 1e4); + EXPECT_NEAR(expected[2], fn.result0(1, 0), 1e4); + EXPECT_NEAR(expected[3], fn.result0(1, 1), 1e4); +} + } // namespace } // namespace tfcompile } // namespace tensorflow diff --git a/tensorflow/compiler/aot/tfcompile.bzl b/tensorflow/compiler/aot/tfcompile.bzl index 6f2e0958fd3..4be4e0fbb39 100644 --- a/tensorflow/compiler/aot/tfcompile.bzl +++ b/tensorflow/compiler/aot/tfcompile.bzl @@ -279,7 +279,11 @@ def target_llvm_triple(): # TODO(toddw): Add target_triple for other targets. For details see: # http://llvm.org/docs/doxygen/html/Triple_8h_source.html return select({ + "//tensorflow:android_armeabi": "armv5-none-android", "//tensorflow:android_arm": "armv7-none-android", "//tensorflow:android_arm64": "aarch64-none-android", + "//tensorflow:android_x86": "i686-none-android", + "//tensorflow:linux_ppc64le": "ppc64le-ibm-linux-gnu", + "//tensorflow:darwin": "x86_64-none-darwin", "//conditions:default": "x86_64-pc-linux", }) diff --git a/tensorflow/compiler/aot/tfcompile_main.cc b/tensorflow/compiler/aot/tfcompile_main.cc index 85ef9560bbf..6fed46b4329 100644 --- a/tensorflow/compiler/aot/tfcompile_main.cc +++ b/tensorflow/compiler/aot/tfcompile_main.cc @@ -23,9 +23,16 @@ limitations under the License. 
#include "tensorflow/compiler/aot/flags.h" #include "tensorflow/compiler/aot/tfcompile.pb.h" #include "tensorflow/compiler/aot/tfcompile_util.h" +#include "tensorflow/compiler/xla/legacy_flags/alias_analysis_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/buffer_assignment_flags.h" #include "tensorflow/compiler/xla/legacy_flags/compiler_functor_flags.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_runtime_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/hlo_graph_dumper_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/llvm_util_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/service_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/util_flags.h" #include "tensorflow/compiler/xla/service/compiler.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/graph.pb.h" @@ -52,7 +59,8 @@ const char kUsageHeader[] = "header file that gives access to the functionality in the object file.\n" "A typical invocation looks like this:\n" "\n" - " $ tfcompile --graph=mygraph.pb --config=myfile.pbtxt\n" + " $ tfcompile --graph=mygraph.pb --config=myfile.pbtxt " + "--cpp_class=\"mynamespace::MyComputation\"\n" "\n"; Status ReadProtoFile(const string& kind, const string& fname, @@ -73,6 +81,9 @@ void ParseTensorId(const string& name, TensorId* id) { Status Main(const MainFlags& flags) { // Process config. Config config; + if (flags.config.empty()) { + return errors::InvalidArgument("Must specify --config"); + } TF_RETURN_IF_ERROR(ReadProtoFile("config", flags.config, &config)); TF_RETURN_IF_ERROR(ValidateConfig(config)); if (flags.dump_fetch_nodes) { @@ -85,15 +96,16 @@ Status Main(const MainFlags& flags) { } // Read and initialize the graph. + if (flags.graph.empty()) { + return errors::InvalidArgument("Must specify --graph"); + } GraphDef graph_def; TF_RETURN_IF_ERROR(ReadProtoFile("graph", flags.graph, &graph_def)); std::unique_ptr graph; - FunctionLibraryDefinition flib(OpRegistry::Global(), graph_def.library()); - TF_RETURN_IF_ERROR(InitGraph(graph_def, config, flags, &flib, &graph)); + TF_RETURN_IF_ERROR(InitGraph(graph_def, config, flags, &graph)); CompileResult compile_result; - TF_RETURN_IF_ERROR( - CompileGraph(std::move(graph), flags, &flib, &compile_result)); + TF_RETURN_IF_ERROR(CompileGraph(std::move(graph), flags, &compile_result)); // Write output files. 
Env* env = Env::Default(); @@ -101,6 +113,9 @@ Status Main(const MainFlags& flags) { TF_RETURN_IF_ERROR(WriteStringToFile(env, flags.out_object, StringPiece(obj.data(), obj.size()))); HeaderOpts header_opts; + if (flags.cpp_class.empty()) { + return errors::InvalidArgument("Must specify --cpp_class"); + } TF_RETURN_IF_ERROR(ParseCppClass(flags.cpp_class, &header_opts.class_name, &header_opts.namespaces)); string header; @@ -121,9 +136,16 @@ int main(int argc, char** argv) { std::vector<tensorflow::Flag> flag_list; AppendMainFlags(&flag_list, &flags); + xla::legacy_flags::AppendAliasAnalysisFlags(&flag_list); + xla::legacy_flags::AppendBufferAssignmentFlags(&flag_list); xla::legacy_flags::AppendCompilerFunctorFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); xla::legacy_flags::AppendCpuRuntimeFlags(&flag_list); + xla::legacy_flags::AppendHloGraphDumperFlags(&flag_list); + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); + xla::legacy_flags::AppendLlvmUtilFlags(&flag_list); + xla::legacy_flags::AppendServiceFlags(&flag_list); + xla::legacy_flags::AppendUtilFlags(&flag_list); tensorflow::string usage = tensorflow::tfcompile::kUsageHeader; usage += tensorflow::Flags::Usage(argv[0], flag_list); @@ -131,12 +153,16 @@ int main(int argc, char** argv) { QCHECK(parsed_flags_ok) << "\n" << usage; tensorflow::port::InitMain(usage.c_str(), &argc, &argv); - QCHECK(argc == 1 && !flags.config.empty() && - (flags.dump_fetch_nodes || - (!flags.graph.empty() && !flags.entry_point.empty()))) - << "\n" - << usage; - - TF_QCHECK_OK(tensorflow::tfcompile::Main(flags)); + QCHECK(argc == 1) << "\nERROR: This command does not take any arguments " "other than flags\n\n" << usage; + tensorflow::Status status = tensorflow::tfcompile::Main(flags); + if (status.code() == tensorflow::error::INVALID_ARGUMENT) { + std::cerr << "INVALID ARGUMENTS: " << status.error_message() << "\n\n" << usage; + return 1; + } else { + TF_QCHECK_OK(status); + } return 0; } diff --git a/tensorflow/compiler/aot/tfcompile_util_test.cc b/tensorflow/compiler/aot/tfcompile_util_test.cc index 108ab1eab7b..c321d3ff4c7 100644 --- a/tensorflow/compiler/aot/tfcompile_util_test.cc +++ b/tensorflow/compiler/aot/tfcompile_util_test.cc @@ -24,7 +24,7 @@ namespace tensorflow { namespace tfcompile { namespace { -void ExpectErrorContains(Status status, StringPiece str) { +void ExpectErrorContains(const Status& status, StringPiece str) { EXPECT_NE(Status::OK(), status); EXPECT_TRUE(StringPiece(status.error_message()).contains(str)) << "expected error: " << status.error_message() << " to contain: " << str; diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD index 414e152cee4..5f857191da7 100644 --- a/tensorflow/compiler/jit/BUILD +++ b/tensorflow/compiler/jit/BUILD @@ -18,7 +18,23 @@ package( default_visibility = [":internal"], ) +load("//tensorflow:tensorflow.bzl", "cc_header_only_library") load("//tensorflow:tensorflow.bzl", "tf_kernel_library") +load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") + +# This target can be used by XLA device plugins to prevent circular +# dependencies, and provides access to all of the required headers +# for building a device library. +cc_header_only_library( + name = "xla_jit_headers_lib", + visibility = ["//visibility:public"], + deps = [ + ":xla_cpu_device", + ":xla_cpu_jit", + ":xla_gpu_device", + ":xla_gpu_jit", + ], +) # Target that bundles up the XLA CPU and GPU JIT devices.
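A small style point from the `tfcompile_util_test.cc` hunk above: `ExpectErrorContains` now takes `const Status&` instead of `Status` by value, since a `Status` carries a message string and the callee only reads it. The convention in isolation (illustrative function, not from this patch):

```c++
#include "tensorflow/core/lib/core/status.h"

// In-parameters of class type are passed by const reference; the callee never
// mutates or stores them, and no string copy is made.
bool IsInvalidArgument(const tensorflow::Status& status) {
  return status.code() == tensorflow::error::INVALID_ARGUMENT;
}
```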
cc_library( @@ -29,6 +45,7 @@ cc_library( ":xla_cpu_jit", ":xla_gpu_device", ":xla_gpu_jit", + "//tensorflow/compiler/plugin", ], alwayslink = 1, ) @@ -38,7 +55,7 @@ cc_library( visibility = [":friends"], deps = [ ":jit_compilation_passes", - ":xla_local_launch_op", + "//tensorflow/compiler/jit/kernels:xla_local_launch_op", "//tensorflow/compiler/tf2xla/kernels:xla_ops", "//tensorflow/compiler/xla/service:cpu_plugin", ], @@ -48,12 +65,12 @@ cc_library( cc_library( name = "xla_gpu_jit", visibility = [":friends"], - deps = [ + deps = if_cuda([ ":jit_compilation_passes", - ":xla_local_launch_op", + "//tensorflow/compiler/jit/kernels:xla_local_launch_op", "//tensorflow/compiler/tf2xla/kernels:xla_ops", "//tensorflow/compiler/xla/service:gpu_plugin", - ], + ]), alwayslink = 1, ) @@ -64,8 +81,10 @@ cc_library( deps = [ ":jit_compilation_passes", ":xla_device", + "//tensorflow/compiler/jit/kernels:xla_device_launch_op", "//tensorflow/compiler/tf2xla:xla_compiler", - "//tensorflow/compiler/xla/service:cpu_plugin", + "//tensorflow/compiler/tf2xla/kernels:xla_ops", + "//tensorflow/compiler/xla/service:cpu_plugin", # buildcleaner: keep "//tensorflow/core:core_cpu_internal", "//tensorflow/core:lib", ], @@ -79,8 +98,10 @@ cc_library( deps = [ ":jit_compilation_passes", ":xla_device", + "//tensorflow/compiler/jit/kernels:xla_device_launch_op", "//tensorflow/compiler/tf2xla:xla_compiler", - "//tensorflow/compiler/xla/service:gpu_plugin", + "//tensorflow/compiler/tf2xla/kernels:xla_ops", + "//tensorflow/compiler/xla/service:gpu_plugin", # buildcleaner: keep "//tensorflow/core:core_cpu_internal", "//tensorflow/core:lib", ], @@ -105,26 +126,22 @@ cc_library( srcs = [ "xla_device.cc", "xla_device_context.cc", - "xla_device_launch_op.cc", "xla_device_ops.cc", ], hdrs = [ "xla_device.h", "xla_device_context.h", - "xla_device_launch_op.h", "xla_device_ops.h", ], deps = [ ":common", ":jit_compilation_passes", - ":xla_compilation_cache", - ":xla_local_launch_op", + "//tensorflow/compiler/jit/ops:xla_ops", "//tensorflow/compiler/tf2xla:common", "//tensorflow/compiler/tf2xla:dump_graph", "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/compiler/tf2xla/kernels:xla_ops", "//tensorflow/compiler/xla:literal_util", - "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla/client:client_library", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", @@ -132,9 +149,9 @@ cc_library( "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core:tensorflow_opensource", - "//tensorflow/core/kernels:assign_op", "//tensorflow/core/kernels:constant_op", "//tensorflow/core/kernels:control_flow_ops", "//tensorflow/core/kernels:identity_op", @@ -142,7 +159,6 @@ cc_library( "//tensorflow/core/kernels:sendrecv_ops", "//tensorflow/core/kernels:variable_ops", ], - alwayslink = 1, ) cc_library( @@ -155,13 +171,13 @@ cc_library( "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/compiler/xla/client:client_library", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/service:cpu_plugin", "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", + "//tensorflow/core/kernels:variable_ops", ], ) @@ -175,27 +191,41 @@ cc_library( alwayslink = 1, ) +cc_library( + name = 
"graph_to_functiondef", + srcs = ["graph_to_functiondef.cc"], + hdrs = ["graph_to_functiondef.h"], + deps = [ + "//tensorflow/core:core_cpu", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + ], +) + cc_library( name = "compilation_passes", srcs = [ "build_xla_launch_ops_pass.cc", "encapsulate_subgraphs_pass.cc", - "graph_to_functiondef.cc", "mark_for_compilation_pass.cc", ], hdrs = [ "build_xla_launch_ops_pass.h", "encapsulate_subgraphs_pass.h", - "graph_to_functiondef.h", "mark_for_compilation_pass.h", ], deps = [ ":common", - ":parallel_check_op", - ":xla_local_launch_op", + ":graph_to_functiondef", + ":union_find", "//tensorflow/compiler/jit/graphcycles", + "//tensorflow/compiler/jit/kernels:parallel_check_op", + "//tensorflow/compiler/jit/kernels:xla_local_launch_op", "//tensorflow/compiler/jit/legacy_flags:encapsulate_subgraphs_pass_flags", "//tensorflow/compiler/jit/legacy_flags:mark_for_compilation_pass_flags", + "//tensorflow/compiler/jit/ops:parallel_check_op", + "//tensorflow/compiler/jit/ops:xla_ops", "//tensorflow/compiler/tf2xla:const_analysis", "//tensorflow/compiler/tf2xla:dump_graph", "//tensorflow/compiler/tf2xla:xla_compiler", @@ -208,6 +238,11 @@ cc_library( ], ) +cc_library( + name = "union_find", + hdrs = ["union_find.h"], +) + cc_test( name = "compilation_passes_test", size = "small", @@ -217,8 +252,9 @@ cc_test( "mark_for_compilation_pass_test.cc", ], deps = [ + ":common", ":compilation_passes", - ":xla_local_launch_op", + ":graph_to_functiondef", "//tensorflow/cc:cc_ops", "//tensorflow/cc:cc_ops_internal", "//tensorflow/cc:function_ops", @@ -226,48 +262,14 @@ cc_test( "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/compiler/tf2xla/kernels:xla_ops", "//tensorflow/core:core_cpu", - "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", + "//tensorflow/core:framework_internal", "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core:testlib", ], ) -cc_library( - name = "xla_local_launch_op", - srcs = ["xla_local_launch_op.cc"], - hdrs = ["xla_local_launch_op.h"], - deps = [ - ":common", - ":xla_compilation_cache", - "//tensorflow/compiler/tf2xla:xla_compiler", - "//tensorflow/compiler/tf2xla:xla_local_runtime_context", - "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla/client:client_library", - "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/core:core_cpu_internal", - "//tensorflow/core:framework", - "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", - "//tensorflow/core:tensorflow_opensource", - ], - alwayslink = 1, -) - -tf_kernel_library( - name = "parallel_check_op", - srcs = ["parallel_check_op.cc"], - visibility = [":friends"], - deps = [ - "//tensorflow/compiler/jit/legacy_flags:parallel_check_op_flags", - "//tensorflow/core:core_cpu", - "//tensorflow/core:framework", - "//tensorflow/core:lib", - ], - alwayslink = 1, -) - # ----------------------------------------------------------------------------- filegroup( diff --git a/tensorflow/compiler/jit/build_xla_launch_ops_pass.cc b/tensorflow/compiler/jit/build_xla_launch_ops_pass.cc index 8fde1974005..ef56ccf8e8f 100644 --- a/tensorflow/compiler/jit/build_xla_launch_ops_pass.cc +++ b/tensorflow/compiler/jit/build_xla_launch_ops_pass.cc @@ -16,14 +16,13 @@ limitations under the License. 
#include "tensorflow/compiler/jit/build_xla_launch_ops_pass.h" #include "tensorflow/compiler/jit/defs.h" #include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h" +#include "tensorflow/compiler/jit/kernels/xla_local_launch_op.h" #include "tensorflow/compiler/jit/mark_for_compilation_pass.h" -#include "tensorflow/compiler/jit/xla_local_launch_op.h" #include "tensorflow/compiler/tf2xla/const_analysis.h" #include "tensorflow/compiler/tf2xla/dump_graph.h" -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/common_runtime/optimization_registry.h" -#include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/graph_def_util.h" #include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/framework/node_def_util.h" @@ -32,7 +31,6 @@ limitations under the License. #include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/hash/hash.h" -#include "tensorflow/core/protobuf/config.pb.h" #include "tensorflow/core/public/version.h" namespace tensorflow { @@ -40,14 +38,16 @@ namespace tensorflow { static Status BuildLaunchNode( const string& nodename, const string& function_name, const AttrValueMap& function_attr, const string& device_name, - const DataTypeVector& constant_dtypes, const DataTypeVector& arg_dtypes, - const DataTypeVector& result_dtypes, Graph* graph, Node** node) { + const DataTypeVector& constant_dtypes, int num_resources, + const DataTypeVector& arg_dtypes, const DataTypeVector& result_dtypes, + Graph* graph, Node** node) { NodeDef def; def.set_name(graph->NewName(nodename)); def.set_op("_XlaLaunch"); def.set_device(device_name); AddNodeAttr("Tconstants", constant_dtypes, &def); AddNodeAttr("Targs", arg_dtypes, &def); + AddNodeAttr("Nresources", num_resources, &def); AddNodeAttr("Tresults", result_dtypes, &def); NameAttrList function; function.set_name(function_name); @@ -62,25 +62,32 @@ static Status BuildLaunchNode( static Status ReplaceNodeWithXlaLaunch(Graph* graph, Node* node) { VLOG(2) << "Replacing " << node->name() << " with XlaLaunch"; - int num_constant_args; + int num_constant_args, num_resource_args; TF_RETURN_IF_ERROR( - GetNodeAttr(node->def(), kXlaNumConstantArgsAttr, &num_constant_args)); + GetNodeAttr(node->attrs(), kXlaNumConstantArgsAttr, &num_constant_args)); + TF_RETURN_IF_ERROR( + GetNodeAttr(node->attrs(), kXlaNumResourceArgsAttr, &num_resource_args)); - if (num_constant_args < 0 || num_constant_args > node->input_types().size()) { + if (num_constant_args < 0 || num_resource_args < 0 || + num_constant_args + num_resource_args > node->num_inputs()) { return errors::InvalidArgument( - "Invalid number of constant arguments to XLA kernel"); + "Invalid number of constant/resource arguments to XLA kernel."); } + const int num_nonconst_args = + node->num_inputs() - num_constant_args - num_resource_args; + DataTypeVector const_dtypes(node->input_types().begin(), node->input_types().begin() + num_constant_args); - DataTypeVector arg_dtypes(node->input_types().begin() + num_constant_args, - node->input_types().end()); + DataTypeVector arg_dtypes( + node->input_types().begin() + num_constant_args, + node->input_types().begin() + num_constant_args + num_nonconst_args); // Build a _XlaLaunch operator to execute the function body. 
Node* launch_node; - TF_RETURN_IF_ERROR( - BuildLaunchNode(graph->NewName(node->name()), node->type_string(), - node->def().attr(), node->def().device(), const_dtypes, - arg_dtypes, node->output_types(), graph, &launch_node)); + TF_RETURN_IF_ERROR(BuildLaunchNode( + graph->NewName(node->name()), node->type_string(), node->def().attr(), + node->requested_device(), const_dtypes, num_resource_args, arg_dtypes, + node->output_types(), graph, &launch_node)); launch_node->set_assigned_device_name(node->assigned_device_name()); // Copy incoming edges to the launch node. @@ -116,9 +123,9 @@ static Status ReplaceNodeWithXlaLaunch(Graph* graph, Node* node) { Status BuildXlaLaunchOpsPass::Run(const GraphOptimizationPassOptions& options) { Graph* graph = options.graph->get(); - for (Node* n : graph->nodes()) { + for (Node* n : graph->op_nodes()) { // In all cases, only try to compile computational nodes. - if (!n->IsOp() || n->IsSend() || n->IsRecv() || n->IsControlFlow()) { + if (n->IsSend() || n->IsRecv() || n->IsControlFlow()) { continue; } @@ -128,6 +135,11 @@ Status BuildXlaLaunchOpsPass::Run(const GraphOptimizationPassOptions& options) { TF_RETURN_IF_ERROR(ReplaceNodeWithXlaLaunch(graph, n)); } } + + if (VLOG_IS_ON(1)) { + dump_graph::DumpGraphToFile("build_xla_launch_ops", *graph, + options.flib_def); + } return Status::OK(); } @@ -151,7 +163,7 @@ Status CreateXlaLaunchOp(FunctionLibraryRuntime* flr, const NodeDef& ndef, return errors::InvalidArgument("No ", kXlaCompileAttr, " for ", ndef.op()); } // Make sure that kernels have been registered on the JIT device. - XlaOpRegistry::RegisterJitKernels(); + XlaOpRegistry::RegisterCompilationKernels(); if (!IsCompilable(flr, ndef)) { // ndef is calling a function that XLA can't compile. return errors::InvalidArgument("Not compilable: ", ndef.ShortDebugString()); @@ -159,7 +171,8 @@ Status CreateXlaLaunchOp(FunctionLibraryRuntime* flr, const NodeDef& ndef, FunctionLibraryRuntime::Handle handle; // If ndef is not instantiable, e.g., the function does not exist, // simply bail out. - TF_RETURN_IF_ERROR(flr->Instantiate(ndef.op(), ndef.attr(), &handle)); + TF_RETURN_IF_ERROR( + flr->Instantiate(ndef.op(), AttrSlice(&ndef.attr()), &handle)); const FunctionBody* fbody = flr->GetFunctionBody(handle); CHECK(fbody); // Can't be nullptr since we just instantiated it. std::vector<bool> const_args(fbody->arg_types.size()); @@ -179,6 +192,7 @@ Status CreateXlaLaunchOp(FunctionLibraryRuntime* flr, const NodeDef& ndef, launch_def.set_op("_XlaLaunch"); launch_def.set_device(flr->device()->name()); AddNodeAttr("Tconstants", DataTypeVector{}, &launch_def); + AddNodeAttr("Nresources", 0, &launch_def); AddNodeAttr("Targs", fbody->arg_types, &launch_def); AddNodeAttr("Tresults", fbody->ret_types, &launch_def); NameAttrList func; diff --git a/tensorflow/compiler/jit/defs.cc b/tensorflow/compiler/jit/defs.cc index b20ad53ef64..f847d66f3c6 100644 --- a/tensorflow/compiler/jit/defs.cc +++ b/tensorflow/compiler/jit/defs.cc @@ -18,5 +18,6 @@ limitations under the License.
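With `Nresources` added alongside `Tconstants` and `Targs`, an `_XlaLaunch` node's inputs are laid out as `[compile-time constants | ordinary args | resource handles]`, and `ReplaceNodeWithXlaLaunch` recomputes the middle segment instead of assuming everything after the constants is an ordinary argument. The arithmetic, with made-up counts:

```c++
// Worked example (hypothetical counts) of the input segmentation:
//   slots 0..1 -> Tconstants, slots 2..4 -> Targs, slot 5 -> resource handle,
// mirroring the Tconstants/Targs/Nresources attrs on the _XlaLaunch NodeDef.
constexpr int kNumInputs = 6;
constexpr int kNumConstantArgs = 2;  // compile-time constants, first
constexpr int kNumResourceArgs = 1;  // DT_RESOURCE handles, last
constexpr int kNumNonconstArgs =
    kNumInputs - kNumConstantArgs - kNumResourceArgs;
static_assert(kNumNonconstArgs == 3, "three ordinary args in the middle");
```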
namespace tensorflow { const char* const kXlaCompileAttr = "_XlaCompile"; +const char* const kXlaScopeAttr = "_XlaScope"; } // namespace tensorflow diff --git a/tensorflow/compiler/jit/defs.h b/tensorflow/compiler/jit/defs.h index ddc830cb770..a3aabc949db 100644 --- a/tensorflow/compiler/jit/defs.h +++ b/tensorflow/compiler/jit/defs.h @@ -23,6 +23,7 @@ namespace tensorflow { // Name of attribute used to tag operators for compilation with XLA extern const char* const kXlaCompileAttr; // "_XlaCompile" +extern const char* const kXlaScopeAttr; // "_XlaScope" } // namespace tensorflow diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc index c1e61462085..14d8f2ab351 100644 --- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc +++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc @@ -27,7 +27,6 @@ limitations under the License. #include "tensorflow/core/common_runtime/optimization_registry.h" #include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/framework/function.h" -#include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/graph_def_util.h" #include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/framework/node_def_util.h" @@ -46,6 +45,7 @@ namespace tensorflow { const char* const kXlaCompiledKernelAttr = "_XlaCompiledKernel"; const char* const kXlaNumConstantArgsAttr = "_XlaNumConstantArgs"; +const char* const kXlaNumResourceArgsAttr = "_XlaNumResourceArgs"; namespace { @@ -87,9 +87,12 @@ class Encapsulator { // Build a FunctionDef for each subgraph, and add it 'library'. The values of // the 'group_attribute' annotations become the function names. + // If 'reuse_existing_functions' is set, use an existing function with the + // same name, if any. // If 'rewrite_subgraph_fn' is set, it is applied to each subgraph before // function conversion. Status BuildFunctionDefs(const RewriteSubgraphFn& rewrite_subgraph_fn, + bool reuse_existing_functions, FunctionLibraryDefinition* library); // Write a copy of the input graph to 'graph_out', where the subgraphs are @@ -109,8 +112,8 @@ class Encapsulator { // returned by _Retval nodes. std::unique_ptr<Graph> graph; - // Which device are these nodes on? Used both to check that all nodes - // are assigned to the same device, and to assign a device to the call node. + // Which device are these nodes on? Used to assign a device to the call + // node. string device; // NodeDef for the function call node. @@ -161,7 +164,7 @@ static const char* const kRetValOp = "_Retval"; // none. string Encapsulator::GetFunctionNameAttr(Node const* node) const { string attr; - if (!GetNodeAttr(node->def(), group_attribute_, &attr).ok()) { + if (!GetNodeAttr(node->attrs(), group_attribute_, &attr).ok()) { attr.clear(); } return attr; } @@ -174,8 +177,7 @@ Status Encapsulator::SplitIntoSubgraphs() { std::unordered_map<const Node*, Node*> node_images; // Copy all marked nodes to a subgraph. Do nothing for unmarked nodes. - for (Node* node : graph_in_->nodes()) { - if (node->IsSource() || node->IsSink()) continue; + for (Node* node : graph_in_->op_nodes()) { string func_id = GetFunctionNameAttr(node); if (func_id.empty()) continue; @@ -189,16 +191,10 @@ Status Encapsulator::SplitIntoSubgraphs() { image->ClearAttr(group_attribute_); node_images[node] = image; - // Check the device matches any existing device. - string device = node->assigned_device_name().empty() - ?
node->def().device() - : node->assigned_device_name(); - if (subgraph.device.empty()) { - subgraph.device = device; - } else if (subgraph.device != device) { - s.Update(errors::InvalidArgument( - "Mismatched devices for nodes to be grouped by Encapsulator")); + subgraph.device = node->assigned_device_name().empty() + ? node->requested_device() + : node->assigned_device_name(); } } @@ -235,9 +231,16 @@ Status Encapsulator::SplitIntoSubgraphs() { // Create a new _Retval node DataType dtype = edge->src()->output_type(edge->src_output()); + if (IsRefType(dtype)) { + return errors::InvalidArgument( + "Ref Tensors (e.g., Variables) are not supported: tensor ", + edge->src()->name(), ":", edge->src_output()); + } + NodeDef ret_def; ret_def.set_op(kRetValOp); - ret_def.set_name(src_subgraph.graph->NewName("output")); + ret_def.set_name(strings::StrCat(edge->src()->name(), "_", + edge->src_output(), "_retval")); AddNodeAttr("T", dtype, &ret_def); AddNodeAttr("index", ret_index, &ret_def); Node* ret = src_subgraph.graph->AddNode(ret_def, &s); @@ -262,8 +265,16 @@ Status Encapsulator::SplitIntoSubgraphs() { // This is the first time we have seen this tensor. Create an _Arg node. DataType dtype = edge->dst()->input_type(edge->dst_input()); + if (IsRefType(dtype)) { + return errors::InvalidArgument( + "Ref Tensors (e.g., Variables) are not supported: tensor ", + edge->src()->name(), ":", edge->src_output()); + } + NodeDef arg_def; - NodeDefBuilder builder(dst_subgraph.graph->NewName("input"), kArgOp); + NodeDefBuilder builder(strings::StrCat(edge->src()->name(), "_", + edge->src_output(), "_arg"), + kArgOp); builder.Attr("T", dtype); builder.Attr("index", arg_index); s = builder.Finalize(&arg_def); @@ -290,11 +301,11 @@ Status Encapsulator::SplitIntoSubgraphs() { } Status Encapsulator::BuildFunctionDefs( - const RewriteSubgraphFn& rewrite_subgraph_fn, + const RewriteSubgraphFn& rewrite_subgraph_fn, bool reuse_existing_functions, FunctionLibraryDefinition* library) { // For each subgraph, build a FunctionDef. for (auto& subgraph_entry : subgraphs_) { - const string& name = subgraph_entry.first; + string name = subgraph_entry.first; Subgraph& subgraph = subgraph_entry.second; subgraph.call_node_def.set_op(name); @@ -331,6 +342,8 @@ Status Encapsulator::BuildFunctionDefs( for (auto& result : subgraph.results) { result.second = output_permutation[result.second]; } + + name = subgraph.call_node_def.op(); } FunctionDef fdef; @@ -345,7 +358,9 @@ Status Encapsulator::BuildFunctionDefs( strings::StrCat("encapsulate_fdef_", name), fdef); } - TF_RETURN_IF_ERROR(library->AddFunctionDef(fdef)); + if (!reuse_existing_functions || library->Find(name) == nullptr) { + TF_RETURN_IF_ERROR(library->AddFunctionDef(fdef)); + } } return Status::OK(); } @@ -422,8 +437,7 @@ Status Encapsulator::BuildOutputGraph(bool parallel_checking, std::unordered_map<const Node*, Node*> node_images; // Copy all unmarked nodes to the output graph.
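The `_Retval`/`_Arg` nodes created above are now named after the tensor they carry rather than via `NewName("output")`/`NewName("input")`, which makes the generated `FunctionDef`s deterministic; the updated test expectations later in this patch (`a_0_arg`, `b_0_arg`, `c_0_retval`, ...) depend on exactly this. The scheme as a tiny sketch:

```c++
#include <string>

#include "tensorflow/core/lib/strings/strcat.h"

// For an edge leaving node "b" at output slot 0:
//   feed side  -> "b_0_arg"    (kArgOp)
//   fetch side -> "b_0_retval" (kRetValOp)
std::string ArgName(const std::string& src_name, int src_output) {
  return tensorflow::strings::StrCat(src_name, "_", src_output, "_arg");
}
```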
- for (Node* node : graph_in_->nodes()) { - if (node->IsSource() || node->IsSink()) continue; + for (Node* node : graph_in_->op_nodes()) { string func_id = GetFunctionNameAttr(node); // Don't copy nodes that going to be encapsulated, unless parallel checking @@ -544,14 +558,16 @@ Status Encapsulator::BuildOutputGraph(bool parallel_checking, Status EncapsulateSubgraphsInFunctions( string group_attribute, const Graph& graph_in, const RewriteSubgraphFn& rewrite_subgraph_fn, bool parallel_checking, - std::unique_ptr<Graph>* graph_out, FunctionLibraryDefinition* library) { + bool reuse_existing_functions, std::unique_ptr<Graph>* graph_out, + FunctionLibraryDefinition* library) { Status s; Encapsulator encapsulator(std::move(group_attribute), &graph_in); s = encapsulator.SplitIntoSubgraphs(); if (!s.ok()) return s; - s = encapsulator.BuildFunctionDefs(rewrite_subgraph_fn, library); + s = encapsulator.BuildFunctionDefs(rewrite_subgraph_fn, + reuse_existing_functions, library); if (!s.ok()) return s; std::unique_ptr<Graph> out(new Graph(library)); @@ -563,14 +579,29 @@ Status EncapsulateSubgraphsInFunctions( return s; } +// Finds the types of the _Arg nodes, indexed by position. +static Status GetArgTypes(const Graph& graph, DataTypeVector* types) { + for (Node* n : graph.op_nodes()) { + if (n->type_string() == kArgOp) { + int index; + TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "index", &index)); + if (index < 0 || index >= types->size()) { + return errors::InvalidArgument("Invalid argument number"); + } + (*types)[index] = n->output_type(0); + } + } + return Status::OK(); +} + // Renumber the indices of _Arg nodes in a graph, according to // 'permutation' that maps old indices to new indices. static Status RenumberArguments(Graph* graph, const std::vector<int>& permutation) { - for (Node* n : graph->nodes()) { + for (Node* n : graph->op_nodes()) { if (n->type_string() == kArgOp) { int index; - TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), "index", &index)); + TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "index", &index)); if (index < 0 || index >= permutation.size()) { return errors::InvalidArgument("Invalid argument number"); } @@ -604,19 +635,40 @@ Status EncapsulateSubgraphsPass::Run( // Optimize the subgraph. OptimizeGraph(flr.get(), subgraph); - std::vector<bool> const_args(input_permutation->size()); + const int num_args = input_permutation->size(); + std::vector<bool> const_args(num_args); TF_RETURN_IF_ERROR(BackwardsConstAnalysis(**subgraph, &const_args)); + DataTypeVector arg_types(num_args); + TF_RETURN_IF_ERROR(GetArgTypes(**subgraph, &arg_types)); + // Compute a permutation of the arguments such that the constant arguments // are first.
const int num_consts = std::count(const_args.begin(), const_args.end(), true); + + const int num_resources = + std::count(arg_types.begin(), arg_types.end(), DT_RESOURCE); + const int num_nonconsts = num_args - num_resources - num_consts; + if (num_nonconsts < 0) { + return errors::Internal("num_nonconsts should be >= 0, was ", + num_nonconsts); + } + int const_pos = 0; int arg_pos = num_consts; - for (int i = 0; i < const_args.size(); ++i) { + int resource_pos = num_consts + num_nonconsts; + for (int i = 0; i < num_args; ++i) { if (const_args[i]) { + if (arg_types[i] == DT_RESOURCE) { + return errors::Internal( + "Resource arguments cannot be constant (argument ", i, ")"); + } (*input_permutation)[i] = const_pos; ++const_pos; + } else if (arg_types[i] == DT_RESOURCE) { + (*input_permutation)[i] = resource_pos; + ++resource_pos; } else { (*input_permutation)[i] = arg_pos; ++arg_pos; @@ -631,12 +683,14 @@ Status EncapsulateSubgraphsPass::Run( AddNodeAttr(kXlaCompiledKernelAttr, true, node); AddNodeAttr(kXlaNumConstantArgsAttr, num_consts, node); + AddNodeAttr(kXlaNumResourceArgsAttr, num_resources, node); return Status::OK(); }; TF_RETURN_IF_ERROR(EncapsulateSubgraphsInFunctions( kXlaClusterAttr, **options.graph, rewrite_subgraph, - flags->tf_xla_parallel_checking, &graph_out, library)); + flags->tf_xla_parallel_checking, /*reuse_existing_functions=*/false, + &graph_out, library)); if (VLOG_IS_ON(1)) { dump_graph::DumpGraphToFile("after_encapsulate_subgraphs", *graph_out, @@ -650,7 +704,7 @@ Status EncapsulateSubgraphsPass::Run( bool IsXlaCompiledKernel(const Node& node) { bool is_compiled = false; bool has_compilation_attr = - GetNodeAttr(node.def(), kXlaCompiledKernelAttr, &is_compiled).ok() && + GetNodeAttr(node.attrs(), kXlaCompiledKernelAttr, &is_compiled).ok() && is_compiled; return has_compilation_attr ? is_compiled : false; } diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h index ffd39f0b77f..b0987f76c91 100644 --- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h +++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h @@ -34,6 +34,8 @@ namespace tensorflow { // 'input_permutation' and 'output_permutation' are initialized to the identity // permutation. 'nodedef' is the NodeDef for the call to the function under // construction, provided to allow additional attributes to be set. +// The rewrite may also change the NodeDef's operator name, and that +// name will be used as the name of the generated function. typedef std::function<Status( std::unique_ptr<Graph>* graph, std::vector<int>* input_permutation, std::vector<int>* output_permutation, NodeDef* node_def)> @@ -53,6 +55,9 @@ typedef std::function<Status( std::unique_ptr<Graph>* graph_out, FunctionLibraryDefinition* library); + bool reuse_existing_functions, std::unique_ptr<Graph>* graph_out, + FunctionLibraryDefinition* library); // The attribute that marks function calls produced by the encapsulate // subgraphs pass and that should in turn be compiled via _XlaLaunch operators. @@ -70,12 +76,22 @@ extern const char* const kXlaCompiledKernelAttr; // Does `node` have the kXlaCompiledKernelAttr attribute? bool IsXlaCompiledKernel(const Node& node); -// Functions produce by the EncapsulateSubgraphs pass have their arguments -// ordered such that compile-time constant arguments are first in the argument -// order. The functions are annotated with the following attribute giving the -// number of constant arguments.
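The permutation assembled above sorts a function's arguments into three runs: compile-time constants first, ordinary arguments next, `DT_RESOURCE` arguments last. A self-contained worked example with illustrative values:

```c++
#include <cassert>
#include <vector>

int main() {
  // arg 0: data, arg 1: compile-time const, arg 2: resource, arg 3: data.
  std::vector<bool> const_args = {false, true, false, false};
  std::vector<bool> resource_args = {false, false, true, false};
  const int num_args = 4, num_consts = 1, num_resources = 1;
  const int num_nonconsts = num_args - num_consts - num_resources;  // == 2

  std::vector<int> input_permutation(num_args);
  int const_pos = 0, arg_pos = num_consts;
  int resource_pos = num_consts + num_nonconsts;
  for (int i = 0; i < num_args; ++i) {
    if (const_args[i]) input_permutation[i] = const_pos++;
    else if (resource_args[i]) input_permutation[i] = resource_pos++;
    else input_permutation[i] = arg_pos++;
  }
  // Old positions {0,1,2,3} land at new positions {1,0,3,2}.
  assert((input_permutation == std::vector<int>{1, 0, 3, 2}));
}
```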
+// Functions produced by the EncapsulateSubgraphs pass have their arguments in +// the order: +// 1) compile-time constant arguments, in host memory, +// 2) other arguments, in device memory. +// 3) resource variable arguments, in host memory. Note that only the resource +// Tensor itself is in host memory; the underlying value may be in device +// memory. +// The functions are annotated with the following attributes that describe how +// many constant and resource arguments there are: + +// Name of the attribute containing the number of constant arguments. extern const char* const kXlaNumConstantArgsAttr; +// Name of the attribute containing the number of resource variable arguments. +extern const char* const kXlaNumResourceArgsAttr; + class EncapsulateSubgraphsPass : public GraphOptimizationPass { public: Status Run(const GraphOptimizationPassOptions& options) override; diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc index c85882e0d7f..4a1dbaf05dc 100644 --- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc +++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc @@ -13,16 +13,18 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include <utility> + #include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h" #include "tensorflow/cc/framework/ops.h" #include "tensorflow/cc/ops/standard_ops.h" #include "tensorflow/core/framework/function_testlib.h" -#include "tensorflow/core/graph/equal_graph_def.h" #include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/graph/graph_def_builder.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/util/equal_graph_def.h" namespace tensorflow { namespace { @@ -76,7 +78,7 @@ bool EqualFunctionDefLibrary(const FunctionDefLibrary& expected, #define TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(expected, actual) \ do { \ string diff; \ - EXPECT_TRUE(EqualFunctionDefLibrary(actual, expected, &diff)) \ + EXPECT_TRUE(EqualFunctionDefLibrary(expected, actual, &diff)) \ << diff << "\nActual: " << actual.DebugString(); \ } while (false) @@ -101,15 +103,15 @@ Node* Input(const GraphDefBuilder::Options& opts) { } Node* Unary(ops::NodeOut a, const GraphDefBuilder::Options& opts) { - return ops::UnaryOp("UnaryTest", a, opts); + return ops::UnaryOp("UnaryTest", std::move(a), opts); } Node* Binary(ops::NodeOut a, ops::NodeOut b, const GraphDefBuilder::Options& opts) { - return ops::BinaryOp("BinaryTest", a, b, opts); + return ops::BinaryOp("BinaryTest", std::move(a), std::move(b), opts); } -Node* AddNLike(std::vector<ops::NodeOut> inputs, +Node* AddNLike(const std::vector<ops::NodeOut>& inputs, const GraphDefBuilder::Options& opts) { if (opts.HaveError()) return nullptr; NodeBuilder node_builder(opts.GetNameForOp("AddN"), "AddNLikeTest", @@ -127,7 +129,7 @@ Node* RetOp(int index, ops::NodeOut a, const GraphDefBuilder::Options& opts) { if (opts.HaveError()) return nullptr; NodeBuilder node_builder(opts.GetNameForOp("Retval"), "_Retval", opts.op_registry()); - node_builder.Input(a).Attr("index", index); + node_builder.Input(std::move(a)).Attr("index", index); return opts.FinalizeBuilder(&node_builder); } @@ -144,8 +146,9 @@ Status Encapsulate(GraphDef* graphdef, FunctionDefLibrary* library) { std::unique_ptr<Graph> graph_out; s = EncapsulateSubgraphsInFunctions("_encapsulate", *graph, - /*
rewrite_subgraph_fn= */ {}, - /* parallel_checking= */ false, + /*rewrite_subgraph_fn=*/{}, + /*parallel_checking=*/false, + /*reuse_existing_functions=*/false, &graph_out, lib_def.get()); if (!s.ok()) return s; @@ -168,7 +171,7 @@ TEST(EncapsulateSubgraphsTest, NoFunctions) { GraphDef graphdef_in; FunctionDefLibrary library_in; - builder.ToGraphDef(&graphdef_in); + TF_EXPECT_OK(builder.ToGraphDef(&graphdef_in)); *library_in.add_function() = test::function::XTimesTwo(); GraphDef graphdef_out = graphdef_in; @@ -195,7 +198,7 @@ TEST(EncapsulateSubgraphsTest, OneFunction) { Node* d = Binary(b, c, b1.opts().WithName("c").WithControlInput(c).WithAttr( "_encapsulate", "F1")); Binary(a, d, b1.opts().WithName("E")); - b1.ToGraphDef(&graphdef); + TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); } TF_EXPECT_OK(Encapsulate(&graphdef, &library)); @@ -205,12 +208,12 @@ TEST(EncapsulateSubgraphsTest, OneFunction) { *library_expected.add_function() = test::function::XTimesTwo(); *library_expected.add_function() = FunctionDefHelper::Create( - "F1", {"input__0:float", "input__1:float"}, {"output__2:float"}, {}, + "F1", {"a_0_arg:float", "b_0_arg:float"}, {"c_0_retval:float"}, {}, { - {{"C"}, "UnaryTest", {"input__0"}}, - {{"c"}, "BinaryTest", {"input__1", "C:o:0"}, {}, {"C"}}, + {{"C"}, "UnaryTest", {"a_0_arg"}}, + {{"c"}, "BinaryTest", {"b_0_arg", "C:o:0"}, {}, {"C"}}, }, - {{"output__2", "c:o:0"}}); + {{"c_0_retval", "c:o:0"}}); { std::unique_ptr<FunctionLibraryDefinition> lib_def( @@ -224,7 +227,7 @@ TEST(EncapsulateSubgraphsTest, OneFunction) { Node* call = b2.opts().FinalizeBuilder(&node_builder); Binary(a, call, b2.opts().WithName("E")); - b2.ToGraphDef(&graphdef_expected); + TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected)); } // If there are no marked nodes, funcification should be a no-op. @@ -251,7 +254,7 @@ TEST(EncapsulateSubgraphsTest, TwoFunctions) { Binary(b, c, b1.opts().WithName("D").WithControlInput(control).WithAttr( "_encapsulate", "F2")); Binary(a, d, b1.opts().WithName("E")); - b1.ToGraphDef(&graphdef); + TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); } TF_EXPECT_OK(Encapsulate(&graphdef, &library)); @@ -261,17 +264,17 @@ TEST(EncapsulateSubgraphsTest, TwoFunctions) { *library_expected.add_function() = test::function::XTimesTwo(); *library_expected.add_function() = FunctionDefHelper::Create( - "F1", {"input__0:float"}, {"output__1:float"}, {}, + "F1", {"a_0_arg:float"}, {"c_0_retval:float"}, {}, { - {{"C"}, "UnaryTest", {"input__0"}}, + {{"C"}, "UnaryTest", {"a_0_arg"}}, }, - {{"output__1", "C:o:0"}}); + {{"c_0_retval", "C:o:0"}}); *library_expected.add_function() = FunctionDefHelper::Create( - "F2", {"input__0:float", "input__1:float"}, {"output__2:float"}, {}, + "F2", {"b_0_arg:float", "c_0_arg:float"}, {"d_0_retval:float"}, {}, { - {{"D"}, "BinaryTest", {"input__0", "input__1"}}, + {{"D"}, "BinaryTest", {"b_0_arg", "c_0_arg"}}, }, - {{"output__2", "D:o:0"}}); + {{"d_0_retval", "D:o:0"}}); { std::unique_ptr<FunctionLibraryDefinition> lib_def( @@ -290,7 +293,7 @@ TEST(EncapsulateSubgraphsTest, TwoFunctions) { Node* call2 = b2.opts().FinalizeBuilder(&nb2); Binary(a, call2, b2.opts().WithName("E")); - b2.ToGraphDef(&graphdef_expected); + TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected)); } // If there are no marked nodes, funcification should be a no-op.
@@ -340,7 +343,8 @@ TEST(EncapsulateSubgraphsTest, InputDeduplication) { std::unique_ptr<Graph> graph; TF_ASSERT_OK(EncapsulateSubgraphsInFunctions( "_cluster", graph_before_encapsulation, /*rewrite_subgraph_fn=*/{}, - /*parallel_checking=*/false, &graph, &library)); + /*parallel_checking=*/false, /*reuse_existing_functions=*/false, &graph, + &library)); std::vector<string> expected_nodes = {"cluster1", "cluster2", "mul", "x"}; EXPECT_EQ(expected_nodes, GraphNodes(*graph)); @@ -371,7 +375,8 @@ TEST(EncapsulateSubgraphsTest, ParallelChecking) { std::unique_ptr<Graph> graph; TF_ASSERT_OK(EncapsulateSubgraphsInFunctions( "_cluster", graph_before_encapsulation, /*rewrite_subgraph_fn=*/{}, - /*parallel_checking=*/true, &graph, &library)); + /*parallel_checking=*/true, /*reuse_existing_functions=*/false, &graph, + &library)); std::vector<string> expected_nodes = { "add1", "add2", "cluster1", "cluster1_parallel_check/_0", diff --git a/tensorflow/compiler/jit/graph_to_functiondef.cc b/tensorflow/compiler/jit/graph_to_functiondef.cc index f5b99226acd..5cdbebd88ee 100644 --- a/tensorflow/compiler/jit/graph_to_functiondef.cc +++ b/tensorflow/compiler/jit/graph_to_functiondef.cc @@ -120,14 +120,12 @@ Status GraphToFunctionDef(const Graph& graph, const string& name, std::unordered_map<string, string> return_values; NodeNameMapping node_names; - for (Node const* node : graph.nodes()) { - if (!node->IsOp()) continue; - + for (Node const* node : graph.op_nodes()) { if (node->type_string() == kArgOp) { int index; DataType type; - GetNodeAttr(node->def(), "T", &type); - GetNodeAttr(node->def(), "index", &index); + TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), "T", &type)); + TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), "index", &index)); while (fdef->signature().input_arg_size() <= index) { fdef->mutable_signature()->add_input_arg(); } @@ -143,8 +141,8 @@ Status GraphToFunctionDef(const Graph& graph, const string& name, if (node->type_string() == kRetValOp) { int index; DataType type; - GetNodeAttr(node->def(), "T", &type); - GetNodeAttr(node->def(), "index", &index); + TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), "T", &type)); + TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), "index", &index)); while (fdef->signature().output_arg_size() <= index) { fdef->mutable_signature()->add_output_arg(); } @@ -161,9 +159,11 @@ Status GraphToFunctionDef(const Graph& graph, const string& name, } NodeDef* node_def = fdef->add_node_def(); - node_def->CopyFrom(node->def()); + *node_def = node->def(); + if (!node->assigned_device_name().empty()) { + node_def->set_device(node->assigned_device_name()); + } node_def->set_name(node_names.Uniquify(node->name())); - node_def->clear_device(); // Reset input names based on graph rather than the NodeDef. node_def->clear_input(); @@ -185,7 +185,7 @@ Status GraphToFunctionDef(const Graph& graph, const string& name, } // Add regular inputs - for (int i = 0; i < in_edges.size(); ++i) { + for (std::vector<const Edge*>::size_type i = 0; i < in_edges.size(); ++i) { const Edge* edge = in_edges[i]; if (edge == nullptr) { return errors::InvalidArgument( @@ -204,8 +204,8 @@ Status GraphToFunctionDef(const Graph& graph, const string& name, // Populate tensor_renaming.
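One behavioral reversal in `GraphToFunctionDef` worth calling out: the old code stripped devices (`clear_device()`), while the new code keeps a device on each copied `NodeDef`, preferring the placer's `assigned_device_name()` over the user-requested one. In isolation (sketch):

```c++
#include "tensorflow/core/framework/node_def.pb.h"
#include "tensorflow/core/graph/graph.h"

// Copies a node's definition, letting the placer's assignment win over the
// user-requested device when one exists.
void CopyWithDevice(const tensorflow::Node& node, tensorflow::NodeDef* out) {
  *out = node.def();  // Keeps the requested device, if any.
  if (!node.assigned_device_name().empty()) {
    out->set_device(node.assigned_device_name());
  }
}
```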
NameRangeMap output_ranges; - TF_RETURN_IF_ERROR(NameRangesForNode(node->def(), node->op_def(), nullptr, - &output_ranges)); + TF_RETURN_IF_ERROR( + NameRangesForNode(*node, node->op_def(), nullptr, &output_ranges)); for (const auto& output : output_ranges) { for (int i = output.second.first; i < output.second.second; ++i) { const string tensor_name = strings::StrCat( diff --git a/tensorflow/compiler/jit/graph_to_functiondef_test.cc b/tensorflow/compiler/jit/graph_to_functiondef_test.cc index 04b2385c9c9..5c09e96a4c2 100644 --- a/tensorflow/compiler/jit/graph_to_functiondef_test.cc +++ b/tensorflow/compiler/jit/graph_to_functiondef_test.cc @@ -19,11 +19,11 @@ limitations under the License. #include "tensorflow/cc/ops/function_ops.h" #include "tensorflow/cc/ops/standard_ops.h" #include "tensorflow/core/framework/function_testlib.h" -#include "tensorflow/core/graph/equal_graph_def.h" #include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/graph/graph_def_builder.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/util/equal_graph_def.h" namespace tensorflow { namespace { @@ -54,7 +54,7 @@ TEST(GraphToFunctionDefTest, Basics) { auto h = ops::_Retval(root.WithOpName("H"), g, 0); GraphDef graph_def; - root.ToGraphDef(&graph_def); + TF_EXPECT_OK(root.ToGraphDef(&graph_def)); std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global())); GraphConstructorOptions options; diff --git a/tensorflow/compiler/jit/graphcycles/graphcycles.cc b/tensorflow/compiler/jit/graphcycles/graphcycles.cc index 87d5de09d14..bc68afb322b 100644 --- a/tensorflow/compiler/jit/graphcycles/graphcycles.cc +++ b/tensorflow/compiler/jit/graphcycles/graphcycles.cc @@ -76,7 +76,7 @@ struct GraphCycles::Rep { GraphCycles::GraphCycles() : rep_(new Rep) {} GraphCycles::~GraphCycles() { - for (int i = 0; i < rep_->nodes_.size(); i++) { + for (Vec::size_type i = 0; i < rep_->nodes_.size(); i++) { delete rep_->nodes_[i]; } delete rep_; } @@ -85,7 +85,7 @@ GraphCycles::~GraphCycles() { bool GraphCycles::CheckInvariants() const { Rep* r = rep_; NodeSet ranks; // Set of ranks seen so far. - for (int32 x = 0; x < r->nodes_.size(); x++) { + for (Vec::size_type x = 0; x < r->nodes_.size(); x++) { Node* nx = r->nodes_[x]; if (nx->visited) { LOG(FATAL) << "Did not clear visited marker on node " << x; @@ -108,7 +108,7 @@ int32 GraphCycles::NewNode() { if (rep_->free_nodes_.empty()) { Node* n = new Node; n->visited = false; - n->data = NULL; + n->data = nullptr; n->rank = rep_->nodes_.size(); rep_->nodes_.push_back(n); return n->rank; @@ -116,7 +116,7 @@ int32 GraphCycles::NewNode() { // Preserve preceding rank since the set of ranks in use must be // a permutation of [0,rep_->nodes_.size()-1]. int32 r = rep_->free_nodes_.back(); - rep_->nodes_[r]->data = NULL; + rep_->nodes_[r]->data = nullptr; rep_->free_nodes_.pop_back(); return r; } @@ -259,7 +259,7 @@ static void Reorder(GraphCycles::Rep* r) { r->deltaf_.end(), r->merged_.begin()); // Assign the ranks in order to the collected list.
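The loop-counter retyping in `graphcycles.cc` above is all one fix: comparing a signed `int`/`int32` against `size()` triggers `-Wsign-compare`, and the signed counter can in principle overflow on very large containers, so the counters now use the container's own `size_type`. The shape of the fix (illustrative):

```c++
#include <vector>

void Visit(const std::vector<int*>& nodes) {
  // size_type matches size() exactly; no signed/unsigned comparison warning.
  for (std::vector<int*>::size_type i = 0; i < nodes.size(); ++i) {
    // ... use nodes[i] ...
  }
}
```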
- for (int32 i = 0; i < r->list_.size(); i++) { + for (Vec::size_type i = 0; i < r->list_.size(); i++) { r->nodes_[r->list_[i]]->rank = r->merged_[i]; } } @@ -277,7 +277,7 @@ static void Sort(const Vec& nodes, Vec* delta) { } static void MoveToList(GraphCycles::Rep* r, Vec* src, Vec* dst) { - for (int32 i = 0; i < src->size(); i++) { + for (Vec::size_type i = 0; i < src->size(); i++) { int32 w = (*src)[i]; (*src)[i] = r->nodes_[w]->rank; // Replace src entry with its rank r->nodes_[w]->visited = false; // Prepare for future DFS calls @@ -286,7 +286,7 @@ static void MoveToList(GraphCycles::Rep* r, Vec* src, Vec* dst) { static void ClearVisitedBits(GraphCycles::Rep* r, const Vec& nodes) { - for (int32 i = 0; i < nodes.size(); i++) { + for (Vec::size_type i = 0; i < nodes.size(); i++) { r->nodes_[nodes[i]]->visited = false; } } @@ -332,7 +332,7 @@ int GraphCycles::FindPath(int32 x, int32 y, int max_path_len, } bool GraphCycles::IsReachable(int32 x, int32 y) const { - return FindPath(x, y, 0, NULL) > 0; + return FindPath(x, y, 0, nullptr) > 0; } bool GraphCycles::IsReachableNonConst(int32 x, int32 y) { diff --git a/tensorflow/compiler/jit/graphcycles/graphcycles_test.cc b/tensorflow/compiler/jit/graphcycles/graphcycles_test.cc index f27a616ac9d..e47b782207e 100644 --- a/tensorflow/compiler/jit/graphcycles/graphcycles_test.cc +++ b/tensorflow/compiler/jit/graphcycles/graphcycles_test.cc @@ -230,7 +230,7 @@ TEST(GraphCycles, RandomizedTest) { int new_node = graph_cycles.NewNode(); ASSERT_NE(-1, new_node); VLOG(1) << "adding node " << new_node; - ASSERT_EQ(0, graph_cycles.GetNodeData(new_node)); + ASSERT_EQ(nullptr, graph_cycles.GetNodeData(new_node)); graph_cycles.SetNodeData( new_node, reinterpret_cast<void*>( static_cast<intptr_t>(new_node + kDataOffset))); @@ -243,7 +243,7 @@ TEST(GraphCycles, RandomizedTest) { break; case 1: // Remove a node - if (nodes.size() > 0) { + if (!nodes.empty()) { int node_index = RandomNode(&rnd, &nodes); int node = nodes[node_index]; nodes[node_index] = nodes.back(); @@ -263,7 +263,7 @@ TEST(GraphCycles, RandomizedTest) { break; case 2: // Add an edge - if (nodes.size() > 0) { + if (!nodes.empty()) { int from = RandomNode(&rnd, &nodes); int to = RandomNode(&rnd, &nodes); if (EdgeIndex(&edges, nodes[from], nodes[to]) == -1) { @@ -282,7 +282,7 @@ TEST(GraphCycles, RandomizedTest) { break; case 3: // Remove an edge - if (edges.size() > 0) { + if (!edges.empty()) { int i = RandomEdge(&rnd, &edges); int from = edges[i].from; int to = edges[i].to; @@ -296,7 +296,7 @@ TEST(GraphCycles, RandomizedTest) { break; case 4: // Check a path - if (nodes.size() > 0) { + if (!nodes.empty()) { int from = RandomNode(&rnd, &nodes); int to = RandomNode(&rnd, &nodes); int32 path[2 * kMaxNodes]; @@ -343,7 +343,7 @@ TEST(GraphCycles, RandomizedTest) { ASSERT_NE(-1, new_node); VLOG(1) << "adding node " << new_node; ASSERT_GE(new_node, 0); - ASSERT_EQ(0, graph_cycles.GetNodeData(new_node)); + ASSERT_EQ(nullptr, graph_cycles.GetNodeData(new_node)); graph_cycles.SetNodeData( new_node, reinterpret_cast<void*>( static_cast<intptr_t>(new_node + kDataOffset))); diff --git a/tensorflow/compiler/jit/kernels/BUILD b/tensorflow/compiler/jit/kernels/BUILD new file mode 100644 index 00000000000..c4116cb8b52 --- /dev/null +++ b/tensorflow/compiler/jit/kernels/BUILD @@ -0,0 +1,74 @@ +licenses(["notice"]) # Apache 2.0 + +package( + default_visibility = [ + "//tensorflow/compiler/tf2xla:internal", + ], +) + +cc_library( + name = "xla_local_launch_op", + srcs = ["xla_local_launch_op.cc"], + hdrs = ["xla_local_launch_op.h"], + deps = [ +
"//tensorflow/compiler/jit:common", + "//tensorflow/compiler/jit:xla_compilation_cache", + "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/tf2xla:xla_local_runtime_context", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla/client:client_library", + "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core:tensorflow_opensource", + ], + alwayslink = 1, +) + +cc_library( + name = "xla_device_launch_op", + srcs = ["xla_device_launch_op.cc"], + hdrs = ["xla_device_launch_op.h"], + deps = [ + "//tensorflow/compiler/jit:common", + "//tensorflow/compiler/jit:xla_compilation_cache", + "//tensorflow/compiler/jit:xla_device", + "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:tensorflow_opensource", + "//tensorflow/core/kernels:variable_ops", + ], +) + +cc_library( + name = "parallel_check_op", + srcs = ["parallel_check_op.cc"], + visibility = ["//tensorflow/compiler/jit:friends"], + deps = [ + "//tensorflow/compiler/jit/legacy_flags:parallel_check_op_flags", + "//tensorflow/core:core_cpu", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + ], + alwayslink = 1, +) + +# ----------------------------------------------------------------------------- + +filegroup( + name = "all_files", + srcs = glob( + ["**/*"], + exclude = [ + "**/METADATA", + "**/OWNERS", + ], + ), + visibility = ["//tensorflow:__subpackages__"], +) diff --git a/tensorflow/compiler/jit/kernels/parallel_check_op.cc b/tensorflow/compiler/jit/kernels/parallel_check_op.cc new file mode 100644 index 00000000000..c86e03118b5 --- /dev/null +++ b/tensorflow/compiler/jit/kernels/parallel_check_op.cc @@ -0,0 +1,144 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/jit/legacy_flags/parallel_check_op_flags.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" + +namespace tensorflow { +namespace { + +// Inputs 2*N tensors, outputs the first N inputs. +// Logs errors if input tensor i and i + N are not (near) identical +// in any position. 
+class ParallelCheckOp : public OpKernel { + public: + explicit ParallelCheckOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + template <typename T> + int CompareTensors(DataType dtype, const char* v0, const char* v1, + int64 num_elts, int input_idx) { + int failed = 0; + const T* p0 = reinterpret_cast<const T*>(v0); + const T* p1 = reinterpret_cast<const T*>(v1); + double rtol; + legacy_flags::ParallelCheckOpFlags* flags = + legacy_flags::GetParallelCheckOpFlags(); + if (!tensorflow::strings::safe_strtod(flags->parallel_check_rtol.c_str(), + &rtol)) { + LOG(ERROR) << "can't convert parallel_check_rtol " + << flags->parallel_check_rtol << " to double"; + } + double atol; + if (!tensorflow::strings::safe_strtod(flags->parallel_check_atol.c_str(), + &atol)) { + LOG(ERROR) << "can't convert parallel_check_atol " + << flags->parallel_check_atol << " to double"; + } + for (int i = 0; i < num_elts; ++i) { + bool ok = (p0[i] == p1[i]); + VLOG(2) << "output " << input_idx << " element " << i << ": " << p0[i]; + if (!ok) { + if (std::is_same<T, float>::value || std::is_same<T, double>::value) { + float tolerance = + std::max(atol, std::max(fabs(rtol * p0[i]), fabs(rtol * p1[i]))); + T diff = p0[i] - p1[i]; + if (diff < 0) diff = 0 - diff; + ok = (diff <= tolerance); + } + if (ok) continue; + LOG(ERROR) << "Op " << def().name() << " fails equality at output " + << input_idx << " type " << DataTypeString(dtype) + << " element " << i << ": std_val=" << p0[i] + << " test_val=" << p1[i] << " diff=" << (p0[i] - p1[i]); + if (++failed > 10) break; + } + } + return failed; + } + + void Compute(OpKernelContext* ctx) override { + VLOG(1) << "Compute " << def().name(); + const int num_pairs = ctx->num_inputs() / 2; + for (int i = 0; i < num_pairs; ++i) { + CHECK_EQ(ctx->input_dtype(i), ctx->input_dtype(i + num_pairs)); + Tensor t0 = ctx->input(i); + Tensor t1 = ctx->input(i + num_pairs); + int64 num_elts = t0.NumElements(); + CHECK_EQ(num_elts, t1.NumElements()); + + // Compare inputs elementwise for near-exact equality. + const char* v0 = t0.tensor_data().data(); + const char* v1 = t1.tensor_data().data(); + int failed = 0; + switch (ctx->input_dtype(i)) { + case DT_INT32: + failed = + CompareTensors<int32>(ctx->input_dtype(i), v0, v1, num_elts, i); + break; + case DT_INT64: + failed = + CompareTensors<int64>(ctx->input_dtype(i), v0, v1, num_elts, i); + break; + case DT_FLOAT: + failed = + CompareTensors<float>(ctx->input_dtype(i), v0, v1, num_elts, i); + break; + case DT_DOUBLE: + failed = + CompareTensors<double>(ctx->input_dtype(i), v0, v1, num_elts, i); + break; + case DT_BOOL: + failed = + CompareTensors<bool>(ctx->input_dtype(i), v0, v1, num_elts, i); + break; + default: + LOG(FATAL) << "unimpl: " << ctx->input_dtype(i); + } + if (failed > 0) { + LOG(ERROR) << "check failed for " << def().name() << " output " << i + << " num_elts: " << num_elts; + legacy_flags::ParallelCheckOpFlags* flags = + legacy_flags::GetParallelCheckOpFlags(); + if (flags->parallel_check_failfast) { + LOG(QFATAL) << "failfast on first parallel-check failure"; + } + } else { + VLOG(1) << "check passed for " << def().name() << " output " << i + << " num_elts: " << num_elts; + } + + // Propagate the std value.
+ if (IsRefType(ctx->input_dtype(i))) { + ctx->forward_ref_input_to_ref_output(i, i); + } else { + ctx->set_output(i, ctx->input(i)); + } + } + } + + TF_DISALLOW_COPY_AND_ASSIGN(ParallelCheckOp); +}; + +REGISTER_KERNEL_BUILDER(Name("ParallelCheck").Device(DEVICE_CPU), + ParallelCheckOp); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/jit/kernels/xla_device_launch_op.cc b/tensorflow/compiler/jit/kernels/xla_device_launch_op.cc new file mode 100644 index 00000000000..29c5ff72429 --- /dev/null +++ b/tensorflow/compiler/jit/kernels/xla_device_launch_op.cc @@ -0,0 +1,253 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/jit/kernels/xla_device_launch_op.h" + +#include "tensorflow/compiler/jit/defs.h" +#include "tensorflow/compiler/jit/xla_device.h" +#include "tensorflow/compiler/jit/xla_device_context.h" +#include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/core/common_runtime/dma_helper.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/variable_ops.h" +#include "tensorflow/core/lib/gtl/flatmap.h" +#include "tensorflow/core/platform/env.h" + +namespace tensorflow { + +namespace { + +Status BuildCompilationCache(ResourceMgr* rm, XlaCompilationCache** cache) { + XlaDevice::Metadata* metadata; + Status s = rm->Lookup<XlaDevice::Metadata>(rm->default_container(), + "xla_metadata", &metadata); + if (!s.ok()) { + return s; + } + core::ScopedUnref metadata_ref(metadata); + *cache = + new XlaCompilationCache(metadata->client(), metadata->jit_device_type()); + return Status::OK(); +} + +} // namespace + +XlaDeviceLaunchOp::XlaDeviceLaunchOp(OpKernelConstruction* ctx) + : OpKernel(ctx) { + const NameAttrList* func; + OP_REQUIRES_OK(ctx, ctx->GetAttr("function", &func)); + function_ = *func; + VLOG(1) << "XlaDeviceLaunch created function=" + << Canonicalize(function_.name(), AttrSlice(&function_.attr())); + DataTypeVector constant_types; + OP_REQUIRES_OK(ctx, ctx->GetAttr("Tconstants", &constant_types)); + num_constant_args_ = constant_types.size(); + OP_REQUIRES_OK(ctx, ctx->GetAttr("Nresources", &num_resource_args_)); +} + +std::vector<OptionalTensor> SnapshotResourceVariables(OpKernelContext* ctx, + int num_variables) { + std::vector<OptionalTensor> snapshot(num_variables); + int first_variable = ctx->num_inputs() - num_variables; + for (int i = 0; i < num_variables; ++i) { + Var* variable = nullptr; + ResourceHandle handle = HandleFromInput(ctx, first_variable + i); + if (LookupResource(ctx, handle, &variable).ok()) { + mutex_lock lock(*variable->mu()); + snapshot[i].name = handle.name(); + snapshot[i].present = true; + snapshot[i].value = *variable->tensor(); + } + } + return
+ +void XlaDeviceLaunchOp::Compute(OpKernelContext* ctx) { + VLOG(1) << "XlaDeviceLaunch::Compute " + << Canonicalize(function_.name(), AttrSlice(&function_.attr())); + // We store information about the JIT-compiled XLA computation + // in the ResourceMgr. + ResourceMgr* rm = ctx->resource_manager(); + OP_REQUIRES(ctx, rm, errors::Internal("No resource manager.")); + + XlaCompilationCache* cache; + OP_REQUIRES_OK(ctx, rm->LookupOrCreate<XlaCompilationCache>( + rm->default_container(), "xla_compiler", &cache, + [rm](XlaCompilationCache** cache) { + return BuildCompilationCache(rm, cache); + })); + // Holds the reference to the JIT during evaluation. (We could probably + // free it sooner because the ResourceMgr will retain a reference, but + // this is more obviously correct.) + core::ScopedUnref cache_ref(cache); + + std::vector<OptionalTensor> variables = + SnapshotResourceVariables(ctx, num_resource_args_); + + XlaCompiler::Options options; + options.client = cache->client(); + options.device_type = &cache->device_type(); + options.flib_def = ctx->function_library()->GetFunctionLibraryDefinition(); + options.graph_def_version = ctx->function_library()->graph_def_version(); + options.allow_cpu_custom_calls = false; + options.local_executable_has_hybrid_result = false; + + const XlaCompiler::CompilationResult* kernel; + OP_REQUIRES_OK(ctx, cache->Compile(options, function_, num_constant_args_, + variables, ctx, &kernel, nullptr)); + + VLOG(1) << "XLA compilation complete..."; + + OP_REQUIRES(ctx, ctx->num_outputs() == kernel->outputs.size(), + errors::Internal("Unexpected number of outputs")); + + // Runs the computation, if any. There might not be a computation if all + // outputs were compile-time constants. + std::vector<std::unique_ptr<xla::GlobalData>> outputs; + if (!kernel->computation->IsNull()) { + auto opaque_shape = xla::ShapeUtil::MakeOpaqueShape(); + + // Builds the inputs to the computation. + std::vector<std::shared_ptr<xla::GlobalData>> arg_handles( + kernel->input_mapping.size()); + std::vector<xla::GlobalData*> arg_ptrs(kernel->input_mapping.size()); + + // Adds the argument tensors. + const int first_variable_arg = ctx->num_inputs() - num_resource_args_; + for (int i = 0; i < kernel->input_mapping.size(); ++i) { + int op_input_num = kernel->input_mapping[i]; + + if (op_input_num >= first_variable_arg) { + arg_handles[i] = XlaTransferManager::GetTensorGlobalData( + variables[op_input_num - first_variable_arg].value); + } else { + arg_handles[i] = + XlaTransferManager::GetTensorGlobalData(ctx->input(op_input_num)); + } + arg_ptrs[i] = arg_handles[i].get(); + } + + // Execute the computation.
+ xla::ExecutionProfile profile; + xla::ExecutionOptions execution_options; + *execution_options.mutable_shape_with_output_layout() = + kernel->xla_output_shape; + Env* env = Env::Default(); + auto start_time = env->NowMicros(); + VLOG(1) << "Executing XLA Computation..."; + auto result = cache->client()->Execute(*kernel->computation, arg_ptrs, + &execution_options, &profile); + auto elapsed = env->NowMicros() - start_time; + OP_REQUIRES(ctx, result.ok(), result.status()); + + VLOG(1) << "Elapsed time: " << elapsed << "us"; + VLOG(1) << "ExecutionProfile: " << profile.DebugString(); + + if (xla::ShapeUtil::IsTuple(kernel->xla_output_shape)) { + auto outputs_or_error = + cache->client()->DeconstructTuple(*result.ValueOrDie()); + OP_REQUIRES(ctx, outputs_or_error.ok(), outputs_or_error.status()); + outputs = outputs_or_error.ConsumeValueOrDie(); + } else { + outputs.push_back(result.ConsumeValueOrDie()); + } + } + + XlaDeviceContext* device_context = ctx->op_device_context<XlaDeviceContext>(); + + // Copy XLA outputs to the operator's outputs. + VLOG(2) << "Setting operator output"; + int output_num = 0; + for (int i = 0; i < ctx->num_outputs(); ++i) { + Tensor* output; + OP_REQUIRES_OK(ctx, + ctx->allocate_output(i, kernel->outputs[i].shape, &output)); + + if (kernel->outputs[i].is_constant) { + // TODO(phawkins): mark constant _XlaLaunch outputs as HostMemory and + // remove the copy from this code. + Status status; + device_context->CopyCPUTensorToDevice( + &kernel->outputs[i].constant_value, nullptr, output, + [&status](const Status& s) { status = s; }); + if (!status.ok()) { + ctx->SetStatus(status); + return; + } + } else { + CHECK_LT(output_num, outputs.size()); + XlaTransferManager::SetTensorGlobalData( + std::shared_ptr<xla::GlobalData>(std::move(outputs[output_num])), + output); + ++output_num; + } + } + + // Apply variable updates, if any. + VLOG(2) << "Applying variable updates"; + for (int i = 0; i < kernel->variable_updates.size(); ++i) { + const XlaCompiler::VariableUpdate& write = kernel->variable_updates[i]; + OP_REQUIRES(ctx, + write.input_index >= 0 && write.input_index < ctx->num_inputs(), + errors::Internal("Invalid input index for variable write.")); + // This code is very close to being a clone of AssignVariableOp, but the + // key difference is that the contents of an XLA device tensor cannot be + // copied safely; instead we must use + // XlaTransferManager::SetTensorGlobalData. + Var* variable = nullptr; + // TODO(b/35625933): tensorflow::Var should contain a PersistentTensor, not + // a Tensor. + OP_REQUIRES_OK(ctx, LookupOrCreateResource<Var>( + ctx, HandleFromInput(ctx, write.input_index), + &variable, [this, ctx, &write](Var** ptr) { + *ptr = new Var(write.type); + PersistentTensor unused; + Tensor* tmp; + TF_RETURN_IF_ERROR(ctx->allocate_persistent( + write.type, write.shape, &unused, &tmp)); + *(*ptr)->tensor() = *tmp; + return Status::OK(); + })); + core::ScopedUnref s(variable); + + mutex_lock ml(*variable->mu()); + OP_REQUIRES(ctx, variable->tensor()->dtype() == write.type, + errors::Internal("Mismatched type in variable write")); + if (!variable->tensor()->shape().IsSameSize(write.shape)) { + PersistentTensor unused; + Tensor* tmp; + OP_REQUIRES_OK(ctx, ctx->allocate_persistent(write.type, write.shape, + &unused, &tmp)); + *variable->tensor() = *tmp; + } + XlaTransferManager::SetTensorGlobalData( + std::shared_ptr<xla::GlobalData>(std::move(outputs[output_num])), + variable->tensor()); + ++output_num; + } + + VLOG(1) << "Done"; +} + +XlaDeviceLaunchOp::~XlaDeviceLaunchOp() { + VLOG(1) << "XlaDeviceLaunch destroyed"; +} + +} // namespace tensorflow
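The LookupOrCreate/ScopedUnref dance in Compute is a general ResourceMgr idiom: the caller receives its own counted reference and drops it when Compute returns, while the ResourceMgr keeps the cache alive between invocations. A toy stand-alone sketch of the idiom (plain C++; not the real ResourceMgr API, which also takes containers and mutexes and returns Status):

```c++
#include <cassert>
#include <functional>
#include <string>
#include <unordered_map>

// Toy reference counting, mirroring tensorflow::core::RefCounted's contract.
class RefCounted {
 public:
  void Ref() { ++refs_; }
  void Unref() { if (--refs_ == 0) delete this; }
  virtual ~RefCounted() = default;
 private:
  int refs_ = 1;  // The creating registry holds the initial reference.
};

// Releases the caller's reference at end of scope, like core::ScopedUnref.
class ScopedUnref {
 public:
  explicit ScopedUnref(RefCounted* obj) : obj_(obj) {}
  ~ScopedUnref() { if (obj_) obj_->Unref(); }
 private:
  RefCounted* obj_;
};

struct ToyCache : RefCounted {};

// Looks up a named resource, creating it on first use; the caller receives
// its own counted reference in addition to the registry's.
ToyCache* LookupOrCreate(std::unordered_map<std::string, ToyCache*>& registry,
                         const std::string& name,
                         const std::function<ToyCache*()>& create) {
  auto it = registry.find(name);
  if (it == registry.end()) it = registry.emplace(name, create()).first;
  it->second->Ref();
  return it->second;
}

int main() {
  std::unordered_map<std::string, ToyCache*> registry;
  ToyCache* cache = LookupOrCreate(registry, "xla_compiler",
                                   [] { return new ToyCache; });
  ScopedUnref cache_ref(cache);  // Held for the duration of "Compute".
  assert(cache == registry["xla_compiler"]);
  return 0;  // cache_ref drops our reference; the registry keeps its own.
}
```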
diff --git a/tensorflow/compiler/jit/kernels/xla_device_launch_op.h b/tensorflow/compiler/jit/kernels/xla_device_launch_op.h new file mode 100644 index 00000000000..65516163c91 --- /dev/null +++ b/tensorflow/compiler/jit/kernels/xla_device_launch_op.h @@ -0,0 +1,65 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_KERNELS_XLA_DEVICE_LAUNCH_OP_H_ +#define TENSORFLOW_COMPILER_JIT_KERNELS_XLA_DEVICE_LAUNCH_OP_H_ + +#include "tensorflow/compiler/jit/xla_compilation_cache.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/platform/macros.h" + +namespace tensorflow { + +// Takes a snapshot of the values of resource variable arguments, which are +// the last `num_variables` arguments. We snapshot tensors that back +// resource variables since concurrent updates may modify the shape, and it is +// important that the shapes used for compilation match the true shapes of the +// buffers. +std::vector<OptionalTensor> SnapshotResourceVariables(OpKernelContext* ctx, + int num_variables); + +// The XlaDeviceLaunchOp is used to replace a region of the TensorFlow graph +// which will be compiled and executed using XLA. The XlaDeviceLaunchOp is +// responsible for handling interactions with the TensorFlow executor. +// Once all inputs are present, and their shapes are known, the op can +// use a 'XlaCompilationCache' to compile and execute code which is specific +// to the shapes of input Tensors. +class XlaDeviceLaunchOp : public OpKernel { + public: + explicit XlaDeviceLaunchOp(OpKernelConstruction* ctx); + ~XlaDeviceLaunchOp() override; + + void Compute(OpKernelContext* ctx) override; + + private: + NameAttrList function_; + + // Number of compile-time constant arguments.
+ int num_constant_args_; + + // Number of resource variable arguments. + int num_resource_args_; + + Tensor dummy_tensor_; + + TF_DISALLOW_COPY_AND_ASSIGN(XlaDeviceLaunchOp); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_KERNELS_XLA_DEVICE_LAUNCH_OP_H_ diff --git a/tensorflow/compiler/jit/xla_local_launch_op.cc b/tensorflow/compiler/jit/kernels/xla_local_launch_op.cc similarity index 76% rename from tensorflow/compiler/jit/xla_local_launch_op.cc rename to tensorflow/compiler/jit/kernels/xla_local_launch_op.cc index acf2ccb8903..40acc0d81d0 100644 --- a/tensorflow/compiler/jit/xla_local_launch_op.cc +++ b/tensorflow/compiler/jit/kernels/xla_local_launch_op.cc @@ -13,11 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/jit/xla_local_launch_op.h" +#include "tensorflow/compiler/jit/kernels/xla_local_launch_op.h" #include "tensorflow/compiler/jit/defs.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/compiler/tf2xla/xla_local_runtime_context.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/statusor.h" @@ -37,20 +38,9 @@ namespace gpu = perftools::gputools; namespace tensorflow { -REGISTER_OP("_XlaLaunch") - .Input("constants: Tconstants") - .Attr("Tconstants: list(type) >= 0") - .Input("args: Targs") - .Attr("Targs: list(type) >= 0") - .Output("results: Tresults") - .Attr("Tresults: list(type) >= 0") - .Attr("function: func") - // XLA random-number generation ops are stateful. - // TODO(phawkins): create stateful and non-stateful variants of _XlaLaunch. - .SetIsStateful() - .Doc("XLA Launch Op. For use by the XLA JIT only."); - // Adapter class that wraps a Tensorflow allocator as an XLA allocator. +// Assumes that the Tensorflow allocator permits asynchronous deallocation: +// see comment on `AllowsAsynchronousDeallocation()`. class XlaAllocator : public xla::DeviceMemoryAllocator { public: XlaAllocator(const perftools::gputools::Platform* platform, @@ -66,6 +56,15 @@ class XlaAllocator : public xla::DeviceMemoryAllocator { Status MakeTensorFromBuffer(gpu::DeviceMemoryBase buffer, DataType dtype, const TensorShape& shape, Tensor* tensor) const; + // The Tensorflow BFC allocator used on GPU allows host-side deallocation + // before GPU execution takes place. Tensorflow uses the ordering of the main + // compute stream to enforce a happens-before relationship between a memory + // allocation and code that reuses the same memory. If Tensorflow adds + // support for multiple GPU streams or allocators with different ordering + // requirements, this code may need to change. + // (This attribute has no effect on CPU.) 
+ bool AllowsAsynchronousDeallocation() const override { return true; } + private: OpKernelContext* const op_context_; @@ -143,45 +142,51 @@ XlaLocalLaunchOp::XlaLocalLaunchOp(OpKernelConstruction* ctx) DataTypeVector constant_types; OP_REQUIRES_OK(ctx, ctx->GetAttr("Tconstants", &constant_types)); num_constant_args_ = constant_types.size(); + + int num_resource_args; + OP_REQUIRES_OK(ctx, ctx->GetAttr("Nresources", &num_resource_args)); + OP_REQUIRES(ctx, num_resource_args == 0, + errors::Unimplemented( + "XlaLocalLaunchOp does not support resource variables")); + if (device_type_ == DeviceType(DEVICE_CPU)) { + platform_id_ = gpu::host::kHostPlatformId; + } else if (device_type_ == DeviceType(DEVICE_GPU)) { + platform_id_ = gpu::cuda::kCudaPlatformId; + } else { + ctx->SetStatus( + errors::InvalidArgument("Unknown device type for local _XlaLaunch")); + return; + } } -Status XlaLocalLaunchOp::BuildCompilationCache(XlaCompilationCache** compiler) { - gpu::Platform::Id platform_id; - if (device_type_ == DeviceType(DEVICE_CPU)) { - platform_id = gpu::host::kHostPlatformId; - } else if (device_type_ == DeviceType(DEVICE_GPU)) { - platform_id = gpu::cuda::kCudaPlatformId; - } else { - return errors::InvalidArgument("Unknown device type for local _XlaLaunch"); - } - - auto platform = gpu::MultiPlatformManager::PlatformWithId(platform_id); +Status XlaLocalLaunchOp::BuildCompilationCache(OpKernelContext* ctx, + XlaCompilationCache** cache) { + auto platform = gpu::MultiPlatformManager::PlatformWithId(platform_id_); if (!platform.ok()) { return StreamExecutorUtil::ConvertStatus(platform.status()); } - auto client = - xla::ClientLibrary::GetOrCreateLocalClient(platform.ValueOrDie()); + xla::LocalClientOptions client_options; + client_options.set_platform(platform.ValueOrDie()); + client_options.set_intra_op_parallelism_threads( + ctx->device()->tensorflow_cpu_worker_threads()->num_threads); + auto client = xla::ClientLibrary::GetOrCreateLocalClient(client_options); if (!client.ok()) { return client.status(); } - const string* compiler_device; - if (!XlaOpRegistry::GetJitDevice(device_type_.type(), &compiler_device, - /*requires_jit=*/nullptr)) { + const XlaOpRegistry::DeviceRegistration* registration; + if (!XlaOpRegistry::GetCompilationDevice(device_type_.type(), + ®istration)) { return errors::InvalidArgument("No JIT device registered for ", device_type_.type()); } - XlaCompiler::Options options; - options.device_type = DeviceType(*compiler_device); - options.client = client.ValueOrDie(); - options.allow_cpu_custom_calls = (platform_id == gpu::host::kHostPlatformId); - options.local_executable_has_hybrid_result = true; - *compiler = new XlaCompilationCache(options); + *cache = new XlaCompilationCache( + client.ValueOrDie(), DeviceType(registration->compilation_device_name)); return Status::OK(); } void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) { VLOG(1) << "XlaLocalLaunchOp::Compute " - << Canonicalize(function_.name(), function_.attr()); + << Canonicalize(function_.name(), AttrSlice(&function_.attr())); // We store information about the JIT-compiled XLA computation // in the ResourceMgr. ResourceMgr* rm = ctx->resource_manager(); @@ -190,25 +195,31 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) { gpu::Stream* stream = ctx->op_device_context() ? 
ctx->op_device_context()->stream() : nullptr; - XlaCompilationCache* compiler; - OP_REQUIRES_OK(ctx, - rm->LookupOrCreate<XlaCompilationCache>( - rm->default_container(), "xla_compiler", &compiler, - [this](XlaCompilationCache** compiler) { - return BuildCompilationCache(compiler); - })); + XlaCompilationCache* cache; + OP_REQUIRES_OK(ctx, rm->LookupOrCreate<XlaCompilationCache>( + rm->default_container(), "xla_cache", &cache, + [this, ctx](XlaCompilationCache** cache) { + return BuildCompilationCache(ctx, cache); + })); // Hold the reference to the JIT during evaluation. (We could probably // free it sooner because the ResourceMgr will retain a reference, but // this is more obviously correct.) - core::ScopedUnref compiler_ref(compiler); + core::ScopedUnref cache_ref(cache); - xla::LocalClient* client = static_cast<xla::LocalClient*>(compiler->client()); + xla::LocalClient* client = static_cast<xla::LocalClient*>(cache->client()); + + XlaCompiler::Options options; + options.client = client; + options.device_type = &cache->device_type(); + options.flib_def = ctx->function_library()->GetFunctionLibraryDefinition(); + options.graph_def_version = ctx->function_library()->graph_def_version(); + options.allow_cpu_custom_calls = (platform_id_ == gpu::host::kHostPlatformId); + options.local_executable_has_hybrid_result = true; const XlaCompiler::CompilationResult* kernel; xla::LocalExecutable* executable; - OP_REQUIRES_OK(ctx, - compiler->Compile(function_, num_constant_args_, ctx, &kernel, - &executable)); + OP_REQUIRES_OK(ctx, cache->Compile(options, function_, num_constant_args_, {}, + ctx, &kernel, &executable)); VLOG(1) << "Executing XLA Computation..."; @@ -218,7 +229,7 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) { std::unique_ptr<xla::ShapedBuffer> output; bool output_is_tuple; - if (!kernel->computation.IsNull()) { + if (!kernel->computation->IsNull()) { // Build xla::ShapedBuffers that point directly to the Tensor buffers. std::vector<std::unique_ptr<xla::ShapedBuffer>> arg_buffers; arg_buffers.reserve(kernel->xla_input_shapes.size() + 1); @@ -227,8 +238,8 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) { // Pass remaining parameters. for (int i = 0; i < kernel->xla_input_shapes.size(); ++i) { - int arg_num = kernel->xla_input_shapes[i].first; - const xla::Shape& shape = kernel->xla_input_shapes[i].second; + int arg_num = kernel->input_mapping[i]; + const xla::Shape& shape = kernel->xla_input_shapes[i]; gpu::DeviceMemoryBase dmem( const_cast<char*>(ctx->input(arg_num).tensor_data().data()), ctx->input(arg_num).tensor_data().size()); @@ -316,10 +327,9 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) { } Tensor output_tensor; // Looks up the owning Tensor by buffer address. - OP_REQUIRES_OK( - ctx, - xla_allocator.MakeTensorFromBuffer( - buffer, ctx->expected_output_dtype(i), shape, &output_tensor)); + OP_REQUIRES_OK(ctx, xla_allocator.MakeTensorFromBuffer( + buffer, ctx->expected_output_dtype(i), shape, + &output_tensor)); ctx->set_output(i, output_tensor); ++output_num; } diff --git a/tensorflow/compiler/jit/xla_local_launch_op.h b/tensorflow/compiler/jit/kernels/xla_local_launch_op.h similarity index 76% rename from tensorflow/compiler/jit/xla_local_launch_op.h rename to tensorflow/compiler/jit/kernels/xla_local_launch_op.h index 96ae664cbe2..5e4d3336a91 100644 --- a/tensorflow/compiler/jit/xla_local_launch_op.h +++ b/tensorflow/compiler/jit/kernels/xla_local_launch_op.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License.
==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_JIT_XLA_LOCAL_LAUNCH_OP_H_ -#define TENSORFLOW_COMPILER_JIT_XLA_LOCAL_LAUNCH_OP_H_ +#ifndef TENSORFLOW_COMPILER_JIT_KERNELS_XLA_LOCAL_LAUNCH_OP_H_ +#define TENSORFLOW_COMPILER_JIT_KERNELS_XLA_LOCAL_LAUNCH_OP_H_ #include "tensorflow/compiler/jit/xla_compilation_cache.h" #include "tensorflow/core/framework/allocator.h" @@ -22,6 +22,7 @@ limitations under the License. #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/util/stream_executor_util.h" namespace tensorflow { @@ -31,8 +32,9 @@ namespace tensorflow { // Once all inputs are present, and their shapes are known, the op can // use a 'XlaCompilationCache' to compile and execute code which is specific // to the shapes of input Tensors. -// XlaLocalLaunchOp uses xla::LocalClient::ExecuteLocally and passes -// arguments into/out of XLA in device memory. +// XlaLocalLaunchOp uses xla::LocalClient::Compile() and +// xla::LocalExecutable::Run(), and passes arguments into/out of XLA in device +// memory. class XlaLocalLaunchOp : public OpKernel { public: explicit XlaLocalLaunchOp(OpKernelConstruction* ctx); @@ -42,14 +44,18 @@ class XlaLocalLaunchOp : public OpKernel { private: // Builds a XlaCompilationCache class suitable for the current device. - Status BuildCompilationCache(XlaCompilationCache** compiler); + Status BuildCompilationCache(OpKernelContext* ctx, + XlaCompilationCache** compiler); DeviceType device_type_; NameAttrList function_; int num_constant_args_; + + perftools::gputools::Platform::Id platform_id_; + TF_DISALLOW_COPY_AND_ASSIGN(XlaLocalLaunchOp); }; } // namespace tensorflow -#endif // TENSORFLOW_COMPILER_JIT_XLA_LOCAL_LAUNCH_OP_H_ +#endif // TENSORFLOW_COMPILER_JIT_KERNELS_XLA_LOCAL_LAUNCH_OP_H_ diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc index 725c969c051..f1fef85f994 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc @@ -24,8 +24,9 @@ limitations under the License. #include "tensorflow/compiler/jit/defs.h" #include "tensorflow/compiler/jit/graphcycles/graphcycles.h" #include "tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h" +#include "tensorflow/compiler/jit/union_find.h" #include "tensorflow/compiler/tf2xla/dump_graph.h" -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/framework/graph_def_util.h" #include "tensorflow/core/framework/memory_types.h" @@ -50,22 +51,24 @@ bool HasXLAKernel(const Node& node, const DeviceType& jit_device_type) { } // Make sure we don't recurse infinitely on recursive functions. -const int kMaxRecursionDepth = 5; +const int kMaxRecursionDepth = 10; -bool IsCompilableCall(const NodeDef& call_def, DeviceType jit_device_type, - int depth, FunctionLibraryRuntime* lib_runtime); +bool IsCompilableCall(const NodeDef& call_def, + const DeviceType& jit_device_type, int depth, + FunctionLibraryRuntime* lib_runtime); -// Tests whether 'while_def' is a completely compilable loop. +// Tests whether 'while_node' is a completely compilable loop. // Every operator in the condition and body functions must be compilable for a // while loop to be compilable. 
-bool IsCompilableWhile(const NodeDef& while_def, DeviceType jit_device_type, - int depth, FunctionLibraryRuntime* lib_runtime) { - VLOG(2) << "Loop marking: " << while_def.op(); +bool IsCompilableWhile(const Node& while_node, + const DeviceType& jit_device_type, int depth, + FunctionLibraryRuntime* lib_runtime) { + VLOG(2) << "Loop marking: " << while_node.type_string(); const NameAttrList* name_attr; NodeDef call; Status status; - status = GetNodeAttr(while_def, "cond", &name_attr); + status = GetNodeAttr(while_node.attrs(), "cond", &name_attr); if (!status.ok()) { VLOG(2) << "Missing 'cond' attribute on While node."; return false; @@ -78,7 +81,7 @@ bool IsCompilableWhile(const NodeDef& while_def, DeviceType jit_device_type, VLOG(2) << "Can't compile loop condition: " << cond_func; return false; } - status = GetNodeAttr(while_def, "body", &name_attr); + status = GetNodeAttr(while_node.attrs(), "body", &name_attr); if (!status.ok()) { VLOG(2) << "Missing 'body' attribute on While node."; return false; @@ -98,8 +101,9 @@ bool IsCompilableWhile(const NodeDef& while_def, DeviceType jit_device_type, // Tests whether 'call_def' is a call to a completely compilable function. // Every operator in the function must be compilable for a function to be // compilable. -bool IsCompilableCall(const NodeDef& call_def, DeviceType jit_device_type, - int depth, FunctionLibraryRuntime* lib_runtime) { +bool IsCompilableCall(const NodeDef& call_def, + const DeviceType& jit_device_type, int depth, + FunctionLibraryRuntime* lib_runtime) { VLOG(2) << "Function marking: " << call_def.op(); if (depth > kMaxRecursionDepth) { @@ -109,21 +113,32 @@ bool IsCompilableCall(const NodeDef& call_def, DeviceType jit_device_type, FunctionLibraryRuntime::Handle handle; Status status = - lib_runtime->Instantiate(call_def.op(), call_def.attr(), &handle); + lib_runtime->Instantiate(call_def.op(), AttrSlice(call_def), &handle); if (!status.ok()) { VLOG(2) << "Could not instantiate " << call_def.op() << ": " << status; return false; } const FunctionBody* fbody = lib_runtime->GetFunctionBody(handle); CHECK(fbody); + const FunctionDef& fdef = fbody->fdef; + bool noinline = false; + if (GetNodeAttr(AttrSlice(&fdef.attr()), "_noinline", &noinline).ok() && + noinline) { + // The underlying mechanism that calls non-inlined functions uses + // LocalExecutor, which interacts poorly with the LocalExecutor used by + // tf2xla to translate the TF graph into XLA. So we avoid this for now. + // + // TODO(b/36139787): Create a mechanism to set inlining hints. + VLOG(2) << "Can't compile noinline function: " << fdef.DebugString(); + return false; + } - for (Node* node : fbody->graph->nodes()) { - if (node->IsSource() || node->IsSink()) continue; - if (node->def().op() == "_Arg" || node->def().op() == "_Retval") continue; - if (node->def().op() == "While") { + for (Node* node : fbody->graph->op_nodes()) { + if (node->type_string() == "_Arg" || node->type_string() == "_Retval") + continue; + if (node->type_string() == "While") { // Handle functional While loop (not in open source build). - return IsCompilableWhile(node->def(), jit_device_type, depth + 1, - lib_runtime); + return IsCompilableWhile(*node, jit_device_type, depth + 1, lib_runtime); } if (!HasXLAKernel(*node, jit_device_type) && !IsCompilableCall(node->def(), jit_device_type, depth + 1, @@ -147,6 +162,12 @@ Status DeviceTypeOfDevice(const string& device, DeviceType* device_type) { return Status::OK(); } +// Does `node` have a DT_RESOURCE typed argument? 
+bool HasResourceArgument(const Node& node) { + return std::find(node.input_types().begin(), node.input_types().end(), + DT_RESOURCE) != node.input_types().end(); +} + Status FindCompilationCandidates( const Graph& graph, FunctionLibraryDefinition* flib_def, Env* env, const std::function<bool(const Node*, const DeviceType&)>& is_compilable_fn, @@ -155,28 +176,30 @@ Status FindCompilationCandidates( std::unique_ptr<FunctionLibraryRuntime> lib_runtime(NewFunctionLibraryRuntime( nullptr, env, nullptr, TF_GRAPH_DEF_VERSION, flib_def, opts)); - for (Node* node : graph.nodes()) { - if (node->IsSource() || node->IsSink()) continue; - + for (Node* node : graph.op_nodes()) { DeviceType device_type(""); TF_RETURN_IF_ERROR( DeviceTypeOfDevice(node->assigned_device_name(), &device_type)); if (is_compilable_fn && !is_compilable_fn(node, device_type)) continue; - const string* jit_device_name; - CHECK(XlaOpRegistry::GetJitDevice(device_type.type(), &jit_device_name, - /*requires_jit=*/nullptr)); - DeviceType jit_device_type(*jit_device_name); + const XlaOpRegistry::DeviceRegistration* registration; + CHECK( + XlaOpRegistry::GetCompilationDevice(device_type.type(), &registration)); + DeviceType jit_device_type(registration->compilation_device_name); if (!HasXLAKernel(*node, jit_device_type) && !IsCompilableCall(node->def(), jit_device_type, 0, lib_runtime.get())) { VLOG(2) << "Compilation rejected node: unsupported op " << node->name() - << ": " << node->def().op(); + << ": " << node->type_string(); continue; } - if (node->def().op() == "While" && - !IsCompilableWhile(node->def(), jit_device_type, 0, - lib_runtime.get())) { + if (!registration->compile_resource_ops && HasResourceArgument(*node)) { + VLOG(2) << "Compilation rejected node: resource argument " << node->name() + << ": " << node->type_string(); + continue; + } + if (node->type_string() == "While" && + !IsCompilableWhile(*node, jit_device_type, 0, lib_runtime.get())) { continue; } candidates->insert(node); @@ -184,85 +207,27 @@ Status FindCompilationCandidates( return Status::OK(); } -// Union-Find data structure used to compute clusters. We use our own -// implementation because we want one key feature: when merging clusters, we -// need to know which value becomes the representative of the merged clusters. -// We use the representatives to name nodes in a cycle detection graph, and we -// need to control which node is named. -// TODO(phawkins): consider merging this code with union-find implementations -// in Tensorflow, e.g., in SimplePlacer. -class Cluster { - public: - Cluster(); - - int Size() { return FindRoot()->size_; } - - // Merges this cluster with 'other'. This cluster's representative becomes - // the representative of the merged cluster; the representative of 'other' - // is ignored. - void Merge(Cluster* other); - - // Each cluster has an associated integer 'representative', initialized to -1 - // by default. - int GetRepresentative() { return FindRoot()->representative_; } - void SetRepresentative(int representative) { - FindRoot()->representative_ = representative; - } - - private: - // Finds the root element of the cluster. Performs path compression. - Cluster* FindRoot(); - - int representative_; - int rank_; - int size_; // Size of the cluster. - Cluster* parent_; +struct Cluster { + // Identifies the node that represents this cluster in the cycle detection + // graph.
+ int representative = -1; }; -Cluster::Cluster() - : representative_(-1), rank_(0), size_(1), parent_(nullptr) {} - -void Cluster::Merge(Cluster* other) { - Cluster* a = FindRoot(); - Cluster* b = other->FindRoot(); - if (a == b) return; - if (a->rank_ > b->rank_) { - b->parent_ = a; - a->size_ += b->size_; - return; - } - - a->parent_ = b; - if (a->rank_ == b->rank_) { - b->rank_++; - } - b->representative_ = a->representative_; - b->size_ += a->size_; -} - -Cluster* Cluster::FindRoot() { - if (!parent_) return this; - // Path compression: update intermediate nodes to point to the root of the - // equivalence class. - parent_ = parent_->FindRoot(); - return parent_; -} - } // anonymous namespace bool IsCompilable(FunctionLibraryRuntime* flr, const NodeDef& ndef) { Device* device = flr->device(); - const string* jit_device_name; - CHECK(XlaOpRegistry::GetJitDevice(device->device_type(), &jit_device_name, - /*requires_jit=*/nullptr)); - DeviceType jit_device_type(*jit_device_name); + const XlaOpRegistry::DeviceRegistration* registration; + CHECK(XlaOpRegistry::GetCompilationDevice(device->device_type(), + ®istration)); + DeviceType jit_device_type(registration->compilation_device_name); return IsCompilableCall(ndef, jit_device_type, 0, flr); } Status MarkForCompilationPass::Run( const GraphOptimizationPassOptions& options) { - // TODO(phawkins): precompute the "GetJitDevice" properties each device ahead - // of time. + // TODO(phawkins): precompute the "GetCompilationDevice" properties of each + // device ahead of time. OptimizerOptions::GlobalJitLevel global_jit_level = options.session_options->config.graph_options() .optimizer_options() @@ -283,25 +248,24 @@ Status MarkForCompilationPass::Run( const FunctionLibraryDefinition* fld = options.flib_def; auto is_compilable = [global_jit_level, fld](const Node* node, const DeviceType& device_type) { - const string* jit_device; - bool requires_jit; - if (!XlaOpRegistry::GetJitDevice(device_type.type(), &jit_device, - &requires_jit)) { + const XlaOpRegistry::DeviceRegistration* registration; + if (!XlaOpRegistry::GetCompilationDevice(device_type.type(), + ®istration)) { return false; } // If this device requires a JIT, we must say yes. - if (requires_jit) return true; + if (registration->requires_compilation) return true; // If there is a _XlaCompile annotation, use its value. bool compile = false; - Status status = GetNodeAttr(node->def(), kXlaCompileAttr, &compile); + Status status = GetNodeAttr(node->attrs(), kXlaCompileAttr, &compile); if (status.ok()) return compile; - status = fld->GetAttr(node->def(), kXlaCompileAttr, &compile); + status = fld->GetAttr(*node, kXlaCompileAttr, &compile); if (status.ok()) return compile; // Otherwise use the value of global_jit_level. - return global_jit_level > 0; + return registration->enable_jit_by_default && global_jit_level > 0; }; return RunImpl(options, is_compilable); } @@ -323,7 +287,7 @@ Status MarkForCompilationPass::RunImpl( VLOG(1) << "MarkForCompilationPass::Run"; // Make sure that kernels have been registered on the JIT device. - XlaOpRegistry::RegisterJitKernels(); + XlaOpRegistry::RegisterCompilationKernels(); Graph* graph = options.graph->get(); @@ -411,10 +375,11 @@ Status MarkForCompilationPass::RunImpl( // Each compilation candidate belongs to a cluster. The cluster's // representative // names the node in the 'cycles' graph that represents the cluster. 
- std::vector<Cluster> clusters(graph->num_node_ids()); - std::deque<Cluster*> worklist; + std::vector<UnionFind<Cluster>> clusters(graph->num_node_ids()); + std::deque<UnionFind<Cluster>*> worklist; for (Node* node : compilation_candidates) { - clusters[node->id()].SetRepresentative(node->id()); + Cluster& cluster = clusters[node->id()].Get(); + cluster.representative = node->id(); worklist.push_back(&clusters[node->id()]); } @@ -424,15 +389,19 @@ Status MarkForCompilationPass::RunImpl( // Repeatedly contract edges between clusters that are on the same device, // provided the contraction would not create a cycle. while (!worklist.empty()) { - int from = worklist.front()->GetRepresentative(); + int from = worklist.front()->Get().representative; worklist.pop_front(); Node* node_from = graph->FindNodeId(from); if (node_from->IsControlFlow()) { // Control flow nodes aren't compilation candidates and should never // appear. - return errors::Internal("Found control flow node in clustering worklist"); + return errors::Internal( + "Found control flow node in clustering worklist: ", + node_from->type_string()); } + string from_scope; + string to_scope; for (int to : cycles.Successors(from)) { if (to >= graph->num_node_ids()) { // Node is a "frame" node that is present only in the cycle detection // graph. continue; } Node* node_to = graph->FindNodeId(to); - if (compilation_candidates.find(node_to) == compilation_candidates.cend()) + if (compilation_candidates.find(node_to) == + compilation_candidates.cend()) { continue; - if (node_from->assigned_device_name() != node_to->assigned_device_name()) + } + if (node_from->assigned_device_name() != + node_to->assigned_device_name()) { continue; + } + // Look for an _XlaScope on both nodes. If both nodes have a + // scope and the scopes do not match, do not cluster along this + // edge. If even one of the nodes lacks an _XlaScope attribute, + // then it is treated as a "bridge" and a cluster may be created + // along it. We may want to restrict this behavior to require + // all nodes marked with _XlaCompile=true to also have a + // _XlaScope property set (and raise an error otherwise); but + // for now we don't do this. + if (GetNodeAttr(node_from->attrs(), kXlaScopeAttr, &from_scope).ok() && + GetNodeAttr(node_to->attrs(), kXlaScopeAttr, &to_scope).ok() && + from_scope != to_scope) { + continue; + } // Ops that consume shapes cannot be the root of a cluster. This is an // optimization. @@ -476,7 +462,7 @@ Status MarkForCompilationPass::RunImpl( // Count the number of elements in each cluster. std::vector<int> cluster_sizes(graph->num_node_ids()); for (const Node* n : compilation_candidates) { - int cluster = clusters[n->id()].GetRepresentative(); + int cluster = clusters[n->id()].Get().representative; cluster_sizes[cluster]++; } @@ -490,32 +476,30 @@ Status MarkForCompilationPass::RunImpl( // if compilation is enabled, otherwise there will be no such candidates).
const int min_cluster_size = flags->tf_xla_min_cluster_size; for (Node* n : compilation_candidates) { - int cluster = clusters[n->id()].GetRepresentative(); + int cluster = clusters[n->id()].Get().representative; // Compile if the user marked this node _XlaCompile=true bool compile_attr = false; bool marked_for_compilation = false; - if (GetNodeAttr(n->def(), kXlaCompileAttr, &compile_attr).ok()) { + if (GetNodeAttr(n->attrs(), kXlaCompileAttr, &compile_attr).ok()) { marked_for_compilation = compile_attr; - } else if (options.flib_def - ->GetAttr(n->def(), kXlaCompileAttr, &compile_attr) + } else if (options.flib_def->GetAttr(*n, kXlaCompileAttr, &compile_attr) .ok()) { marked_for_compilation = compile_attr; } // Compile if this operator is placed on a device that requires // compilation. - bool requires_jit = false; DeviceType device_type(""); TF_RETURN_IF_ERROR( DeviceTypeOfDevice(n->assigned_device_name(), &device_type)); - XlaOpRegistry::GetJitDevice(device_type.type(), - /*jit_device_name=*/nullptr, &requires_jit); + const XlaOpRegistry::DeviceRegistration* registration; + XlaOpRegistry::GetCompilationDevice(device_type.type(), ®istration); // Or compile if this is a cluster of >= min_cluster_size compilable // operators. if (cluster_sizes[cluster] >= min_cluster_size || marked_for_compilation || - requires_jit) { + registration->requires_compilation) { string& name = cluster_names[cluster]; if (name.empty()) { name = strings::StrCat("cluster_", cluster_sequence_num++); diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc index 61b2031a36e..9f30e12e0e3 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc @@ -14,6 +14,7 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/jit/mark_for_compilation_pass.h" +#include "tensorflow/compiler/jit/defs.h" #include "tensorflow/cc/framework/ops.h" #include "tensorflow/cc/ops/control_flow_ops_internal.h" @@ -56,7 +57,7 @@ std::unordered_map<string, string> GetClusters(const Graph& graph) { std::unordered_map<string, string> ids; for (Node* node : graph.nodes()) { string cluster; - if (GetNodeAttr(node->def(), kXlaClusterAttr, &cluster).ok()) { + if (GetNodeAttr(node->attrs(), kXlaClusterAttr, &cluster).ok()) { CHECK(!cluster.empty()); ids[node->name()] = cluster; } @@ -77,7 +78,7 @@ TEST(XlaCompilationTest, Chains) { ops::UnaryOp("UncompilableUnary", c, builder.opts().WithName("D")); Node* e = ops::UnaryOp("Relu", d, builder.opts().WithName("E")); ops::UnaryOp("Relu", e, builder.opts().WithName("F")); - builder.ToGraph(graph.get()); + TF_EXPECT_OK(builder.ToGraph(graph.get())); } MarkForCompilation(&graph); @@ -102,7 +103,7 @@ TEST(XlaCompilationTest, UncompilableCycles) { Node* b = ops::UnaryOp("UncompilableUnary", a, builder.opts().WithName("B")); ops::BinaryOp("MatMul", a, b, builder.opts().WithName("C")); - builder.ToGraph(graph.get()); + TF_EXPECT_OK(builder.ToGraph(graph.get())); } MarkForCompilation(&graph); @@ -122,7 +123,7 @@ TEST(XlaCompilationTest, CompilableCycles) { .WithAttr("value", Tensor())); Node* b = ops::UnaryOp("Relu", a, builder.opts().WithName("B")); ops::BinaryOp("MatMul", a, b, builder.opts().WithName("C")); - builder.ToGraph(graph.get()); + TF_EXPECT_OK(builder.ToGraph(graph.get())); } MarkForCompilation(&graph); @@ -145,7 +146,7 @@ TEST(XlaCompilationTest, UnsupportedTypes) { .WithAttr("value", Tensor(DT_COMPLEX64, TensorShape()))); Node* b = ops::UnaryOp("Neg", a, builder.opts().WithName("B")); ops::BinaryOp("MatMul", a, b, builder.opts().WithName("C")); - builder.ToGraph(graph.get()); + TF_EXPECT_OK(builder.ToGraph(graph.get())); } MarkForCompilation(&graph); @@ -174,7 +175,7 @@ TEST(XlaCompilationTest, ConcatWithConstArg) { concat_builder.Input(dim).Input({a, a}).Attr("N", 2); builder.opts().FinalizeBuilder(&concat_builder); - builder.ToGraph(graph.get()); + TF_EXPECT_OK(builder.ToGraph(graph.get())); } MarkForCompilation(&graph); @@ -183,13 +184,20 @@ TEST(XlaCompilationTest, ConcatWithConstArg) { } TEST(XlaCompilationTest, FunctionCalls) { - FunctionDefLibrary flib; - *flib.add_function() = FunctionDefHelper::Define( + FunctionDef compilable = FunctionDefHelper::Define( "CompilableFn", {"n_a:float", "n_b:float"}, {"n_c:float"}, {}, {{{"n_c"}, "Add", {"n_a", "n_b"}, {{"T", DT_FLOAT}}}}); - *flib.add_function() = + FunctionDef uncompilable = FunctionDefHelper::Define("UncompilableFn", {"n_a:float"}, {"n_c:float"}, {}, {{{"n_c"}, "UncompilableUnary", {"n_a"}}}); + FunctionDef noinline = compilable; + noinline.mutable_signature()->set_name("NoInlineFn"); + AddAttr("_noinline", bool(true), noinline.mutable_attr()); + + FunctionDefLibrary flib; + *flib.add_function() = compilable; + *flib.add_function() = uncompilable; + *flib.add_function() = noinline; FunctionLibraryDefinition flib_def(OpRegistry::Global(), flib); std::unique_ptr<Graph> graph(new Graph(&flib_def)); @@ -201,7 +209,8 @@ TEST(XlaCompilationTest, FunctionCalls) { Node* b = ops::BinaryOp("CompilableFn", a, a, builder.opts().WithName("B")); Node* c = ops::UnaryOp("Relu", b, builder.opts().WithName("C")); ops::UnaryOp("UncompilableFn", c, builder.opts().WithName("D")); - builder.ToGraph(graph.get()); + ops::BinaryOp("NoInlineFn", c, c,
builder.opts().WithName("E")); + TF_EXPECT_OK(builder.ToGraph(graph.get())); } MarkForCompilation(&graph, &flib_def); @@ -212,6 +221,7 @@ TEST(XlaCompilationTest, FunctionCalls) { EXPECT_EQ(clusters["B"], clusters["C"]); EXPECT_TRUE(clusters.find("A") == clusters.cend()); EXPECT_TRUE(clusters.find("D") == clusters.cend()); + EXPECT_TRUE(clusters.find("E") == clusters.cend()); } // Metadata-only operators such as Shape/Rank/Size may not be the root of a @@ -231,8 +241,8 @@ TEST(XlaCompilationTest, MetadataOpsDontStartClusters) { Node* b = ops::UnaryOp("Shape", a, builder.opts().WithName("B")); Node* c = ops::UnaryOp("Rank", b, builder.opts().WithName("C")); Node* d = ops::UnaryOp("Size", c, builder.opts().WithName("D")); - ops::UnaryOp("Shape", d, builder.opts().WithName("C")); - builder.ToGraph(graph.get()); + ops::UnaryOp("Shape", d, builder.opts().WithName("E")); + TF_EXPECT_OK(builder.ToGraph(graph.get())); } MarkForCompilation(&graph); auto clusters = GetClusters(*graph); @@ -318,7 +328,7 @@ TEST(XlaCompilationTest, SymbolicGradients) { d_builder.Input({c, c}); builder.opts().FinalizeBuilder(&d_builder); - builder.ToGraph(graph.get()); + TF_EXPECT_OK(builder.ToGraph(graph.get())); } MarkForCompilation(&graph); @@ -344,7 +354,7 @@ TEST(XlaCompilationTest, Loops) { auto d = ops::Add(root.WithOpName("D"), c, exit); std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global())); - root.ToGraph(graph.get()); + TF_EXPECT_OK(root.ToGraph(graph.get())); MarkForCompilation(&graph); auto clusters = GetClusters(*graph); @@ -354,5 +364,96 @@ TEST(XlaCompilationTest, Loops) { EXPECT_EQ(0, clusters.size()); } +TEST(XlaCompilationTest, CyclesWithAllDifferentScopes) { + std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global())); + GraphDef graphdef; + { + GraphDefBuilder builder(GraphDefBuilder::kFailImmediately); + Node* a = ops::SourceOp("Const", builder.opts() + .WithName("A") + .WithAttr("dtype", DT_FLOAT) + .WithAttr("value", Tensor()) + .WithAttr(kXlaScopeAttr, "ScopeA")); + Node* b = ops::UnaryOp( + "Relu", a, + builder.opts().WithName("B").WithAttr(kXlaScopeAttr, "ScopeB")); + ops::BinaryOp( + "MatMul", a, b, + builder.opts().WithName("C").WithAttr(kXlaScopeAttr, "ScopeC")); + TF_CHECK_OK(builder.ToGraph(graph.get())); + } + + MarkForCompilation(&graph); + auto clusters = GetClusters(*graph); + + // The computation is: C = A @ relu(A) + // where A sits in ScopeA, relu(A) sits in ScopeB, and C sits in ScopeC. + // In this case, we cannot fuse anything, and there are no clusters. + EXPECT_EQ(0, clusters.size()); +} + +TEST(XlaCompilationTest, CyclesWithSplittingScopes) { + std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global())); + GraphDef graphdef; + { + GraphDefBuilder builder(GraphDefBuilder::kFailImmediately); + Node* a = ops::SourceOp("Const", builder.opts() + .WithName("A") + .WithAttr("dtype", DT_FLOAT) + .WithAttr("value", Tensor()) + .WithAttr(kXlaScopeAttr, "Scope1")); + Node* b = ops::UnaryOp( + "Relu", a, + builder.opts().WithName("B").WithAttr(kXlaScopeAttr, "Scope1")); + Node* c = ops::BinaryOp( + "MatMul", a, b, + builder.opts().WithName("C").WithAttr(kXlaScopeAttr, "Scope2")); + ops::BinaryOp( + "Add", b, c, + builder.opts().WithName("D").WithAttr(kXlaScopeAttr, "Scope2")); + TF_CHECK_OK(builder.ToGraph(graph.get())); + } + + MarkForCompilation(&graph); + auto clusters = GetClusters(*graph); + + // The computation is: D = relu(A) + (A @ relu(A)) + // where A and relu(A) are in Scope1, and the @, + ops are in Scope2.
+ // In this case, we can fuse the A and relu(A), and we can fuse the + // second half of the operations; there are two clusters. + EXPECT_EQ(4, clusters.size()); + EXPECT_EQ(clusters["A"], clusters["B"]); + EXPECT_NE(clusters["A"], clusters["C"]); + EXPECT_EQ(clusters["C"], clusters["D"]); +} + +TEST(XlaCompilationTest, CyclesWithDifferentScopesAndBridge) { + std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global())); + GraphDef graphdef; + { + GraphDefBuilder builder(GraphDefBuilder::kFailImmediately); + Node* a = ops::SourceOp("Const", builder.opts() + .WithName("A") + .WithAttr("dtype", DT_FLOAT) + .WithAttr("value", Tensor()) + .WithAttr(kXlaScopeAttr, "ScopeA")); + Node* b = ops::UnaryOp( + "Relu", a, + builder.opts().WithName("B").WithAttr(kXlaScopeAttr, "ScopeB")); + ops::BinaryOp("MatMul", a, b, builder.opts().WithName("C")); + TF_CHECK_OK(builder.ToGraph(graph.get())); + } + + MarkForCompilation(&graph); + auto clusters = GetClusters(*graph); + + // The computation is: C = A @ relu(A) + // where A sits in ScopeA, relu(A) sits in ScopeB, and C has no scope, so it + // acts as a "bridge". In this case, B and C can be fused, but A cannot join + // them. + EXPECT_EQ(2, clusters.size()); + EXPECT_NE(clusters["A"], clusters["B"]); + EXPECT_EQ(clusters["B"], clusters["C"]); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/jit/ops/BUILD b/tensorflow/compiler/jit/ops/BUILD new file mode 100644 index 00000000000..8d1fa03cc0d --- /dev/null +++ b/tensorflow/compiler/jit/ops/BUILD @@ -0,0 +1,45 @@ +licenses(["notice"]) # Apache 2.0 + +package( + default_visibility = [ + "//tensorflow/compiler/tf2xla:internal", + ], +) + +cc_library( + name = "xla_ops", + srcs = [ + "xla_ops.cc", + ], + deps = [ + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + ], + alwayslink = 1, +) + +cc_library( + name = "parallel_check_op", + srcs = ["parallel_check_op.cc"], + deps = [ + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + ], + alwayslink = 1, +) + +# ----------------------------------------------------------------------------- + +filegroup( + name = "all_files", + srcs = glob( + ["**/*"], + exclude = [ + "**/METADATA", + "**/OWNERS", + ], + ), + visibility = ["//tensorflow:__subpackages__"], +) diff --git a/tensorflow/compiler/jit/ops/parallel_check_op.cc b/tensorflow/compiler/jit/ops/parallel_check_op.cc new file mode 100644 index 00000000000..db5c1955788 --- /dev/null +++ b/tensorflow/compiler/jit/ops/parallel_check_op.cc @@ -0,0 +1,30 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/op.h" + +namespace tensorflow { + +REGISTER_OP("ParallelCheck") + .Attr("T: list(type) >= 0") + .Input("expected: T") + .Input("actual: T") + .Output("result: T") + .Doc(R"doc( +Op that compares two sets of inputs for near-identity, and propagates the first. +Inequality is logged to ERROR log.
+)doc"); + +} // namespace tensorflow diff --git a/tensorflow/compiler/jit/ops/xla_ops.cc b/tensorflow/compiler/jit/ops/xla_ops.cc new file mode 100644 index 00000000000..07320b43dab --- /dev/null +++ b/tensorflow/compiler/jit/ops/xla_ops.cc @@ -0,0 +1,35 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/op.h" + +namespace tensorflow { + +REGISTER_OP("_XlaLaunch") + .Input("constants: Tconstants") + .Attr("Tconstants: list(type) >= 0") + .Input("args: Targs") + .Attr("Targs: list(type) >= 0") + .Input("resources: Nresources * resource") + .Attr("Nresources: int >= 0") + .Output("results: Tresults") + .Attr("Tresults: list(type) >= 0") + .Attr("function: func") + // XLA random-number generation ops are stateful. + // TODO(phawkins): create stateful and non-stateful variants of _XlaLaunch. + .SetIsStateful() + .Doc("XLA Launch Op. For use by the XLA JIT only."); + +} // namespace tensorflow diff --git a/tensorflow/compiler/jit/parallel_check_op.cc b/tensorflow/compiler/jit/parallel_check_op.cc deleted file mode 100644 index d07da46ca04..00000000000 --- a/tensorflow/compiler/jit/parallel_check_op.cc +++ /dev/null @@ -1,154 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/jit/legacy_flags/parallel_check_op_flags.h" -#include "tensorflow/core/common_runtime/device.h" -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/types.h" -#include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/macros.h" - -namespace tensorflow { -namespace { - -REGISTER_OP("ParallelCheck") - .Attr("T: list(type) >= 0") - .Input("expected: T") - .Input("actual: T") - .Output("result: T") - .Doc(R"doc( -Op that compares two sets of inputs for near-identity, and propagates the first. -Inequality is logged to ERROR log. -)doc"); - -// Inputs 2*N tensors, outputs the first N inputs. -// Logs errors if input tensor i and i + N are not (near) identical -// in any position. 
-class ParallelCheckOp : public OpKernel { - public: - explicit ParallelCheckOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} - - template <typename T> - int CompareTensors(DataType dtype, const char* v0, const char* v1, - int64 num_elts, int input_idx) { - int failed = 0; - const T* p0 = reinterpret_cast<const T*>(v0); - const T* p1 = reinterpret_cast<const T*>(v1); - double rtol; - legacy_flags::ParallelCheckOpFlags* flags = - legacy_flags::GetParallelCheckOpFlags(); - if (!tensorflow::strings::safe_strtod(flags->parallel_check_rtol.c_str(), - &rtol)) { - LOG(ERROR) << "can't convert parallel_check_rtol " - << flags->parallel_check_rtol << " to double"; - } - double atol; - if (!tensorflow::strings::safe_strtod(flags->parallel_check_atol.c_str(), - &atol)) { - LOG(ERROR) << "can't convert parallel_check_atol " - << flags->parallel_check_atol << " to double"; - } - for (int i = 0; i < num_elts; ++i) { - bool ok = (p0[i] == p1[i]); - VLOG(2) << "output " << input_idx << " element " << i << ": " << p0[i]; - if (!ok) { - if (std::is_same<T, float>::value || std::is_same<T, double>::value) { - float tolerance = - std::max(atol, std::max(fabs(rtol * p0[i]), fabs(rtol * p1[i]))); - T diff = p0[i] - p1[i]; - if (diff < 0) diff = 0 - diff; - ok = (diff <= tolerance); - } - if (ok) continue; - LOG(ERROR) << "Op " << def().name() << " fails equality at output " - << input_idx << " type " << DataTypeString(dtype) - << " element " << i << ": std_val=" << p0[i] - << " test_val=" << p1[i] << " diff=" << (p0[i] - p1[i]); - if (++failed > 10) break; - } - } - return failed; - } - - void Compute(OpKernelContext* ctx) override { - VLOG(1) << "Compute " << def().name(); - const int num_pairs = ctx->num_inputs() / 2; - for (int i = 0; i < num_pairs; ++i) { - CHECK_EQ(ctx->input_dtype(i), ctx->input_dtype(i + num_pairs)); - Tensor t0 = ctx->input(i); - Tensor t1 = ctx->input(i + num_pairs); - int64 num_elts = t0.NumElements(); - CHECK_EQ(num_elts, t1.NumElements()); - - // Compare inputs elementwise for near-exact equality. - const char* v0 = t0.tensor_data().data(); - const char* v1 = t1.tensor_data().data(); - int failed = 0; - switch (ctx->input_dtype(i)) { - case DT_INT32: - failed = - CompareTensors<int32>(ctx->input_dtype(i), v0, v1, num_elts, i); - break; - case DT_INT64: - failed = - CompareTensors<int64>(ctx->input_dtype(i), v0, v1, num_elts, i); - break; - case DT_FLOAT: - failed = - CompareTensors<float>(ctx->input_dtype(i), v0, v1, num_elts, i); - break; - case DT_DOUBLE: - failed = - CompareTensors<double>(ctx->input_dtype(i), v0, v1, num_elts, i); - break; - case DT_BOOL: - failed = - CompareTensors<bool>(ctx->input_dtype(i), v0, v1, num_elts, i); - break; - default: - LOG(FATAL) << "unimpl: " << ctx->input_dtype(i); - } - if (failed > 0) { - LOG(ERROR) << "check failed for " << def().name() << " output " << i - << " num_elts: " << num_elts; - legacy_flags::ParallelCheckOpFlags* flags = - legacy_flags::GetParallelCheckOpFlags(); - if (flags->parallel_check_failfast) { - LOG(QFATAL) << "failfast on first parallel-check failure"; - } - } else { - VLOG(1) << "check passed for " << def().name() << " output " << i - << " num_elts: " << num_elts; - } - - // Propagate the std value.
- if (IsRefType(ctx->input_dtype(i))) { - ctx->forward_ref_input_to_ref_output(i, i); - } else { - ctx->set_output(i, ctx->input(i)); - } - } - } - - TF_DISALLOW_COPY_AND_ASSIGN(ParallelCheckOp); -}; - -REGISTER_KERNEL_BUILDER(Name("ParallelCheck").Device(DEVICE_CPU), - ParallelCheckOp); - -} // namespace -} // namespace tensorflow diff --git a/tensorflow/compiler/jit/union_find.h b/tensorflow/compiler/jit/union_find.h new file mode 100644 index 00000000000..a1a7a6a4d0d --- /dev/null +++ b/tensorflow/compiler/jit/union_find.h @@ -0,0 +1,81 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_UNION_FIND_H_ +#define TENSORFLOW_COMPILER_JIT_UNION_FIND_H_ + +namespace tensorflow { + +// Union-Find data structure. +// Each cluster has an associated value; when merging clusters we can control +// which value becomes the representative of the merged clusters. Values must +// be copyable. +template <typename T> +class UnionFind { + public: + UnionFind() : rank_(0), size_(1), parent_(nullptr) {} + + // Returns the number of elements in a cluster. + int Size() { return FindRoot()->size_; } + + // Merges this cluster with 'other'. This cluster's value becomes + // the value of the merged cluster; the value of 'other' is ignored. + void Merge(UnionFind* other); + + // Each cluster has an associated value. Retrieves the value associated + // with this cluster. + T& Get() { return FindRoot()->value_; } + + private: + // Finds the root element of the cluster. Performs path compression. + UnionFind* FindRoot(); + + int rank_; + int size_; // Size of the cluster. + UnionFind* parent_; + T value_; +}; + +template <typename T> +void UnionFind<T>::Merge(UnionFind* other) { + UnionFind* a = FindRoot(); + UnionFind* b = other->FindRoot(); + if (a == b) return; + if (a->rank_ > b->rank_) { + b->parent_ = a; + a->size_ += b->size_; + return; + } + + a->parent_ = b; + if (a->rank_ == b->rank_) { + b->rank_++; + } + b->value_ = a->value_; + b->size_ += a->size_; +} + +template <typename T> +UnionFind<T>* UnionFind<T>::FindRoot() { + if (!parent_) return this; + // Path compression: update intermediate nodes to point to the root of the + // equivalence class. + parent_ = parent_->FindRoot(); + return parent_; +} + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_UNION_FIND_H_
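For readers new to this structure, a small usage sketch follows; it mirrors how mark_for_compilation_pass.cc wraps each node's Cluster in a UnionFind cell so that merging preserves the first argument's value (the Example function below is illustration only, not part of the change):

```c++
#include <cassert>
#include "tensorflow/compiler/jit/union_find.h"

// Mirrors the Cluster struct added to mark_for_compilation_pass.cc.
struct Cluster {
  int representative = -1;
};

void Example() {
  tensorflow::UnionFind<Cluster> a, b;
  a.Get().representative = 1;
  b.Get().representative = 2;
  a.Merge(&b);
  // Both cells now resolve to one cluster of size 2 whose value came from
  // 'a'; this is how the pass controls which node id names the cluster.
  assert(a.Size() == 2 && b.Size() == 2);
  assert(a.Get().representative == 1);
  assert(b.Get().representative == 1);
}
```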
#include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/common_runtime/graph_optimizer.h" #include "tensorflow/core/framework/attr_value_util.h" +#include "tensorflow/core/framework/types.h" #include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/graph/node_builder.h" +#include "tensorflow/core/kernels/variable_ops.h" #include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" @@ -35,9 +37,9 @@ limitations under the License. namespace tensorflow { -XlaCompilationCache::XlaCompilationCache(const XlaCompiler::Options& options) - : compiler_(options) {} - +XlaCompilationCache::XlaCompilationCache(xla::Client* client, + DeviceType device_type) + : client_(client), device_type_(std::move(device_type)) {} XlaCompilationCache::~XlaCompilationCache() = default; string XlaCompilationCache::DebugString() { @@ -54,7 +56,7 @@ string XlaCompilationCache::SignatureDebugString(const Signature& sig) { } for (const auto& v : sig.arg_values) { - strings::StrAppend(&result, "; ", v.first, ":", v.second.DebugString()); + strings::StrAppend(&result, "; ", v.DebugString()); } return result; } @@ -65,9 +67,7 @@ bool XlaCompilationCache::Signature::operator==(const Signature& other) const { if (arg_values.size() != other.arg_values.size()) return false; for (int i = 0; i < arg_values.size(); ++i) { - if (arg_values[i].first != other.arg_values[i].first || - arg_values[i].second.tensor_data() != - other.arg_values[i].second.tensor_data()) { + if (arg_values[i].tensor_data() != other.arg_values[i].tensor_data()) { return false; } } @@ -85,68 +85,159 @@ uint64 XlaCompilationCache::Signature::Hash::operator()( } } for (const auto& arg : signature.arg_values) { - h = Hash64Combine(h, std::hash()(static_cast(arg.first))); - h = Hash64Combine(h, Hash64(arg.second.tensor_data().data(), - arg.second.tensor_data().size())); + h = Hash64Combine( + h, Hash64(arg.tensor_data().data(), arg.tensor_data().size())); } return h; } +Status XlaCompilationCache::BuildSignature( + const NameAttrList& function, int num_constant_args, + const std::vector& variable_args, OpKernelContext* ctx, + Signature* signature) { + signature->name = Canonicalize(function.name(), AttrSlice(&function.attr())); + signature->arg_values.resize(num_constant_args); + + signature->arg_types.reserve(ctx->num_inputs() - num_constant_args); + + // Inputs are in the order: constants, non-constants, resource variables. + int input_num = 0; + // Use the values of compile time constants in the signature-> + while (input_num < num_constant_args) { + signature->arg_values[input_num] = ctx->input(input_num); + ++input_num; + } + // Add the types and shapes of the remaining arguments. + while (input_num < ctx->num_inputs() - variable_args.size()) { + signature->arg_types.emplace_back(ctx->input_dtype(input_num), + ctx->input(input_num).shape()); + ++input_num; + } + // For variable signatures, use the type and shape of the variable's + // current value. + for (const OptionalTensor& variable : variable_args) { + TF_RET_CHECK(input_num < ctx->num_inputs()); + if (variable.present) { + signature->arg_types.emplace_back(variable.value.dtype(), + variable.value.shape()); + } else { + signature->arg_types.emplace_back(DT_INVALID, TensorShape()); + } + ++input_num; + } + return Status::OK(); +} + namespace { // Builds a XlaCompiler::Argument vector from the arguments to the _XlaLaunch // op. The first `num_constant_args` arguments must be host-memory Tensors. 
-std::vector BuildArguments(int num_constant_args, - OpKernelContext* ctx) { - std::vector args(ctx->num_inputs()); - int parameter_num = 0; - for (int i = 0; i < ctx->num_inputs(); ++i) { - args[i].type = ctx->input(i).dtype(); - args[i].shape = ctx->input(i).shape(); - if (i < num_constant_args || ctx->input(i).NumElements() == 0) { - args[i].parameter = -1; - args[i].constant_value = ctx->input(i); - } else { - args[i].parameter = parameter_num; - ++parameter_num; - } +Status BuildArguments(int num_constant_args, + const std::vector& variable_args, + OpKernelContext* ctx, + std::vector* args) { + args->resize(ctx->num_inputs()); + + int input_num = 0; + + // Handles compile-time constants. + TF_RET_CHECK(num_constant_args <= ctx->num_inputs()); + while (input_num < num_constant_args) { + const Tensor& input = ctx->input(input_num); + TF_RET_CHECK(input.dtype() != DT_RESOURCE); + XlaCompiler::Argument& arg = (*args)[input_num]; + arg.kind = XlaCompiler::Argument::kConstant; + arg.type = input.dtype(); + arg.shape = input.shape(); + arg.constant_value = input; + ++input_num; } - return args; + + // Handles the non-constant arguments. + int num_variable_args = variable_args.size(); + int num_nonconst_args = + ctx->num_inputs() - num_variable_args - num_constant_args; + TF_RET_CHECK(num_nonconst_args >= 0); + while (input_num < num_constant_args + num_nonconst_args) { + const Tensor& input = ctx->input(input_num); + TF_RET_CHECK(input.dtype() != DT_RESOURCE); + XlaCompiler::Argument& arg = (*args)[input_num]; + if (input.NumElements() > 0) { + arg.kind = XlaCompiler::Argument::kParameter; + } else { + arg.kind = XlaCompiler::Argument::kConstant; + arg.constant_value = input; + } + arg.type = input.dtype(); + arg.shape = input.shape(); + ++input_num; + } + + // Handles resource variables. + TF_RET_CHECK(input_num + num_variable_args == ctx->num_inputs()); + for (int variable_id = 0; variable_id < num_variable_args; ++variable_id) { + const Tensor& input = ctx->input(input_num); + TF_RET_CHECK(input.dtype() == DT_RESOURCE); + + XlaCompiler::Argument& arg = (*args)[input_num]; + + arg.name = variable_args[variable_id].name; + if (variable_args[variable_id].present) { + const Tensor& value = variable_args[variable_id].value; + arg.kind = XlaCompiler::Argument::kVariable; + arg.type = value.dtype(); + arg.shape = value.shape(); + } else { + // The values of uninitialized variables are not passed as inputs, since + // they are meaningless. However, it is legal to assign to a resource + // variable for the first time inside the XLA computation, so we do permit + // uninitialized variables. 
+ arg.kind = XlaCompiler::Argument::kUninitializedVariable; + arg.type = DT_INVALID; + arg.shape = TensorShape(); + } + ++input_num; + } + + return Status::OK(); } } // namespace Status XlaCompilationCache::Compile( - const NameAttrList& function, int num_constant_args, OpKernelContext* ctx, + const XlaCompiler::Options& options, const NameAttrList& function, + int num_constant_args, const std::vector& variable_args, + OpKernelContext* ctx, const XlaCompiler::CompilationResult** compilation_result, xla::LocalExecutable** executable) { VLOG(1) << "XlaCompilationCache::Compile " << DebugString(); if (VLOG_IS_ON(2)) { - std::vector argshapes; - VLOG(2) << "num_inputs = " << ctx->num_inputs() - << " num_constant_args= " << num_constant_args; + VLOG(2) << "num_inputs=" << ctx->num_inputs() + << " num_constant_args=" << num_constant_args + << " num_variable_args=" << variable_args.size(); for (int i = 0; i < ctx->num_inputs(); i++) { TensorShape shape = ctx->input(i).shape(); - VLOG(2) << i << ": dtype=" << ctx->input_dtype(i) + VLOG(2) << i << ": dtype=" << DataTypeString(ctx->input_dtype(i)) << " present=" << ctx->has_input(i) << " shape=" << shape.DebugString(); - argshapes.push_back(shape.DebugString()); + } + for (const OptionalTensor& variable : variable_args) { + VLOG(2) << "variable present=" << variable.present + << " type=" << DataTypeString(variable.value.dtype()) + << " shape=" << variable.value.shape().DebugString(); } VLOG(2) << "num_outputs = " << ctx->num_outputs(); for (int i = 0; i < ctx->num_outputs(); i++) { VLOG(2) << i << ": dtype=" << ctx->expected_output_dtype(i); } } + + TF_RET_CHECK(num_constant_args + variable_args.size() <= ctx->num_inputs()); + Signature signature; - signature.name = Canonicalize(function.name(), function.attr()); - for (int i = 0; i < ctx->num_inputs(); ++i) { - signature.arg_types.emplace_back(ctx->input_dtype(i), - ctx->input(i).shape()); - if (i < num_constant_args) { - signature.arg_values.emplace_back(i, ctx->input(i)); - } - } + TF_RETURN_IF_ERROR(BuildSignature(function, num_constant_args, variable_args, + ctx, &signature)); VLOG(2) << "Signature: " << SignatureDebugString(signature); // The outer lock protects the existence of the cache entry. It does not @@ -169,24 +260,22 @@ Status XlaCompilationCache::Compile( if (!entry->compiled) { // Do the actual JIT compilation without holding the lock (it can take // a long time.) 
- std::vector args = - BuildArguments(num_constant_args, ctx); - - std::unique_ptr flr(NewFunctionLibraryRuntime( - compiler_.device_mgr(), ctx->env(), compiler_.device(), - TF_GRAPH_DEF_VERSION, - ctx->function_library()->GetFunctionLibraryDefinition(), - OptimizerOptions(), nullptr /* custom_kernel_creator */)); + std::vector args; + TF_RETURN_IF_ERROR( + BuildArguments(num_constant_args, variable_args, ctx, &args)); + XlaCompiler compiler(options); entry->compiled = true; - entry->compilation_status = compiler_.CompileFunction( - flr.get(), function, args, &entry->compilation_result); + entry->compilation_status = + compiler.CompileFunction(XlaCompiler::CompileOptions(), function, args, + &entry->compilation_result); } *compilation_result = &entry->compilation_result; if (entry->compilation_status.ok() && executable) { if (entry->executable == nullptr && - !entry->compilation_result.computation.IsNull()) { - entry->compilation_status = compiler_.BuildExecutable( + !entry->compilation_result.computation->IsNull()) { + XlaCompiler compiler(options); + entry->compilation_status = compiler.BuildExecutable( entry->compilation_result, &entry->executable); } *executable = entry->executable.get(); diff --git a/tensorflow/compiler/jit/xla_compilation_cache.h b/tensorflow/compiler/jit/xla_compilation_cache.h index 44d76db0fd4..4ffcb68a322 100644 --- a/tensorflow/compiler/jit/xla_compilation_cache.h +++ b/tensorflow/compiler/jit/xla_compilation_cache.h @@ -16,7 +16,6 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_JIT_XLA_COMPILATION_CACHE_H_ #define TENSORFLOW_COMPILER_JIT_XLA_COMPILATION_CACHE_H_ -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/compiler/tf2xla/xla_context.h" #include "tensorflow/compiler/xla/client/local_client.h" @@ -30,6 +29,13 @@ limitations under the License. namespace tensorflow { +// Struct that represents a possibly-absent Tensor. +struct OptionalTensor { + string name; // A descriptive name + bool present = false; // Is the tensor present? + Tensor value; // If present, what is the Tensor's value? +}; + // The XlaCompilationCache class caches the results of the XlaCompiler class, // which converts a Tensorflow graph into a compiled XLA compilation. // @@ -40,39 +46,47 @@ namespace tensorflow { // bound. class XlaCompilationCache : public ResourceBase { public: - explicit XlaCompilationCache(const XlaCompiler::Options& options); + XlaCompilationCache(xla::Client* client, DeviceType device_type); ~XlaCompilationCache() override; // Compiles a function into a XlaCompiler::CompilationResult that can be used - // to execute an XLA Computation. `compilation_result` must be non-null. - // If `executable` is non-null, also builds an xla::LocalExecutable and sets - // `executable to point to it. The resulting executable pointer may be null if - // the computation has no non-constant outputs. - // Compilation results are cached. - Status Compile(const NameAttrList& function, int num_constant_args, + // to execute an XLA Computation. Compilation results are cached. + // `function` is the name of a Tensorflow function to compile. + // `num_constant_args` is the number of compile-time constant arguments to + // `function`. `variable_args` is a snapshot of the current values of the + // resource variable arguments to `function`; uninitialized variables are + // represented by an absent OptionalTensor. 
+ // The result of compilation is written to `*compilation_result`, which must + // be non-null. If `executable` is non-null, also builds an + // xla::LocalExecutable and sets `executable to point to it. The resulting + // executable pointer may be null if the computation has no non-constant + // outputs. + Status Compile(const XlaCompiler::Options& options, + const NameAttrList& function, int num_constant_args, + const std::vector& variable_args, OpKernelContext* ctx, const XlaCompiler::CompilationResult** compilation_result, xla::LocalExecutable** executable); - xla::Client* client() const { return compiler_.client(); } + xla::Client* client() const { return client_; } + const DeviceType& device_type() const { return device_type_; } string DebugString() override; private: - XlaCompiler compiler_; - std::unique_ptr function_library_runtime_; + xla::Client* const client_; + const DeviceType device_type_; // Describes the types, shapes and any compile-time constant arguments - // to a kernel. + // to a kernel. Key that uniquely identifies a compilation output. struct Signature { string name; std::vector> arg_types; - // List of (argument #, value) pairs for arguments whose values are - // part of the JIT signature, and that are therefore constants in any given - // JIT compilation. Tensors must be in host memory. - std::vector> arg_values; + // List of Tensor values for compile-time constant arguments to the + // compilation, ordered by argument number. Tensors must be in host memory. + std::vector arg_values; bool operator==(const Signature& other) const; @@ -82,6 +96,11 @@ class XlaCompilationCache : public ResourceBase { }; static string SignatureDebugString(const Signature& sig); + // Builds the signature for a compilation. + Status BuildSignature(const NameAttrList& function, int num_constant_args, + const std::vector& variable_args, + OpKernelContext* ctx, Signature* signature); + // The value associated with a cache entry. struct Entry { mutex mu; diff --git a/tensorflow/compiler/jit/xla_cpu_device.cc b/tensorflow/compiler/jit/xla_cpu_device.cc index 92784a5358b..e8b1f542ecf 100644 --- a/tensorflow/compiler/jit/xla_cpu_device.cc +++ b/tensorflow/compiler/jit/xla_cpu_device.cc @@ -16,16 +16,15 @@ limitations under the License. // Registers the XLA_CPU device, which is an XlaDevice instantiation that runs // operators using XLA via the XLA "Host" (CPU) backend. +#include "tensorflow/compiler/jit/kernels/xla_device_launch_op.h" #include "tensorflow/compiler/jit/xla_device.h" #include "tensorflow/compiler/jit/xla_device_ops.h" -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/core/common_runtime/device_factory.h" #include "tensorflow/core/lib/core/status.h" namespace tensorflow { -const char* const DEVICE_XLA_CPU = "XLA_CPU"; - class XlaCpuDeviceFactory : public DeviceFactory { public: Status CreateDevices(const SessionOptions& options, const string& name_prefix, diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc index 7835146a01d..5e336c5287b 100644 --- a/tensorflow/compiler/jit/xla_device.cc +++ b/tensorflow/compiler/jit/xla_device.cc @@ -19,11 +19,10 @@ limitations under the License. 
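Aside: a rough sketch of how a caller might snapshot resource variables into `OptionalTensor` and invoke the new `Compile` entry point declared above. The surrounding kernel, the `variables` collection, and the exact `XlaCompiler::Options` fields are assumptions; this is not the actual _XlaLaunch implementation.

```c++
// Hypothetical caller sketch. Each resource-variable input is snapshotted so
// that signature building and argument building see one consistent value.
std::vector<OptionalTensor> variable_args;
for (Var* var : variables) {  // 'variables' assumed gathered from DT_RESOURCE inputs
  OptionalTensor snapshot;
  snapshot.name = var->name();
  if (var->is_initialized) {
    snapshot.present = true;
    snapshot.value = *var->tensor();  // copy of the current value
  }
  variable_args.push_back(snapshot);  // absent => uninitialized variable
}

XlaCompiler::Options options;  // exact fields are assumptions
options.client = cache->client();

const XlaCompiler::CompilationResult* result;
xla::LocalExecutable* executable;
TF_RETURN_IF_ERROR(cache->Compile(options, function, num_constant_args,
                                  variable_args, ctx, &result, &executable));
```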
#include #include "tensorflow/compiler/jit/defs.h" -#include "tensorflow/compiler/jit/xla_compilation_cache.h" #include "tensorflow/compiler/jit/xla_device_context.h" #include "tensorflow/compiler/jit/xla_device_ops.h" #include "tensorflow/compiler/tf2xla/dump_graph.h" -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/device_factory.h" @@ -41,6 +40,7 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" +#include "tensorflow/core/platform/tracing.h" #include "tensorflow/core/public/session_options.h" #include "tensorflow/core/public/version.h" #include "tensorflow/core/util/device_name_utils.h" @@ -56,10 +56,13 @@ namespace tensorflow { << device_ordinal; // These are no-ops if they have already been done previously for - // this device_name/jit_device_name pair. - XlaOpRegistry::RegisterJitKernels(); - XlaOpRegistry::RegisterJitDevice(device_name, jit_device_name, - /*requires_jit=*/true); + // this device_name/compilation_device_name pair. + XlaOpRegistry::DeviceRegistration registration; + registration.compilation_device_name = jit_device_name; + registration.requires_compilation = true; + registration.enable_jit_by_default = false; + registration.compile_resource_ops = true; + XlaOpRegistry::RegisterCompilationDevice(device_name, registration); auto platform = perftools::gputools::MultiPlatformManager::PlatformWithName( platform_name); @@ -106,12 +109,23 @@ const DeviceType& XlaDevice::Metadata::jit_device_type() const { string XlaDevice::Metadata::DebugString() { return "XLA device metadata"; } +/* static */ Status XlaDevice::GetMetadata(OpKernelContext* ctx, + Metadata** metadata) { + ResourceMgr* rm = ctx->resource_manager(); + if (rm == nullptr) { + return errors::Internal("No resource manager."); + } + TF_RETURN_IF_ERROR( + rm->Lookup(rm->default_container(), "xla_metadata", metadata)); + return Status::OK(); +} + XlaDevice::XlaDevice(const SessionOptions& options, const DeviceAttributes& attrs, int device_ordinal, const DeviceType& jit_device_name, perftools::gputools::Platform* platform, Allocator* xla_allocator) - : LocalDevice(options, attrs, xla_allocator), + : LocalDevice(options, attrs), device_ordinal_(device_ordinal), jit_device_name_(jit_device_name), xla_allocator_(xla_allocator), @@ -161,6 +175,10 @@ Status XlaDevice::FillContextMap(const Graph* graph, void XlaDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) { VLOG(1) << "XlaDevice::Compute " << op_kernel->name() << ":" << op_kernel->type_string(); + // When TraceMe profiling is off (which is the default), the + // following TraceMe constructor is simply a conditional test of + // false value. Measurements show that its overhead is negligible. 
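Aside: a sketch of how an op kernel might consume the `XlaDevice::GetMetadata` helper added above. The kernel name is hypothetical; the `client()` and `jit_device_type()` accessors and the ref-counting convention follow the declarations visible in this diff.

```c++
// Hypothetical kernel body. GetMetadata looks the Metadata up in the device's
// ResourceMgr, which hands back an owned reference; ScopedUnref releases it.
void MyXlaAwareOp::Compute(OpKernelContext* ctx) {
  XlaDevice::Metadata* metadata;
  OP_REQUIRES_OK(ctx, XlaDevice::GetMetadata(ctx, &metadata));
  core::ScopedUnref metadata_ref(metadata);

  xla::Client* client = metadata->client();
  const DeviceType& jit_type = metadata->jit_device_type();
  // ... e.g., build or look up an XlaCompilationCache(client, jit_type) ...
}
```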
+ port::Tracing::TraceMe trace_me(op_kernel->name(), op_kernel->type_string()); op_kernel->Compute(context); } @@ -168,6 +186,7 @@ void XlaDevice::ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context, AsyncOpKernel::DoneCallback done) { VLOG(1) << "XlaDevice::ComputeAsync " << op_kernel->name() << ":" << op_kernel->type_string(); + port::Tracing::TraceMe trace_me(op_kernel->name(), op_kernel->type_string()); op_kernel->ComputeAsync(context, done); } @@ -203,6 +222,7 @@ Status XlaDevice::MakeTensorFromProto(const TensorProto& tensor_proto, XlaDeviceOpRegistrations* RegisterXlaDeviceKernels(const char* device, const char* jit_device) { + XlaOpRegistry::RegisterCompilationKernels(); XlaDeviceOpRegistrations* registrations = new XlaDeviceOpRegistrations; auto dummy_factory = [](OpKernelConstruction* context) -> OpKernel* { return new XlaDeviceDummyOp(context); diff --git a/tensorflow/compiler/jit/xla_device.h b/tensorflow/compiler/jit/xla_device.h index 3de14f30616..0badb390c6b 100644 --- a/tensorflow/compiler/jit/xla_device.h +++ b/tensorflow/compiler/jit/xla_device.h @@ -67,6 +67,10 @@ class XlaDevice : public LocalDevice { perftools::gputools::Platform* platform_; // Not owned. }; + // Sets `*metadata` to the XlaDevice Metadata in the resource manager of + // `ctx`. + static Status GetMetadata(OpKernelContext* ctx, Metadata** metadata); + // Factory function. 'platform_name' is the name of the XLA platform. // 'device_name' is the name of the Tensorflow device to create. // 'jit_device_name' is the name of the corresponding JIT device. diff --git a/tensorflow/compiler/jit/xla_device_launch_op.cc b/tensorflow/compiler/jit/xla_device_launch_op.cc deleted file mode 100644 index 1d5d7da14cc..00000000000 --- a/tensorflow/compiler/jit/xla_device_launch_op.cc +++ /dev/null @@ -1,174 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/compiler/jit/xla_device_launch_op.h" - -#include "tensorflow/compiler/jit/defs.h" -#include "tensorflow/compiler/jit/xla_compilation_cache.h" -#include "tensorflow/compiler/jit/xla_device.h" -#include "tensorflow/compiler/jit/xla_device_context.h" -#include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/statusor.h" -#include "tensorflow/core/common_runtime/dma_helper.h" -#include "tensorflow/core/framework/allocator.h" -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/types.h" -#include "tensorflow/core/platform/env.h" - -namespace tensorflow { - -namespace { - -Status BuildCompilationCache(ResourceMgr* rm, XlaCompilationCache** compiler) { - XlaDevice::Metadata* metadata; - Status s = rm->Lookup(rm->default_container(), - "xla_metadata", &metadata); - if (!s.ok()) { - return s; - } - core::ScopedUnref metadata_ref(metadata); - XlaCompiler::Options options; - options.device_type = metadata->jit_device_type(); - options.client = metadata->client(); - options.allow_cpu_custom_calls = false; - options.local_executable_has_hybrid_result = false; - *compiler = new XlaCompilationCache(options); - return Status::OK(); -} - -} // namespace - -XlaDeviceLaunchOp::XlaDeviceLaunchOp(OpKernelConstruction* ctx) - : OpKernel(ctx) { - const NameAttrList* func; - OP_REQUIRES_OK(ctx, ctx->GetAttr("function", &func)); - function_ = *func; - VLOG(1) << "XlaDeviceLaunch created function=" - << Canonicalize(function_.name(), function_.attr()); - DataTypeVector constant_types; - OP_REQUIRES_OK(ctx, ctx->GetAttr("Tconstants", &constant_types)); - num_constant_args_ = constant_types.size(); -} - -void XlaDeviceLaunchOp::Compute(OpKernelContext* ctx) { - VLOG(1) << "XlaDeviceLaunch::Compute " - << Canonicalize(function_.name(), function_.attr()); - // We store information about the JIT-compiled XLA computation - // in the ResourceMgr. - ResourceMgr* rm = ctx->resource_manager(); - OP_REQUIRES(ctx, rm, errors::Internal("No resource manager.")); - - XlaCompilationCache* compiler; - OP_REQUIRES_OK(ctx, - rm->LookupOrCreate( - rm->default_container(), "xla_compiler", &compiler, - [rm](XlaCompilationCache** compiler) { - return BuildCompilationCache(rm, compiler); - })); - // Hold the reference to the JIT during evaluation. (We could probably - // free it sooner because the ResourceMgr will retain a reference, but - // this is more obviously correct.) - core::ScopedUnref compiler_ref(compiler); - - const XlaCompiler::CompilationResult* kernel; - OP_REQUIRES_OK( - ctx, - compiler->Compile(function_, num_constant_args_, ctx, &kernel, nullptr)); - - VLOG(1) << "Executing XLA Computation..."; - - OP_REQUIRES(ctx, ctx->num_outputs() == kernel->outputs.size(), - errors::Internal("Unexpected number of outputs")); - - // Run the computation, if any. There might not be a computation if all - // outputs were compile-time constants. - std::vector> outputs; - if (!kernel->computation.IsNull()) { - auto opaque_shape = xla::ShapeUtil::MakeOpaqueShape(); - - // Convert argument tensors to xla::GlobalData pointers. 
- std::vector> arg_handles( - kernel->xla_input_shapes.size()); - std::vector arg_ptrs(kernel->xla_input_shapes.size()); - for (int i = 0; i < kernel->xla_input_shapes.size(); ++i) { - int input_num = kernel->xla_input_shapes[i].first; - arg_handles[i] = - XlaTransferManager::GetTensorGlobalData(ctx->input(input_num)); - arg_ptrs[i] = arg_handles[i].get(); - } - - // Execute the computation. - xla::ExecutionProfile profile; - xla::ExecutionOptions execution_options; - *execution_options.mutable_shape_with_output_layout() = - kernel->xla_output_shape; - Env* env = Env::Default(); - auto start_time = env->NowMicros(); - auto result = compiler->client()->Execute(kernel->computation, arg_ptrs, - &execution_options, &profile); - auto elapsed = env->NowMicros() - start_time; - OP_REQUIRES(ctx, result.ok(), result.status()); - - VLOG(1) << "Elapsed time: " << elapsed << "us"; - VLOG(1) << "ExecutionProfile: " << profile.DebugString(); - - if (xla::ShapeUtil::IsTuple(kernel->xla_output_shape)) { - auto outputs_or_error = - compiler->client()->DeconstructTuple(*result.ValueOrDie()); - OP_REQUIRES(ctx, outputs_or_error.ok(), outputs_or_error.status()); - outputs = outputs_or_error.ConsumeValueOrDie(); - } else { - outputs.push_back(result.ConsumeValueOrDie()); - } - } - - XlaDeviceContext* device_context = ctx->op_device_context(); - - // Copy XLA outputs to the operator's outputs. - int output_num = 0; - for (int i = 0; i < ctx->num_outputs(); ++i) { - Tensor* output; - OP_REQUIRES_OK(ctx, - ctx->allocate_output(i, kernel->outputs[i].shape, &output)); - if (kernel->outputs[i].is_constant) { - // TODO(phawkins): mark constant _XlaLaunch outputs as HostMemory and - // remove the copy from this code. - Status status; - device_context->CopyCPUTensorToDevice( - &kernel->outputs[i].constant_value, nullptr, output, - [&status](const Status& s) { status = s; }); - if (!status.ok()) { - ctx->SetStatus(status); - return; - } - } else { - CHECK_LT(output_num, outputs.size()); - XlaTransferManager::SetTensorGlobalData( - std::shared_ptr(std::move(outputs[output_num])), - output); - ++output_num; - } - } - - VLOG(1) << "Done"; -} - -XlaDeviceLaunchOp::~XlaDeviceLaunchOp() { - VLOG(1) << "XlaDeviceLaunch destroyed"; -} - -} // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_device_launch_op.h b/tensorflow/compiler/jit/xla_device_launch_op.h deleted file mode 100644 index fbb9319b844..00000000000 --- a/tensorflow/compiler/jit/xla_device_launch_op.h +++ /dev/null @@ -1,50 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_JIT_XLA_DEVICE_LAUNCH_OP_H_ -#define TENSORFLOW_COMPILER_JIT_XLA_DEVICE_LAUNCH_OP_H_ - -#include "tensorflow/core/framework/allocator.h" -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/platform/macros.h" - -namespace tensorflow { - -// The XlaDeviceLaunchOp is used to replace a region of the TensorFlow graph -// which will be compiled and executed using XLA. The XlaDeviceLaunchOp is -// responsible for handling interactions with the TensorFlow executor. -// Once all inputs are present, and their shapes are known, the op can -// use a 'TlaJit' to compile and execute code which is specific -// to the shapes of input Tensors. -class XlaDeviceLaunchOp : public OpKernel { - public: - explicit XlaDeviceLaunchOp(OpKernelConstruction* ctx); - ~XlaDeviceLaunchOp() override; - - void Compute(OpKernelContext* ctx) override; - - private: - NameAttrList function_; - int num_constant_args_; - Tensor dummy_tensor_; - - TF_DISALLOW_COPY_AND_ASSIGN(XlaDeviceLaunchOp); -}; - -} // namespace tensorflow - -#endif // TENSORFLOW_COMPILER_JIT_XLA_DEVICE_LAUNCH_OP_H_ diff --git a/tensorflow/compiler/jit/xla_device_ops.cc b/tensorflow/compiler/jit/xla_device_ops.cc index 0d3a2fa3393..f68dba6b6a2 100644 --- a/tensorflow/compiler/jit/xla_device_ops.cc +++ b/tensorflow/compiler/jit/xla_device_ops.cc @@ -19,13 +19,6 @@ limitations under the License. namespace tensorflow { -void XlaDeviceAssignOp::Copy(OpKernelContext* context, Tensor* lhs, - const Tensor& rhs) { - std::shared_ptr gd = - XlaTransferManager::GetTensorGlobalData(rhs); - XlaTransferManager::SetTensorGlobalData(std::move(gd), lhs); -} - XlaDeviceDummyOp::XlaDeviceDummyOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} void XlaDeviceDummyOp::Compute(OpKernelContext* ctx) { diff --git a/tensorflow/compiler/jit/xla_device_ops.h b/tensorflow/compiler/jit/xla_device_ops.h index 1fcb515ddb3..8699006ebc5 100644 --- a/tensorflow/compiler/jit/xla_device_ops.h +++ b/tensorflow/compiler/jit/xla_device_ops.h @@ -18,9 +18,8 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_JIT_XLA_DEVICE_OPS_H_ #define TENSORFLOW_COMPILER_JIT_XLA_DEVICE_OPS_H_ -#include "tensorflow/compiler/jit/xla_device_launch_op.h" #include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/kernels/assign_op.h" +#include "tensorflow/core/framework/resource_mgr.h" #include "tensorflow/core/kernels/constant_op.h" #include "tensorflow/core/kernels/control_flow_ops.h" #include "tensorflow/core/kernels/identity_op.h" @@ -30,14 +29,6 @@ limitations under the License. namespace tensorflow { -// Implementation of Assign for XLA devices. -class XlaDeviceAssignOp : public AssignOp { - public: - using AssignOp::AssignOp; - - void Copy(OpKernelContext* context, Tensor* lhs, const Tensor& rhs) override; -}; - // Dummy OpKernel, used for kernels assigned to an XLA device that should be // compiled. Should never be called at runtime since such ops should be // rewritten to a _XlaLaunch op. 
If it is called, it means the placer placed an @@ -49,8 +40,11 @@ class XlaDeviceDummyOp : public OpKernel { }; #define REGISTER_XLA_LAUNCH_KERNEL(DEVICE, KERNEL, TYPES) \ - REGISTER_KERNEL_BUILDER( \ - Name("_XlaLaunch").Device(DEVICE).HostMemory("constants"), KERNEL); + REGISTER_KERNEL_BUILDER(Name("_XlaLaunch") \ + .Device(DEVICE) \ + .HostMemory("constants") \ + .HostMemory("resources"), \ + KERNEL); #define REGISTER_XLA_DEVICE_KERNELS(DEVICE, TYPES) \ REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE), SendOp); \ @@ -65,53 +59,13 @@ class XlaDeviceDummyOp : public OpKernel { ConstantOp); \ REGISTER_KERNEL_BUILDER( \ Name("Identity").Device(DEVICE).TypeConstraint("T", TYPES), IdentityOp); \ - REGISTER_KERNEL_BUILDER(Name("Placeholder").Device(DEVICE), \ - XlaDeviceDummyOp); \ + REGISTER_KERNEL_BUILDER(Name("Placeholder").Device(DEVICE), PlaceholderOp); \ + REGISTER_KERNEL_BUILDER(Name("PlaceholderV2").Device(DEVICE), \ + PlaceholderOp); \ \ REGISTER_KERNEL_BUILDER( \ - Name("Variable").Device(DEVICE).TypeConstraint("dtype", TYPES), \ - VariableOp); \ - REGISTER_KERNEL_BUILDER( \ - Name("VariableV2").Device(DEVICE).TypeConstraint("dtype", TYPES), \ - VariableOp); \ - REGISTER_KERNEL_BUILDER( \ - Name("TemporaryVariable").Device(DEVICE).TypeConstraint("dtype", TYPES), \ - TemporaryVariableOp); \ - REGISTER_KERNEL_BUILDER(Name("DestroyTemporaryVariable") \ - .Device(DEVICE) \ - .TypeConstraint("T", TYPES), \ - DestroyTemporaryVariableOp); \ - REGISTER_KERNEL_BUILDER(Name("IsVariableInitialized") \ - .Device(DEVICE) \ - .TypeConstraint("dtype", TYPES) \ - .HostMemory("is_initialized"), \ - IsVariableInitializedOp); \ - REGISTER_KERNEL_BUILDER( \ - Name("Assign").Device(DEVICE).TypeConstraint("T", TYPES), \ - XlaDeviceAssignOp); \ - \ - REGISTER_KERNEL_BUILDER(Name("ControlTrigger").Device(DEVICE), \ - ControlTriggerOp); \ - REGISTER_KERNEL_BUILDER(Name("Enter").Device(DEVICE), EnterOp); \ - REGISTER_KERNEL_BUILDER(Name("Exit").Device(DEVICE), ExitOp); \ - REGISTER_KERNEL_BUILDER(Name("NextIteration").Device(DEVICE), \ - NextIterationOp); \ - REGISTER_KERNEL_BUILDER(Name("Switch").Device(DEVICE).HostMemory("pred"), \ - SwitchOp); \ - REGISTER_KERNEL_BUILDER( \ - Name("Merge").Device(DEVICE).HostMemory("value_index"), MergeOp); \ - REGISTER_KERNEL_BUILDER(Name("LoopCond") \ - .Device(DEVICE) \ - .HostMemory("input") \ - .HostMemory("output"), \ - IdentityOp); - -// TODO(phawkins): do we really need Placeholder? Should it be a real -// implementation of Placeholder? - -// TODO(b/32507444): the registrations for the control flow operators are -// temporary and exist primarily to work around a bug in the graph partitioning -// code. + Name("VarHandleOp").Device(DEVICE).HostMemory("resource"), \ + ResourceHandleOp); } // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_gpu_device.cc b/tensorflow/compiler/jit/xla_gpu_device.cc index db4c86505cb..872588a24e0 100644 --- a/tensorflow/compiler/jit/xla_gpu_device.cc +++ b/tensorflow/compiler/jit/xla_gpu_device.cc @@ -16,16 +16,15 @@ limitations under the License. // Registers the XLA_GPU device, which is an XlaDevice instantiation that runs // operators using XLA via the XLA "CUDA" (GPU) backend. 
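Aside: for readers unfamiliar with the registration macros above, one expansion of `REGISTER_XLA_LAUNCH_KERNEL` looks roughly like the following; the device string is a placeholder, and the macro itself (as changed in this diff) pins both the compile-time-constant inputs and the resource-handle inputs to host memory.

```c++
// Illustrative expansion for a hypothetical device string "XLA_MY_DEVICE".
REGISTER_KERNEL_BUILDER(Name("_XlaLaunch")
                            .Device("XLA_MY_DEVICE")
                            .HostMemory("constants")   // compile-time constants
                            .HostMemory("resources"),  // DT_RESOURCE handles
                        XlaDeviceLaunchOp);
```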
+#include "tensorflow/compiler/jit/kernels/xla_device_launch_op.h" #include "tensorflow/compiler/jit/xla_device.h" #include "tensorflow/compiler/jit/xla_device_ops.h" -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/core/common_runtime/device_factory.h" #include "tensorflow/core/lib/core/status.h" namespace tensorflow { -const char* const DEVICE_XLA_GPU = "XLA_GPU"; - class XlaGpuDeviceFactory : public DeviceFactory { public: Status CreateDevices(const SessionOptions& options, const string& name_prefix, diff --git a/tensorflow/compiler/plugin/BUILD b/tensorflow/compiler/plugin/BUILD new file mode 100644 index 00000000000..8c2e9a7c818 --- /dev/null +++ b/tensorflow/compiler/plugin/BUILD @@ -0,0 +1,38 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Configuration file for an XLA plugin. +- please don't check in changes to this file +- to prevent changes appearing in git status, use: + git update-index --assume-unchanged tensorflow/compiler/plugin/BUILD + +To add additional devices to the XLA subsystem, add targets to the +dependency list in the 'plugin' target. For instance: + + deps = ["//tensorflow/compiler/plugin/example:plugin_lib"], +""" + +licenses(["notice"]) + +package( + default_visibility = ["//visibility:public"], +) + +cc_library( + name = "plugin", + deps = [ + "//tensorflow/compiler/plugin/executor:plugin_lib", + ], +) diff --git a/tensorflow/compiler/plugin/executor/BUILD b/tensorflow/compiler/plugin/executor/BUILD new file mode 100644 index 00000000000..9bc706abdf6 --- /dev/null +++ b/tensorflow/compiler/plugin/executor/BUILD @@ -0,0 +1,32 @@ +licenses(["restricted"]) + +package(default_visibility = ["//visibility:public"]) + +cc_library( + name = "plugin_lib", + srcs = glob([ + "*.cc", + ]), + hdrs = glob([ + "*.h", + ]), + deps = [ + "//tensorflow/compiler/jit:xla_jit_headers_lib", + "//tensorflow/compiler/xla:xla_headers_lib", + "//tensorflow/compiler/xla/service:hlo_evaluator", + "//third_party/eigen3", + "@local_config_cuda//cuda:cuda_headers", + "@protobuf//:protobuf_headers", + ], +) + +filegroup( + name = "all_files", + srcs = glob( + ["**/*"], + exclude = [ + "**/METADATA", + "**/OWNERS", + ], + ), +) diff --git a/tensorflow/compiler/plugin/executor/compiler.cc b/tensorflow/compiler/plugin/executor/compiler.cc new file mode 100644 index 00000000000..893ff152f0c --- /dev/null +++ b/tensorflow/compiler/plugin/executor/compiler.cc @@ -0,0 +1,123 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include + +#include "tensorflow/compiler/plugin/executor/compiler.h" +#include "tensorflow/compiler/plugin/executor/executable.h" + +#include "tensorflow/compiler/xla/service/algebraic_simplifier.h" +#include "tensorflow/compiler/xla/service/flatten_call_graph.h" +#include "tensorflow/compiler/xla/service/hlo_constant_folding.h" +#include "tensorflow/compiler/xla/service/hlo_cse.h" +#include "tensorflow/compiler/xla/service/hlo_dce.h" +#include "tensorflow/compiler/xla/service/hlo_pass_fix.h" +#include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h" +#include "tensorflow/compiler/xla/service/hlo_subcomputation_unification.h" +#include "tensorflow/compiler/xla/service/inliner.h" +#include "tensorflow/compiler/xla/service/reshape_mover.h" +#include "tensorflow/compiler/xla/status_macros.h" + +#include "tensorflow/stream_executor/lib/initialize.h" +#include "tensorflow/stream_executor/lib/strcat.h" + +#include "tensorflow/core/lib/core/errors.h" + +namespace se = ::perftools::gputools; +namespace sep = ::perftools::gputools::executorplugin; +namespace port = ::perftools::gputools::port; + +namespace xla { +namespace executorplugin { + +/* + * Run optimization passes on the module. The graph is transformed by + * each pass in the optimization pipeline. The service subdirectory + * contains useful optimization passes. 
+ */
+Status ExecutorCompiler::RunHloOptimization(HloModule* hlo_module,
+                                            HloDumper dump_hlo) {
+  HloPassPipeline pipeline("Executor", dump_hlo);
+  pipeline.AddPass<Inliner>();
+  pipeline.AddPass<HloSubcomputationUnification>();
+  pipeline.AddPass<HloCSE>(false);
+
+  pipeline.AddPass<HloPassFix<AlgebraicSimplifier>>(
+      false, [](const Shape&, const Shape&) { return false; });
+  pipeline.AddPass<ReshapeMover>();
+  pipeline.AddPass<HloConstantFolding>();
+  pipeline.AddPass<HloCSE>(true);
+
+  pipeline.AddPass<HloDCE>();
+  pipeline.AddPass<FlattenCallGraph>();
+  return pipeline.Run(hlo_module).status();
+}
+
+StatusOr<std::unique_ptr<Executable>> ExecutorCompiler::Compile(
+    std::unique_ptr<HloModule> hlo_module, HloDumper dump_hlo,
+    se::StreamExecutor* stream_exec) {
+  TF_RET_CHECK(stream_exec != nullptr);
+
+  VLOG(1) << "Generate graph " << hlo_module->name();
+
+  TF_RETURN_IF_ERROR(RunHloOptimization(hlo_module.get(), dump_hlo));
+
+  // Typically you would visit the HLO graph, building up a compiled
+  // equivalent. In this case we are using an HLO evaluator at execution time,
+  // so we don't need to compile anything.
+
+  // Create an executable from only the HLO module.
+  std::unique_ptr<Executable> executable;
+  executable.reset(new ExecutorExecutable(std::move(hlo_module)));
+
+  return std::move(executable);
+}
+
+StatusOr<std::vector<std::unique_ptr<Executable>>> ExecutorCompiler::Compile(
+    std::vector<std::unique_ptr<HloModule>> hlo_modules,
+    HloDumper dump_hlos, std::vector<se::StreamExecutor*> stream_execs) {
+
+  return tensorflow::errors::Unimplemented(
+      "Compilation of multiple HLO modules is not supported on Executor.");
+}
+
+StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+ExecutorCompiler::CompileAheadOfTime(
+    std::vector<std::unique_ptr<HloModule>> hlo_modules,
+    HloDumper dump_hlo, const AotCompilationOptions& aot_options) {
+
+  return tensorflow::errors::InvalidArgument(
+      "AOT compilation not supported on Executor");
+}
+
+se::Platform::Id ExecutorCompiler::PlatformId() const {
+  return sep::kExecutorPlatformId;
+}
+
+HloCostAnalysis::ShapeSizeFunction
+ExecutorCompiler::ShapeSizeBytesFunction() const {
+  return ExecutorExecutable::ShapeSizeBytes;
+}
+
+
+}  // namespace executorplugin
+}  // namespace xla
+
+REGISTER_MODULE_INITIALIZER(executor_compiler, {
+  xla::Compiler::RegisterCompilerFactory(sep::kExecutorPlatformId, []() {
+    return xla::MakeUnique<xla::executorplugin::ExecutorCompiler>();
+  });
+});
diff --git a/tensorflow/compiler/plugin/executor/compiler.h b/tensorflow/compiler/plugin/executor/compiler.h
new file mode 100644
index 00000000000..8fe591c8abd
--- /dev/null
+++ b/tensorflow/compiler/plugin/executor/compiler.h
@@ -0,0 +1,64 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_EXECUTOR_COMPILER_H_ +#define TENSORFLOW_COMPILER_EXECUTOR_COMPILER_H_ + +#include + +#include "tensorflow/compiler/xla/service/compiler.h" +#include "tensorflow/compiler/xla/service/executable.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_module_config.h" + +#include "tensorflow/compiler/plugin/executor/platform_id.h" + +namespace xla { +namespace executorplugin { + +class ExecutorCompiler : public Compiler { + public: + ExecutorCompiler() {} + ~ExecutorCompiler() override {} + + StatusOr> Compile( + std::unique_ptr hlo_module, + HloDumper dump_hlo, + perftools::gputools::StreamExecutor* stream_exec) override; + + StatusOr>> Compile( + std::vector> hlo_module, + HloDumper dump_hlo, + std::vector stream_exec) override; + + StatusOr>> + CompileAheadOfTime( + std::vector> module, + HloDumper dump_hlo, const AotCompilationOptions& options) override; + + HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override; + + perftools::gputools::Platform::Id PlatformId() const override; + + private: + Status RunHloOptimization(HloModule* hlo_module, HloDumper dump_hlo); + + TF_DISALLOW_COPY_AND_ASSIGN(ExecutorCompiler); +}; + +} // namespace executorplugin +} // namespace xla + +#endif // TENSORFLOW_COMPILER_EXECUTOR_COMPILER_H_ diff --git a/tensorflow/compiler/plugin/executor/device.cc b/tensorflow/compiler/plugin/executor/device.cc new file mode 100644 index 00000000000..bbc39dc03f8 --- /dev/null +++ b/tensorflow/compiler/plugin/executor/device.cc @@ -0,0 +1,60 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/jit/kernels/xla_device_launch_op.h" +#include "tensorflow/compiler/jit/xla_device.h" +#include "tensorflow/compiler/jit/xla_device_ops.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" + +namespace tensorflow { + +const char* const DEVICE_XLA_EXEC = "XLA_EXEC"; +const char* const DEVICE_EXEC_XLA_JIT = "XLA_EXEC_JIT"; + +constexpr std::array kExecAllTypes = { + {DT_INT32, DT_FLOAT, DT_BOOL, DT_DOUBLE, DT_INT64}}; + +class XlaExaDeviceFactory : public DeviceFactory { + public: + Status CreateDevices(const SessionOptions& options, const string& name_prefix, + std::vector* devices) override; +}; + +Status XlaExaDeviceFactory::CreateDevices(const SessionOptions& options, + const string& name_prefix, + std::vector* devices) { + static XlaDeviceOpRegistrations* registrations = + RegisterXlaDeviceKernels(DEVICE_XLA_EXEC, DEVICE_EXEC_XLA_JIT); + (void)registrations; + + std::unique_ptr device; + TF_RETURN_IF_ERROR(XlaDevice::Create("Executor", DEVICE_XLA_EXEC, 0, + DEVICE_EXEC_XLA_JIT, options, + name_prefix, &device)); + devices->push_back(device.release()); + return Status::OK(); +} + +REGISTER_LOCAL_DEVICE_FACTORY(DEVICE_XLA_EXEC, XlaExaDeviceFactory, 110); + +// Kernel registrations + +static bool OpFilter(KernelDef* kdef) { return true; } + +REGISTER_XLA_LAUNCH_KERNEL(DEVICE_XLA_EXEC, XlaDeviceLaunchOp, kExecAllTypes); +REGISTER_XLA_DEVICE_KERNELS(DEVICE_XLA_EXEC, kExecAllTypes); +REGISTER_XLA_BACKEND(DEVICE_EXEC_XLA_JIT, kExecAllTypes, OpFilter); + +} // namespace tensorflow diff --git a/tensorflow/compiler/plugin/executor/executable.cc b/tensorflow/compiler/plugin/executor/executable.cc new file mode 100644 index 00000000000..92a517ba533 --- /dev/null +++ b/tensorflow/compiler/plugin/executor/executable.cc @@ -0,0 +1,147 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/plugin/executor/executable.h" +#include "tensorflow/compiler/plugin/executor/executor.h" + +#include "tensorflow/compiler/xla/service/hlo_evaluator.h" + +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/shape_util.h" + +namespace se = ::perftools::gputools; +namespace sep = ::perftools::gputools::executorplugin; + +namespace xla { +namespace executorplugin { + +ExecutorExecutable::ExecutorExecutable(std::unique_ptr hlo_module) + : Executable(std::move(hlo_module), ShapeSizeBytes) {} + +ExecutorExecutable::~ExecutorExecutable() {} + +static se::DeviceMemoryBase AllocateSingleOutput(sep::ExecutorExecutor* executor, + const Literal& literal) { + int64 size(xla::ShapeUtil::ByteSizeOf(literal.shape())); + void* buf = executor->Allocate(size); + const void* src = LiteralUtil::InternalData(literal); + memcpy(buf, src, size); + return se::DeviceMemoryBase(buf, size); +} + +static se::DeviceMemoryBase AllocateOutputBuffer(sep::ExecutorExecutor* executor, + const Literal& literal) { + const Shape& shape = literal.shape(); + if (shape.element_type() != xla::TUPLE) { + return AllocateSingleOutput(executor, literal); + } else { + int64 size(xla::ShapeUtil::ByteSizeOf(shape, sizeof(void*))); + void** buf = reinterpret_cast(executor->Allocate(size)); + for (int64 n = 0; n < xla::ShapeUtil::TupleElementCount(shape); n++) { + se::DeviceMemoryBase out = + AllocateSingleOutput(executor, literal.tuple_literals(n)); + *buf++ = out.opaque(); + } + + return se::DeviceMemoryBase(buf, size); + } +} + +StatusOr ExecutorExecutable::ExecuteOnStream( + const ServiceExecutableRunOptions* run_options, + tensorflow::gtl::ArraySlice arguments, + HloExecutionProfile* hlo_execution_profile) { + se::Stream* stream = run_options->stream(); + + VLOG(1) << "Execute " << module().name(); + if (VLOG_IS_ON(2)) { + for (const auto& a : arguments) { + VLOG(2) << "-- argument " << a.opaque(); + } + } + + uint64 start_micros = tensorflow::Env::Default()->NowMicros(); + + HloComputation* computation = module().entry_computation(); + if (computation->num_parameters() != arguments.size()) { + return tensorflow::errors::Internal( + "Mismatch between argument count and graph parameter count."); + } + + // Create the arguments as an vector of XLA literals + std::vector> arg_literals; + std::vector arg_literals_ptrs; + for (int64 p = 0; p < computation->num_parameters(); p++) { + // Create the input literal for the parameter + HloInstruction* param = computation->parameter_instruction(p); + arg_literals.emplace_back(LiteralUtil::CreateFromShape(param->shape())); + arg_literals_ptrs.push_back(arg_literals.back().get()); + + // Copy in the data from the stream_executor buffers + void* buffer = LiteralUtil::MutableInternalData(arg_literals.back().get()); + memcpy(buffer, arguments[p].opaque(), + ShapeUtil::ByteSizeOf(param->shape())); + } + + // Execute the graph using the evaluator + HloEvaluator evaluator; + std::unique_ptr output; + TF_ASSIGN_OR_RETURN(output, + evaluator.Evaluate(computation, arg_literals_ptrs)); + + // Copy the result into the return buffer + perftools::gputools::StreamExecutor* executor(stream->parent()); + sep::ExecutorExecutor* executorExecutor( + static_cast(executor->implementation())); + + se::DeviceMemoryBase ret = + AllocateOutputBuffer(executorExecutor, *(output.get())); + + uint64 end_micros = tensorflow::Env::Default()->NowMicros(); + + { + tensorflow::mutex_lock 
lock(mutex_); + const double nanoseconds = (end_micros - start_micros) * 1000.0; + execution_profile_.set_compute_time_ns(std::max(nanoseconds, 1.0)); + } + + return ret; +} + +StatusOr> ExecutorExecutable::ExecuteOnStream( + const ServiceExecutableRunOptions* run_options, + tensorflow::gtl::ArraySlice arguments, + HloExecutionProfile* hlo_execution_profile) { + return tensorflow::errors::Unimplemented( + "ExecuteOnStream is not yet supported on Executor."); +} + +StatusOr ExecutorExecutable::ExecuteAsyncOnStream( + const ServiceExecutableRunOptions* run_options, + tensorflow::gtl::ArraySlice arguments) { + return tensorflow::errors::Unimplemented( + "ExecuteAsyncOnStream is not yet supported on Executor."); +} + +/*static*/ int64 ExecutorExecutable::ShapeSizeBytes(const Shape& shape) { + if (ShapeUtil::IsOpaque(shape)) { + return sizeof(void*); + } + return ShapeUtil::ByteSizeOf(shape, sizeof(void*)); +} + + +} // namespace executorplugin +} // namespace xla diff --git a/tensorflow/compiler/plugin/executor/executable.h b/tensorflow/compiler/plugin/executor/executable.h new file mode 100644 index 00000000000..ba3d4da21d0 --- /dev/null +++ b/tensorflow/compiler/plugin/executor/executable.h @@ -0,0 +1,65 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_EXECUTOR_DRIVER_EXECUTOR_EXECUTABLE_H_ +#define TENSORFLOW_COMPILER_EXECUTOR_DRIVER_EXECUTOR_EXECUTABLE_H_ + +#include +#include +#include +#include +#include + +#include "tensorflow/compiler/xla/service/executable.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_module_config.h" + +#include "tensorflow/stream_executor/lib/status.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +namespace xla { +namespace executorplugin { + +class ExecutorExecutable : public Executable { + public: + ExecutorExecutable(std::unique_ptr hlo_module); + ~ExecutorExecutable() override; + + StatusOr ExecuteOnStream( + const ServiceExecutableRunOptions* run_options, + tensorflow::gtl::ArraySlice + arguments, + HloExecutionProfile* hlo_execution_profile) override; + + StatusOr> ExecuteOnStream( + const ServiceExecutableRunOptions* run_options, + tensorflow::gtl::ArraySlice arguments, + HloExecutionProfile* hlo_execution_profile) override; + + StatusOr ExecuteAsyncOnStream( + const ServiceExecutableRunOptions* run_options, + tensorflow::gtl::ArraySlice + arguments) override; + + static int64 ShapeSizeBytes(const Shape& shape); + + private: + TF_DISALLOW_COPY_AND_ASSIGN(ExecutorExecutable); +}; + +} // namespace executorplugin +} // namespace xla + +#endif // TENSORFLOW_COMPILER_EXECUTOR_DRIVER_EXECUTOR_EXECUTABLE_H_ diff --git a/tensorflow/compiler/plugin/executor/executor.cc b/tensorflow/compiler/plugin/executor/executor.cc new file mode 100644 index 00000000000..e72c2711f79 --- /dev/null +++ b/tensorflow/compiler/plugin/executor/executor.cc @@ -0,0 +1,135 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/plugin/executor/executor.h" +#include "tensorflow/compiler/plugin/executor/platform_id.h" + +#include "tensorflow/compiler/xla/status_macros.h" + +#include +#include + +namespace se = ::perftools::gputools; + +namespace perftools { +namespace gputools { +namespace executorplugin { + +host::HostStream *AsExecutorStream(Stream *stream) { + DCHECK(stream != nullptr); + return dynamic_cast(stream->implementation()); +} + +ExecutorExecutor::ExecutorExecutor(const PluginConfig &plugin_config) + : plugin_config_(plugin_config) {} + +ExecutorExecutor::~ExecutorExecutor() {} + +void *ExecutorExecutor::Allocate(uint64 size) { + void *buf = new char[size]; + return buf; +} + +void *ExecutorExecutor::AllocateSubBuffer(DeviceMemoryBase *parent, + uint64 offset_bytes, + uint64 size_bytes) { + return parent + offset_bytes; +} + +void ExecutorExecutor::Deallocate(DeviceMemoryBase *mem) { + if (!mem->is_sub_buffer()) { + delete[] static_cast(mem->opaque()); + } +} + +bool ExecutorExecutor::Memcpy(Stream *stream, void *host_dst, + const DeviceMemoryBase &dev_src, uint64 size) { + AsExecutorStream(stream)->EnqueueTask([this, host_dst, dev_src, size]() { + port::Status ok = SynchronousMemcpy(host_dst, dev_src, size); + }); + return true; +} + +bool ExecutorExecutor::Memcpy(Stream *stream, DeviceMemoryBase *dev_dst, + const void *host_src, uint64 size) { + AsExecutorStream(stream)->EnqueueTask([this, dev_dst, host_src, size]() { + port::Status ok = SynchronousMemcpy(dev_dst, host_src, size); + }); + return true; +} + +port::Status ExecutorExecutor::SynchronousMemcpy(DeviceMemoryBase *dev_dst, + const void *host_src, + uint64 size) { + memcpy(dev_dst->opaque(), host_src, size); + return port::Status::OK(); +} + +port::Status ExecutorExecutor::SynchronousMemcpy(void *host_dst, + const DeviceMemoryBase &dev_src, + uint64 size) { + memcpy(host_dst, dev_src.opaque(), size); + return port::Status::OK(); +} + +bool ExecutorExecutor::HostCallback(Stream *stream, + std::function callback) { + AsExecutorStream(stream)->EnqueueTask(callback); + return true; +} + +bool ExecutorExecutor::CreateStreamDependency(Stream *dependent, Stream *other) { + AsExecutorStream(dependent)->EnqueueTask( + [other]() { other->BlockHostUntilDone(); }); + AsExecutorStream(dependent)->BlockUntilDone(); + return true; +} + +bool ExecutorExecutor::StartTimer(Stream *stream, Timer *timer) { + dynamic_cast(timer->implementation())->Start(stream); + return true; +} + +bool ExecutorExecutor::StopTimer(Stream *stream, Timer *timer) { + dynamic_cast(timer->implementation())->Stop(stream); + return true; +} + +bool ExecutorExecutor::BlockHostUntilDone(Stream *stream) { + AsExecutorStream(stream)->BlockUntilDone(); + return true; +} + +DeviceDescription *ExecutorExecutor::PopulateDeviceDescription() const { + internal::DeviceDescriptionBuilder builder; + + builder.set_device_address_bits(64); + + builder.set_name("Executor"); + builder.set_device_vendor("VectorName"); + builder.set_platform_version("1.0"); + builder.set_driver_version("1.0"); + builder.set_runtime_version("1.0"); + builder.set_pci_bus_id("1"); + builder.set_device_memory_size(static_cast(4) * 1024 * 1024 * 1024); + builder.set_clock_rate_ghz(static_cast(CLOCKS_PER_SEC) / 1e9); + + auto built = builder.Build(); + return built.release(); +} + +} // namespace executorplugin +} // namespace gputools +} // namespace perftools diff --git 
diff --git a/tensorflow/compiler/plugin/executor/executor.h b/tensorflow/compiler/plugin/executor/executor.h
new file mode 100644
index 00000000000..32fdb157e48
--- /dev/null
+++ b/tensorflow/compiler/plugin/executor/executor.h
@@ -0,0 +1,213 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Declares the ExecutorExecutor class, which is a CPU-only implementation of
+// the StreamExecutor interface. For now, this is used for testing and to
+// examine the performance of host-based StreamExecutor code.
+#ifndef TENSORFLOW_COMPILER_EXECUTOR_STREAM_EXECUTOR_EXECUTOR_EXECUTOR_H_
+#define TENSORFLOW_COMPILER_EXECUTOR_STREAM_EXECUTOR_EXECUTOR_EXECUTOR_H_
+
+#include "tensorflow/stream_executor/host/host_stream.h"
+#include "tensorflow/stream_executor/host/host_timer.h"
+
+#include "tensorflow/compiler/xla/shape_util.h"
+
+#include "tensorflow/stream_executor/blas.h"
+#include "tensorflow/stream_executor/lib/error.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/lib/statusor.h"
+#include "tensorflow/stream_executor/rng.h"
+#include "tensorflow/stream_executor/stream_executor.h"
+#include "tensorflow/stream_executor/stream_executor_internal.h"
+
+#include <list>
+#include <memory>
+
+namespace perftools {
+namespace gputools {
+namespace executorplugin {
+
+using Args = tensorflow::gtl::ArraySlice<DeviceMemoryBase>;
+
+class ExecutorExecutor : public internal::StreamExecutorInterface {
+ public:
+  explicit ExecutorExecutor(const PluginConfig &plugin_config);
+  ~ExecutorExecutor() override;
+
+  port::Status Init(int device_ordinal,
+                    DeviceOptions device_options) override {
+    return port::Status::OK();
+  }
+
+  bool GetKernel(const MultiKernelLoaderSpec &spec,
+                 KernelBase *kernel) override {
+    return false;
+  }
+  bool Launch(Stream *stream, const ThreadDim &thread_dims,
+              const BlockDim &block_dims, const KernelBase &kernel,
+              const KernelArgsArrayBase &args) override {
+    return false;
+  }
+
+  void *Allocate(uint64 size) override;
+  void *AllocateSubBuffer(DeviceMemoryBase *mem, uint64 offset_bytes,
+                          uint64 size_bytes) override;
+  void Deallocate(DeviceMemoryBase *mem) override;
+
+  void *HostMemoryAllocate(uint64 size) override { return new char[size]; }
+  void HostMemoryDeallocate(void *mem) override {
+    delete[] static_cast<char *>(mem);
+  }
+  bool HostMemoryRegister(void *mem, uint64 size) override { return true; }
+  bool HostMemoryUnregister(void *mem) override { return true; }
+
+  bool Memcpy(Stream *stream, void *host_dst, const DeviceMemoryBase &pop_src,
+              uint64 size) override;
+  bool Memcpy(Stream *stream, DeviceMemoryBase *pop_dst, const void *host_src,
+              uint64 size) override;
+  bool MemcpyDeviceToDevice(Stream *stream, DeviceMemoryBase *pop_dst,
+                            const DeviceMemoryBase &host_src,
+                            uint64 size) override {
+    return false;
+  }
+
+  bool MemZero(Stream *stream, DeviceMemoryBase *location,
+               uint64 size) override {
+    return false;
+  }
+  bool Memset(Stream *stream, DeviceMemoryBase *location, uint8 pattern,
+              uint64 size) override {
+    return false;
+  }
+  bool Memset32(Stream *stream, DeviceMemoryBase *location, uint32 pattern,
+                uint64 size) override {
+    return false;
+  }
+
+  // No "synchronize all activity" implemented for this platform at the moment.
+  bool SynchronizeAllActivity() override { return false; }
+  bool SynchronousMemZero(DeviceMemoryBase *location, uint64 size) override {
+    return false;
+  }
+
+  bool SynchronousMemSet(DeviceMemoryBase *location, int value,
+                         uint64 size) override {
+    return false;
+  }
+
+  port::Status SynchronousMemcpy(DeviceMemoryBase *pop_dst,
+                                 const void *host_src, uint64 size) override;
+  port::Status SynchronousMemcpy(void *host_dst,
+                                 const DeviceMemoryBase &pop_src,
+                                 uint64 size) override;
+  port::Status SynchronousMemcpyDeviceToDevice(DeviceMemoryBase *pop_dst,
+                                               const DeviceMemoryBase &pop_src,
+                                               uint64 size) override {
+    return port::Status{port::error::UNIMPLEMENTED, ""};
+  }
+
+  bool HostCallback(Stream *stream, std::function<void()> callback) override;
+
+  port::Status AllocateEvent(Event *event) override {
+    return port::Status{port::error::UNIMPLEMENTED, ""};
+  }
+
+  port::Status DeallocateEvent(Event *event) override {
+    return port::Status{port::error::UNIMPLEMENTED, ""};
+  }
+
+  port::Status RecordEvent(Stream *stream, Event *event) override {
+    return port::Status{port::error::UNIMPLEMENTED, ""};
+  }
+
+  port::Status WaitForEvent(Stream *stream, Event *event) override {
+    return port::Status{port::error::UNIMPLEMENTED, ""};
+  }
+
+  Event::Status PollForEventStatus(Event *event) override {
+    return Event::Status::kError;
+  }
+
+  bool AllocateStream(Stream *stream) override { return true; }
+  void DeallocateStream(Stream *stream) override {}
+  bool CreateStreamDependency(Stream *dependent, Stream *other) override;
+
+  bool AllocateTimer(Timer *timer) override { return true; }
+  void DeallocateTimer(Timer *timer) override {}
+  bool StartTimer(Stream *stream, Timer *timer) override;
+  bool StopTimer(Stream *stream, Timer *timer) override;
+
+  bool BlockHostUntilDone(Stream *stream) override;
+
+  int PlatformDeviceCount() override { return 1; }
+
+  bool DeviceMemoryUsage(int64 *free, int64 *total) const override {
+    return false;
+  }
+
+  DeviceDescription *PopulateDeviceDescription() const override;
+
+  port::Status EnablePeerAccessTo(StreamExecutorInterface *other) override {
+    return port::Status::OK();
+  }
+
+  bool CanEnablePeerAccessTo(StreamExecutorInterface *other) override {
+    return true;
+  }
+
+  SharedMemoryConfig GetDeviceSharedMemoryConfig() override {
+    return SharedMemoryConfig::kDefault;
+  }
+
+  port::Status SetDeviceSharedMemoryConfig(SharedMemoryConfig config) override {
+    return port::Status{port::error::UNIMPLEMENTED,
+                        "Shared memory not supported"};
+  }
+
+  std::unique_ptr<internal::EventInterface> CreateEventImplementation()
+      override {
+    return nullptr;
+  }
+
+  std::unique_ptr<internal::KernelInterface> CreateKernelImplementation()
+      override {
+    return nullptr;
+  }
+
+  std::unique_ptr<internal::StreamInterface> GetStreamImplementation()
+      override {
+    return std::unique_ptr<internal::StreamInterface>(new host::HostStream());
+  }
+
+  std::unique_ptr<internal::TimerInterface> GetTimerImplementation() override {
+    return std::unique_ptr<internal::TimerInterface>(new host::HostTimer());
+  }
+
+  port::StatusOr<DeviceMemoryBase> ExecuteGraph(const xla::Shape &shape,
+                                                Args args);
+
+ private:
+  DeviceMemoryBase AllocateSingleOutput(const xla::Shape &shape);
+
+  port::StatusOr<DeviceMemoryBase> AllocateOutputBuffer(
+      const xla::Shape &shape);
+
+  const PluginConfig plugin_config_;
+};
+
+}  // namespace executorplugin
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_COMPILER_EXECUTOR_STREAM_EXECUTOR_EXECUTOR_EXECUTOR_H_
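
A pattern worth noting in this header and in `executor.cc`: the public `Stream` and `Timer` objects own an opaque `implementation()` pointer, and the plugin `dynamic_cast`s it back to the host-backed type it installed via `GetStreamImplementation`/`GetTimerImplementation` (see `AsExecutorStream` above). A standalone sketch of that wrap-and-downcast shape, with hypothetical names throughout:

```c++
#include <cassert>
#include <memory>
#include <utility>

// Hypothetical stand-ins for StreamInterface / HostStream / Stream.
struct StreamInterfaceSketch {
  virtual ~StreamInterfaceSketch() = default;
};

struct HostStreamSketch : StreamInterfaceSketch {
  int enqueued = 0;
  void EnqueueTask() { ++enqueued; }
};

class StreamSketch {
 public:
  explicit StreamSketch(std::unique_ptr<StreamInterfaceSketch> impl)
      : impl_(std::move(impl)) {}
  StreamInterfaceSketch *implementation() { return impl_.get(); }

 private:
  std::unique_ptr<StreamInterfaceSketch> impl_;
};

// Mirrors AsExecutorStream: recover the concrete host stream. The
// dynamic_cast is safe only because this platform always installs a
// HostStreamSketch via its GetStreamImplementation equivalent.
HostStreamSketch *AsHostStream(StreamSketch *stream) {
  auto *host = dynamic_cast<HostStreamSketch *>(stream->implementation());
  assert(host != nullptr);
  return host;
}

int main() {
  StreamSketch stream(std::make_unique<HostStreamSketch>());
  AsHostStream(&stream)->EnqueueTask();
}
```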
diff --git a/tensorflow/compiler/plugin/executor/platform.cc b/tensorflow/compiler/plugin/executor/platform.cc
new file mode 100644
index 00000000000..2f339f04a7b
--- /dev/null
+++ b/tensorflow/compiler/plugin/executor/platform.cc
@@ -0,0 +1,125 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/plugin/executor/platform.h"
+#include "tensorflow/compiler/plugin/executor/executor.h"
+#include "tensorflow/compiler/plugin/executor/platform_id.h"
+
+#include "tensorflow/stream_executor/lib/error.h"
+#include "tensorflow/stream_executor/lib/initialize.h"
+#include "tensorflow/stream_executor/lib/ptr_util.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/lib/status_macros.h"
+#include "tensorflow/stream_executor/lib/stringprintf.h"
+
+namespace se = ::perftools::gputools;
+namespace sep = ::perftools::gputools::executorplugin;
+
+namespace perftools {
+namespace gputools {
+namespace executorplugin {
+
+PLATFORM_DEFINE_ID(kExecutorPlatformId);
+
+ExecutorPlatform::ExecutorPlatform() : name_("Executor") {}
+
+ExecutorPlatform::~ExecutorPlatform() {}
+
+Platform::Id ExecutorPlatform::id() const { return kExecutorPlatformId; }
+
+int ExecutorPlatform::VisibleDeviceCount() const { return 1; }
+
+const string& ExecutorPlatform::Name() const { return name_; }
+
+port::StatusOr<StreamExecutor*> ExecutorPlatform::ExecutorForDevice(
+    int ordinal) {
+  StreamExecutorConfig config;
+  config.ordinal = ordinal;
+  config.plugin_config = PluginConfig();
+  config.device_options = DeviceOptions::Default();
+  return GetExecutor(config);
+}
+
+port::StatusOr<StreamExecutor*>
+ExecutorPlatform::ExecutorForDeviceWithPluginConfig(
+    int device_ordinal, const PluginConfig& plugin_config) {
+  StreamExecutorConfig config;
+  config.ordinal = device_ordinal;
+  config.plugin_config = plugin_config;
+  config.device_options = DeviceOptions::Default();
+  return GetExecutor(config);
+}
+
+port::StatusOr<StreamExecutor*> ExecutorPlatform::GetExecutor(
+    const StreamExecutorConfig& config) {
+  mutex_lock lock(executors_mutex_);
+
+  port::StatusOr<StreamExecutor*> status = executor_cache_.Get(config);
+  if (status.ok()) {
+    return status.ValueOrDie();
+  }
+
+  port::StatusOr<std::unique_ptr<StreamExecutor>> executor =
+      GetUncachedExecutor(config);
+  if (!executor.ok()) {
+    return executor.status();
+  }
+
+  StreamExecutor* naked_executor = executor.ValueOrDie().get();
+  SE_RETURN_IF_ERROR(
+      executor_cache_.Insert(config, executor.ConsumeValueOrDie()));
+  return naked_executor;
+}
+
+port::StatusOr<std::unique_ptr<StreamExecutor>>
+ExecutorPlatform::GetUncachedExecutor(const StreamExecutorConfig& config) {
+  auto executor = port::MakeUnique<StreamExecutor>(
+      this, port::MakeUnique<ExecutorExecutor>(config.plugin_config));
+  auto init_status = executor->Init(config.ordinal, config.device_options);
+  if (!init_status.ok()) {
+    return port::Status{
+        port::error::INTERNAL,
+        port::Printf(
+            "failed initializing StreamExecutor for device ordinal %d: %s",
+            config.ordinal, init_status.ToString().c_str())};
+  }
+
+  return std::move(executor);
+}
+
+void ExecutorPlatform::RegisterTraceListener(
+    std::unique_ptr<TraceListener> listener) {
+  LOG(FATAL) << "not yet implemented: register executor trace listener";
+}
+
+void ExecutorPlatform::UnregisterTraceListener(TraceListener* listener) {
+  LOG(FATAL) << "not yet implemented: unregister executor trace listener";
+}
+
+static void InitializeExecutorPlatform() {
+  std::unique_ptr<se::Platform> platform(new sep::ExecutorPlatform);
+  SE_CHECK_OK(se::MultiPlatformManager::RegisterPlatform(std::move(platform)));
+}
+
+}  // namespace executorplugin
+}  // namespace gputools
+}  // namespace perftools
+
+REGISTER_MODULE_INITIALIZER(executor_platform,
+                            sep::InitializeExecutorPlatform());
+
+DECLARE_MODULE_INITIALIZER(multi_platform_manager);
+// Note that module initialization sequencing is not supported in the
+// open-source project, so this will be a no-op there.
+REGISTER_MODULE_INITIALIZER_SEQUENCE(executor_platform,
+                                     multi_platform_manager);
diff --git a/tensorflow/compiler/plugin/executor/platform.h b/tensorflow/compiler/plugin/executor/platform.h
new file mode 100644
index 00000000000..c252a589d49
--- /dev/null
+++ b/tensorflow/compiler/plugin/executor/platform.h
@@ -0,0 +1,83 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_EXECUTOR_STREAM_EXECUTOR_EXECUTOR_PLATFORM_H_
+#define TENSORFLOW_COMPILER_EXECUTOR_STREAM_EXECUTOR_EXECUTOR_PLATFORM_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/stream_executor/executor_cache.h"
+#include "tensorflow/stream_executor/lib/statusor.h"
+#include "tensorflow/stream_executor/multi_platform_manager.h"
+#include "tensorflow/stream_executor/platform.h"
+#include "tensorflow/stream_executor/platform/mutex.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/platform/thread_annotations.h"
+#include "tensorflow/stream_executor/stream_executor_pimpl.h"
+#include "tensorflow/stream_executor/trace_listener.h"
+
+namespace perftools {
+namespace gputools {
+namespace executorplugin {
+
+class ExecutorPlatform : public Platform {
+ public:
+  ExecutorPlatform();
+  ~ExecutorPlatform() override;
+
+  Platform::Id id() const override;
+
+  // Device count is less clear-cut for CPUs than accelerators; this platform
+  // currently exposes a single executor device.
+  int VisibleDeviceCount() const override;
+
+  const string& Name() const override;
+
+  port::StatusOr<StreamExecutor*> ExecutorForDevice(int ordinal) override;
+
+  port::StatusOr<StreamExecutor*> ExecutorForDeviceWithPluginConfig(
+      int ordinal, const PluginConfig& config) override;
+
+  port::StatusOr<StreamExecutor*> GetExecutor(
+      const StreamExecutorConfig& config) override;
+
+  port::StatusOr<std::unique_ptr<StreamExecutor>> GetUncachedExecutor(
+      const StreamExecutorConfig& config) override;
+
+  void RegisterTraceListener(std::unique_ptr<TraceListener> listener) override;
+
+  void UnregisterTraceListener(TraceListener* listener) override;
+
+ private:
+  // This platform's name.
+  string name_;
+
+  // Mutex that guards the ordinal-to-executor map.
+  mutable mutex executors_mutex_;
+
+  // Cache of created StreamExecutors.
+  ExecutorCache executor_cache_;
+
+  SE_DISALLOW_COPY_AND_ASSIGN(ExecutorPlatform);
+};
+
+}  // namespace executorplugin
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_COMPILER_EXECUTOR_STREAM_EXECUTOR_EXECUTOR_PLATFORM_H_
diff --git a/tensorflow/compiler/plugin/executor/platform_id.h b/tensorflow/compiler/plugin/executor/platform_id.h
new file mode 100644
index 00000000000..8d2b29a3e4e
--- /dev/null
+++ b/tensorflow/compiler/plugin/executor/platform_id.h
@@ -0,0 +1,31 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_EXECUTOR_PLATFORM_ID_H_
+#define TENSORFLOW_STREAM_EXECUTOR_EXECUTOR_PLATFORM_ID_H_
+
+#include "tensorflow/stream_executor/platform.h"
+
+namespace perftools {
+namespace gputools {
+namespace executorplugin {
+
+extern const Platform::Id kExecutorPlatformId;
+
+}  // namespace executorplugin
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_EXECUTOR_PLATFORM_ID_H_
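
`platform.cc` and (below) `transfer_manager.cc` both hook themselves into the runtime with static initializers: a factory lands in a global registry at load time (`REGISTER_MODULE_INITIALIZER`, `InitModule`), so no central list has to be edited to add a backend. A minimal standalone sketch of that registration idiom, with illustrative names only:

```c++
#include <functional>
#include <iostream>
#include <map>
#include <memory>
#include <string>

struct PlatformSketch {
  virtual ~PlatformSketch() = default;
  virtual std::string Name() const = 0;
};

// Global registry keyed by platform name, loosely mirroring
// MultiPlatformManager. Heap-allocated to dodge destruction-order issues.
using Factory = std::function<std::unique_ptr<PlatformSketch>()>;
std::map<std::string, Factory>& Registry() {
  static auto* registry = new std::map<std::string, Factory>();
  return *registry;
}

struct ExecutorPlatformSketch : PlatformSketch {
  std::string Name() const override { return "Executor"; }
};

// Runs before main() via static initialization, like InitModule() above.
static bool registered = [] {
  Registry()["Executor"] = [] {
    return std::unique_ptr<PlatformSketch>(new ExecutorPlatformSketch());
  };
  return true;
}();

int main() {
  auto platform = Registry()["Executor"]();
  std::cout << platform->Name() << "\n";  // prints "Executor"
}
```

As the comment at the end of `platform.cc` notes, ordering between such initializers is not enforced in the open-source build, so registries written this way must tolerate being populated in any order.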
diff --git a/tensorflow/compiler/plugin/executor/transfer_manager.cc b/tensorflow/compiler/plugin/executor/transfer_manager.cc
new file mode 100644
index 00000000000..b59d20a7791
--- /dev/null
+++ b/tensorflow/compiler/plugin/executor/transfer_manager.cc
@@ -0,0 +1,182 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/plugin/executor/transfer_manager.h"
+#include "tensorflow/compiler/plugin/executor/platform_id.h"
+
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace sep = ::perftools::gputools::executorplugin;
+
+namespace xla {
+namespace executorplugin {
+
+ExecutorTransferManager::ExecutorTransferManager() {}
+
+se::Platform::Id ExecutorTransferManager::PlatformId() const {
+  return se::executorplugin::kExecutorPlatformId;
+}
+
+Status ExecutorTransferManager::TransferLiteralFromDevice(
+    se::StreamExecutor* executor, const se::DeviceMemoryBase& source,
+    const Shape& device_shape, const Shape& literal_shape, Literal* literal) {
+  TF_RET_CHECK(ShapeUtil::Compatible(device_shape, literal_shape));
+
+  // Tuples are a special case: they contain one or more shapes nested to
+  // an arbitrary depth.
+  if (device_shape.element_type() == TUPLE) {
+    *literal->mutable_shape() = literal_shape;
+    TF_ASSIGN_OR_RETURN(
+        std::vector<se::DeviceMemoryBase> element_buffers,
+        ShallowCopyTupleFromDevice(executor, source, device_shape));
+    TF_RET_CHECK(element_buffers.size() ==
+                 ShapeUtil::TupleElementCount(device_shape));
+    for (int64 i = 0; i < element_buffers.size(); ++i) {
+      const Shape& element_device_shape = device_shape.tuple_shapes(i);
+      const Shape& element_literal_shape = literal_shape.tuple_shapes(i);
+      Literal* element_literal = literal->add_tuple_literals();
+      // Recursively call TransferLiteralFromDevice to copy over the data in
+      // the element array.
+      TF_RETURN_IF_ERROR(TransferLiteralFromDevice(
+          executor, element_buffers[i], element_device_shape,
+          element_literal_shape, element_literal));
+    }
+    return Status::OK();
+  }
+
+  *literal->mutable_shape() = device_shape;
+  LiteralUtil::Reserve(ShapeUtil::ElementsIn(device_shape), literal);
+  TF_RETURN_IF_ERROR(TransferBufferFromDevice(
+      executor, source, ShapeUtil::ByteSizeOf(device_shape),
+      LiteralUtil::MutableInternalData(literal)));
+  if (!ShapeUtil::Equal(literal_shape, device_shape)) {
+    literal->Swap(
+        LiteralUtil::Relayout(*literal, literal_shape.layout()).get());
+  }
+  TF_RET_CHECK(ShapeUtil::Equal(literal_shape, literal->shape()));
+  return Status::OK();
+}
+
+StatusOr<std::vector<se::DeviceMemoryBase>>
+ExecutorTransferManager::ShallowCopyTupleFromDevice(
+    se::StreamExecutor* executor, const se::DeviceMemoryBase& source,
+    const Shape& shape) {
+  TF_RET_CHECK(ShapeUtil::IsTuple(shape));
+
+  std::vector<void*> element_pointers(ShapeUtil::TupleElementCount(shape),
+                                      nullptr);
+  int64 tuple_size = ShapeUtil::ByteSizeOf(shape, sizeof(void*));
+  auto copy_status = executor->SynchronousMemcpyD2H(source, tuple_size,
+                                                    element_pointers.data());
+  if (!copy_status.ok()) {
+    return AddStatus(
+        Status(static_cast<tensorflow::error::Code>(copy_status.code()),
+               copy_status.error_message()),
+        "failed transfer of tuple buffer " + ShapeUtil::HumanString(shape));
+  }
+
+  // Create a DeviceMemoryBase from each void* pointer.
+  std::vector<se::DeviceMemoryBase> destination;
+  for (int i = 0; i < element_pointers.size(); ++i) {
+    if (element_pointers[i] == nullptr &&
+        !ShapeUtil::HasZeroElements(shape.tuple_shapes(i))) {
+      return FailedPrecondition("tuple contains nullptr at element %d", i);
+    }
+    int64 buffer_size =
+        ShapeUtil::ByteSizeOf(shape.tuple_shapes(i), sizeof(void*));
+    destination.emplace_back(element_pointers[i], buffer_size);
+  }
+  return std::move(destination);
+}
+
+Status ExecutorTransferManager::TransferLiteralToDevice(
+    se::StreamExecutor* executor, const Literal& literal,
+    se::DeviceMemoryBase* destination) {
+  const Shape& shape = literal.shape();
+
+  if (ShapeUtil::IsTuple(literal.shape())) {
+    std::vector<void*> tuple_elements_on_device;
+    for (const Literal& tuple_element : literal.tuple_literals()) {
+      se::DeviceMemoryBase allocation = executor->AllocateArray<uint8>(
+          GetByteSizeRequirement(tuple_element.shape()));
+      TF_RETURN_IF_ERROR(
+          TransferLiteralToDevice(executor, tuple_element, &allocation));
+      tuple_elements_on_device.push_back(allocation.opaque());
+    }
+    return TransferBufferToDevice(
+        executor, tuple_elements_on_device.size() * sizeof(void*),
+        tuple_elements_on_device.data(), destination);
+  }
+
+  return TransferBufferToDevice(executor, GetByteSizeRequirement(shape),
+                                LiteralUtil::InternalData(literal),
+                                destination);
+}
+
+Status ExecutorTransferManager::TransferLiteralToInfeed(
+    se::StreamExecutor* executor, const Literal& literal) {
+  const Shape& shape = literal.shape();
+  VLOG(1) << "transferring literal shape to infeed: "
+          << ShapeUtil::HumanString(shape);
+
+  return Status::OK();
+}
+
+Status ExecutorTransferManager::TransferLiteralFromOutfeed(
+    perftools::gputools::StreamExecutor* executor, const Shape& literal_shape,
+    Literal* literal) {
+  const Shape& shape = literal->shape();
+  VLOG(1) << "transferring literal shape from outfeed: "
+          << ShapeUtil::HumanString(shape);
+
+  return Status::OK();
+}
+
+Status ExecutorTransferManager::ResetDevices(
+    tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
+        executors) {
+  return Unimplemented("Device reset not supported");
+}
+
+int64 ExecutorTransferManager::GetByteSizeRequirement(const Shape& shape) {
+  return ShapeUtil::ByteSizeOf(shape, sizeof(void*));
+}
+
+}  // namespace executorplugin
+}  // namespace xla
+
+static std::unique_ptr<xla::TransferManager> CreateExecutorTransferManager() {
+  return xla::MakeUnique<xla::executorplugin::ExecutorTransferManager>();
+}
+
+static bool InitModule() {
+  xla::TransferManager::RegisterTransferManager(sep::kExecutorPlatformId,
+                                                &CreateExecutorTransferManager);
+  return true;
+}
+static bool module_initialized = InitModule();
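
`ShallowCopyTupleFromDevice` above depends on how XLA lays out a tuple in device memory: a flat array of per-element buffer pointers, which for this host-backed platform is ordinary host memory. A standalone sketch of reading such a pointer table and wrapping each entry with its byte size, using hypothetical types (`BufferRefSketch` stands in for `se::DeviceMemoryBase`):

```c++
#include <cstdint>
#include <cstring>
#include <vector>

struct BufferRefSketch {  // stand-in for se::DeviceMemoryBase
  void* ptr;
  uint64_t size;
};

// Given the tuple's backing store (an array of element pointers) and the
// byte size of each element, produce shallow references without copying
// the element data itself; this is the essence of ShallowCopyTupleFromDevice.
std::vector<BufferRefSketch> ShallowCopyTuple(
    const void* tuple_storage, const std::vector<uint64_t>& element_sizes) {
  std::vector<void*> pointers(element_sizes.size());
  std::memcpy(pointers.data(), tuple_storage,
              pointers.size() * sizeof(void*));
  std::vector<BufferRefSketch> refs;
  refs.reserve(pointers.size());
  for (size_t i = 0; i < pointers.size(); ++i) {
    refs.push_back({pointers[i], element_sizes[i]});
  }
  return refs;
}

int main() {
  float a[2] = {1.f, 2.f};
  int b[3] = {3, 4, 5};
  void* tuple[2] = {a, b};  // "device" tuple = array of element pointers
  auto refs = ShallowCopyTuple(tuple, {sizeof(a), sizeof(b)});
  return refs[0].ptr == a && refs[1].ptr == b ? 0 : 1;
}
```

The real code additionally validates null entries against zero-element shapes and computes sizes with `ShapeUtil::ByteSizeOf`, as shown in the file above.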
diff --git a/tensorflow/compiler/plugin/executor/transfer_manager.h b/tensorflow/compiler/plugin/executor/transfer_manager.h
new file mode 100644
index 00000000000..22142cd778a
--- /dev/null
+++ b/tensorflow/compiler/plugin/executor/transfer_manager.h
@@ -0,0 +1,74 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_EXECUTOR_DRIVER_EXECUTOR_TRANSFER_MANAGER_H_
+#define TENSORFLOW_COMPILER_EXECUTOR_DRIVER_EXECUTOR_TRANSFER_MANAGER_H_
+
+#include "tensorflow/compiler/xla/service/transfer_manager.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/core/platform/types.h"
+
+#include <vector>
+
+namespace se = ::perftools::gputools;
+
+namespace xla {
+namespace executorplugin {
+
+class ExecutorTransferManager : public TransferManager {
+ public:
+  ExecutorTransferManager();
+
+  ~ExecutorTransferManager() override {}
+
+  se::Platform::Id PlatformId() const override;
+
+  StatusOr<std::vector<se::DeviceMemoryBase>> ShallowCopyTupleFromDevice(
+      se::StreamExecutor* executor, const se::DeviceMemoryBase& source,
+      const Shape& shape) override;
+
+  Status TransferLiteralFromDevice(se::StreamExecutor* executor,
+                                   const se::DeviceMemoryBase& source,
+                                   const Shape& device_shape,
+                                   const Shape& literal_shape,
+                                   Literal* literal) override;
+
+  Status TransferLiteralToDevice(se::StreamExecutor* executor,
+                                 const Literal& literal,
+                                 se::DeviceMemoryBase* destination) override;
+
+  Status TransferLiteralToInfeed(se::StreamExecutor* executor,
+                                 const Literal& literal) override;
+
+  Status TransferLiteralFromOutfeed(se::StreamExecutor* executor,
+                                    const Shape& literal_shape,
+                                    Literal* literal) override;
+
+  Status ResetDevices(
+      tensorflow::gtl::ArraySlice<se::StreamExecutor*> executors) override;
+
+  int64 GetByteSizeRequirement(const Shape& shape) override;
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(ExecutorTransferManager);
+};
+
+}  // namespace executorplugin
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_EXECUTOR_DRIVER_EXECUTOR_TRANSFER_MANAGER_H_
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index a3c634c1abf..4bbb2767ac0 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -1,11 +1,23 @@
 licenses(["notice"])  # Apache 2.0
 
-package(
-    default_visibility = [
+package_group(
+    name = "internal",
+    includes = [
         "//tensorflow/compiler/tf2xla:internal",
     ],
 )
 
+package_group(
+    name = "friends",
+    includes = [
+        "//tensorflow/compiler/tf2xla:friends",
+    ],
+)
+
+package(
+    default_visibility = [":internal"],
+)
+
 load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 load("//tensorflow/compiler/aot:tfcompile.bzl", "tf_library")
@@ -19,6 +31,7 @@ py_library(
     testonly = 1,
     srcs = ["xla_test.py"],
     srcs_version = "PY2AND3",
+    visibility = [":friends"],
     deps = [
         "//tensorflow/contrib/compiler:compiler_py",
         "//tensorflow/core:protos_all_py",
@@ -38,6 +51,34 @@ cc_library(
     deps = ["//tensorflow/core:framework_lite"],
 )
 
+tf_xla_py_test(
+    name = "adagrad_test",
+    size = "small",
+    srcs = ["adagrad_test.py"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:training",
+    ],
+)
+
+tf_xla_py_test(
+    name = "adam_test",
+    size = "small",
+    srcs = ["adam_test.py"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:training",
+    ],
+)
+
 tf_xla_py_test(
     name =
"binary_ops_test", size = "small", @@ -100,6 +141,22 @@ tf_xla_py_test( ], ) +tf_xla_py_test( + name = "conv3d_test", + size = "medium", + srcs = ["conv3d_test.py"], + shard_count = 5, + deps = [ + ":xla_test", + "//tensorflow/python:array_ops", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:nn", + "//tensorflow/python:nn_ops", + "//tensorflow/python:nn_ops_gen", + "//tensorflow/python:platform_test", + ], +) + tf_xla_py_test( name = "dynamic_stitch_test", size = "small", @@ -113,6 +170,33 @@ tf_xla_py_test( ], ) +tf_xla_py_test( + name = "slice_ops_test", + size = "small", + srcs = ["slice_ops_test.py"], + deps = [ + ":xla_test", + "//tensorflow/python:array_ops", + "//tensorflow/python:data_flow_ops", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:platform_test", + ], +) + +tf_xla_py_test( + name = "ftrl_test", + size = "small", + srcs = ["ftrl_test.py"], + deps = [ + ":xla_test", + "//tensorflow/python:array_ops", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:math_ops", + "//tensorflow/python:platform_test", + "//tensorflow/python:training", + ], +) + tf_xla_py_test( name = "function_test", size = "small", @@ -139,6 +223,20 @@ tf_xla_py_test( ], ) +tf_xla_py_test( + name = "momentum_test", + size = "small", + srcs = ["momentum_test.py"], + deps = [ + ":xla_test", + "//tensorflow/python:array_ops", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:math_ops", + "//tensorflow/python:platform_test", + "//tensorflow/python:training", + ], +) + tf_xla_py_test( name = "nary_ops_test", size = "small", @@ -179,6 +277,21 @@ tf_xla_py_test( ], ) +tf_xla_py_test( + name = "pooling_ops_3d_test", + size = "medium", + srcs = ["pooling_ops_3d_test.py"], + shard_count = 10, + deps = [ + ":xla_test", + "//tensorflow/python:array_ops", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:nn_ops", + "//tensorflow/python:nn_ops_gen", + "//tensorflow/python:platform_test", + ], +) + tf_xla_py_test( name = "random_ops_test", size = "small", @@ -208,6 +321,64 @@ tf_xla_py_test( ], ) +tf_xla_py_test( + name = "reverse_ops_test", + size = "small", + srcs = ["reverse_ops_test.py"], + deps = [ + ":xla_test", + "//tensorflow/python:array_ops", + "//tensorflow/python:framework_for_generated_wrappers", + ], +) + +tf_xla_py_test( + name = "rmsprop_test", + size = "small", + srcs = ["rmsprop_test.py"], + deps = [ + ":xla_test", + "//tensorflow/python:array_ops", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:math_ops", + "//tensorflow/python:platform_test", + "//tensorflow/python:training", + ], +) + +tf_xla_py_test( + name = "spacetobatch_op_test", + size = "medium", + srcs = ["spacetobatch_op_test.py"], + shard_count = 3, + deps = [ + ":xla_test", + "//tensorflow/python:array_ops", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:math_ops", + "//tensorflow/python:platform_test", + ], +) + +tf_xla_py_test( + name = "tensor_array_ops_test", + size = "small", + srcs = ["tensor_array_ops_test.py"], + deps = [ + ":xla_test", + "//tensorflow/python:array_ops", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:math_ops", + "//tensorflow/python:math_ops_gen", + "//tensorflow/python:nn_ops", + "//tensorflow/python:nn_ops_gen", + "//tensorflow/python:platform_test", + "//tensorflow/python:tensor_array_grad", + "//tensorflow/python:tensor_array_ops", + 
"//tensorflow/python:training", + ], +) + tf_xla_py_test( name = "ternary_ops_test", size = "small", @@ -236,6 +407,23 @@ tf_xla_py_test( ], ) +tf_xla_py_test( + name = "variable_ops_test", + size = "small", + srcs = ["variable_ops_test.py"], + deps = [ + ":xla_test", + "//tensorflow/python:array_ops", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:math_ops", + "//tensorflow/python:math_ops_gen", + "//tensorflow/python:nn_ops", + "//tensorflow/python:nn_ops_gen", + "//tensorflow/python:platform_test", + "//tensorflow/python:training", + ], +) + cuda_py_test( name = "xla_device_test", size = "small", @@ -294,7 +482,6 @@ tf_cuda_cc_test( # This test is randomized, so only run it if explicitly requested. tags = [ "manual", - "noguitar", "notap", ], deps = [":randomized_tests_library"], @@ -336,8 +523,12 @@ cuda_py_test( # --dump_graph_dir, and the config file was written by hand. # # Run the following to build a minimal benchmark of the computation on Android: -# $ bazel build -c opt --config=android_arm \ -# third_party/tensorflow/compiler/tests:lstm_layer_inference_benchmark +# $ bazel build -c opt --cxxopt='-std=c++11' --linkopt='-lm' \ +# --cpu=armeabi-v7a \ +# --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \ +# --crosstool_top=//external:android/crosstool \ +# //tensorflow/compiler/tests:lstm_layer_inference_benchmark + # # Currently the resulting binary size is ~190KB tf_library( diff --git a/tensorflow/compiler/tests/adagrad_test.py b/tensorflow/compiler/tests/adagrad_test.py new file mode 100644 index 00000000000..a5c5885b428 --- /dev/null +++ b/tensorflow/compiler/tests/adagrad_test.py @@ -0,0 +1,116 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for Adagrad.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.python.framework import constant_op +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.ops import variables +from tensorflow.python.platform import test +from tensorflow.python.training import adagrad + + +class AdagradOptimizerTest(XLATestCase): + + def testBasic(self): + for dtype in self.float_types: + with self.test_session(), self.test_scope(): + var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype) + var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype) + grads0 = constant_op.constant([0.1, 0.1], dtype=dtype) + grads1 = constant_op.constant([0.01, 0.01], dtype=dtype) + ada_opt = adagrad.AdagradOptimizer(3.0, initial_accumulator_value=0.1) + ada_update = ada_opt.apply_gradients( + zip([grads0, grads1], [var0, var1])) + variables.global_variables_initializer().run() + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([3.0, 4.0], var1.eval()) + # Run 3 steps of adagrad + for _ in range(3): + ada_update.run() + # Validate updated params + self.assertAllCloseAccordingToType( + np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval()) + self.assertAllCloseAccordingToType( + np.array([2.715679168701172, 3.715679168701172]), var1.eval()) + + def testTensorLearningRate(self): + for dtype in self.float_types: + with self.test_session(), self.test_scope(): + var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype) + var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype) + grads0 = constant_op.constant([0.1, 0.1], dtype=dtype) + grads1 = constant_op.constant([0.01, 0.01], dtype=dtype) + ada_opt = adagrad.AdagradOptimizer( + constant_op.constant(3.0), initial_accumulator_value=0.1) + ada_update = ada_opt.apply_gradients( + zip([grads0, grads1], [var0, var1])) + variables.global_variables_initializer().run() + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([3.0, 4.0], var1.eval()) + # Run 3 steps of adagrad + for _ in range(3): + ada_update.run() + # Validate updated params + self.assertAllCloseAccordingToType( + np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval()) + self.assertAllCloseAccordingToType( + np.array([2.715679168701172, 3.715679168701172]), var1.eval()) + + def testSharing(self): + for dtype in self.float_types: + with self.test_session(), self.test_scope(): + var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype) + var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype) + grads0 = constant_op.constant([0.1, 0.1], dtype=dtype) + grads1 = constant_op.constant([0.01, 0.01], dtype=dtype) + ada_opt = adagrad.AdagradOptimizer(3.0) + # Apply the optimizer twice. Both applications will use + # the same accums. 
+ ada_update1 = ada_opt.apply_gradients( + zip([grads0, grads1], [var0, var1])) + ada_update2 = ada_opt.apply_gradients( + zip([grads0, grads1], [var0, var1])) + self.assertEqual(["accumulator"], ada_opt.get_slot_names()) + slot0 = ada_opt.get_slot(var0, "accumulator") + self.assertEquals(slot0.get_shape(), var0.get_shape()) + slot1 = ada_opt.get_slot(var1, "accumulator") + self.assertEquals(slot1.get_shape(), var1.get_shape()) + variables.global_variables_initializer().run() + + # Fetch params to validate initial values. + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([3.0, 4.0], var1.eval()) + # Mix the first and the second adagrad for 3 steps. + ada_update1.run() + ada_update2.run() + ada_update1.run() + # Validate updated params (the same as with only 1 Adagrad). + self.assertAllCloseAccordingToType( + np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval()) + self.assertAllCloseAccordingToType( + np.array([2.715679168701172, 3.715679168701172]), var1.eval()) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/compiler/tests/adam_test.py b/tensorflow/compiler/tests/adam_test.py new file mode 100644 index 00000000000..3215dc36e5b --- /dev/null +++ b/tensorflow/compiler/tests/adam_test.py @@ -0,0 +1,176 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for Adam.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.python.framework import constant_op +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.ops import variable_scope +from tensorflow.python.ops import variables +from tensorflow.python.platform import test +from tensorflow.python.training import adam + + +def adam_update_numpy(param, + g_t, + t, + m, + v, + alpha=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-8): + alpha_t = alpha * np.sqrt(1 - beta2**t) / (1 - beta1**t) + + m_t = beta1 * m + (1 - beta1) * g_t + v_t = beta2 * v + (1 - beta2) * g_t * g_t + + param_t = param - alpha_t * m_t / (np.sqrt(v_t) + epsilon) + return param_t, m_t, v_t + + +class AdamOptimizerTest(XLATestCase): + + def testBasic(self): + for dtype in self.float_types: + with self.test_session(), self.test_scope(): + variable_scope.get_variable_scope().set_use_resource(True) + + # Initialize variables for numpy implementation. 
+ m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 + var0_np = np.array([1.0, 2.0], dtype=dtype) + grads0_np = np.array([0.1, 0.1], dtype=dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype) + grads1_np = np.array([0.01, 0.01], dtype=dtype) + + var0 = resource_variable_ops.ResourceVariable(var0_np) + var1 = resource_variable_ops.ResourceVariable(var1_np) + grads0 = array_ops.placeholder(dtype) + grads1 = array_ops.placeholder(dtype) + opt = adam.AdamOptimizer() + update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + variables.global_variables_initializer().run() + + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([3.0, 4.0], var1.eval()) + + beta1_power, beta2_power = opt._get_beta_accumulators() + + # Run 3 steps of Adam + for t in range(1, 4): + self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval()) + self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval()) + update.run(feed_dict={grads0: grads0_np, grads1: grads1_np}) + + var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0) + var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1) + + # Validate updated params + self.assertAllCloseAccordingToType(var0_np, var0.eval()) + self.assertAllCloseAccordingToType(var1_np, var1.eval()) + + def testTensorLearningRate(self): + for dtype in self.float_types: + with self.test_session(), self.test_scope(): + variable_scope.get_variable_scope().set_use_resource(True) + + # Initialize variables for numpy implementation. + m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 + var0_np = np.array([1.0, 2.0], dtype=dtype) + grads0_np = np.array([0.1, 0.1], dtype=dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype) + grads1_np = np.array([0.01, 0.01], dtype=dtype) + + var0 = resource_variable_ops.ResourceVariable(var0_np) + var1 = resource_variable_ops.ResourceVariable(var1_np) + grads0 = array_ops.placeholder(dtype) + grads1 = array_ops.placeholder(dtype) + opt = adam.AdamOptimizer(constant_op.constant(0.001)) + update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + variables.global_variables_initializer().run() + + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([3.0, 4.0], var1.eval()) + + beta1_power, beta2_power = opt._get_beta_accumulators() + + # Run 3 steps of Adam + for t in range(1, 4): + self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval()) + self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval()) + update.run(feed_dict={grads0: grads0_np, grads1: grads1_np}) + + var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0) + var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1) + + # Validate updated params + self.assertAllCloseAccordingToType(var0_np, var0.eval()) + self.assertAllCloseAccordingToType(var1_np, var1.eval()) + + def testSharing(self): + for dtype in self.float_types: + with self.test_session(), self.test_scope(): + variable_scope.get_variable_scope().set_use_resource(True) + + # Initialize variables for numpy implementation. 
+ m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 + var0_np = np.array([1.0, 2.0], dtype=dtype) + grads0_np = np.array([0.1, 0.1], dtype=dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype) + grads1_np = np.array([0.01, 0.01], dtype=dtype) + + var0 = resource_variable_ops.ResourceVariable(var0_np) + var1 = resource_variable_ops.ResourceVariable(var1_np) + grads0 = array_ops.placeholder(dtype) + grads1 = array_ops.placeholder(dtype) + opt = adam.AdamOptimizer() + update1 = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + update2 = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + variables.global_variables_initializer().run() + + beta1_power, beta2_power = opt._get_beta_accumulators() + + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([3.0, 4.0], var1.eval()) + + # Run 3 steps of intertwined Adam1 and Adam2. + for t in range(1, 4): + self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval()) + self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval()) + if t % 2 == 0: + update1.run(feed_dict={grads0: grads0_np, grads1: grads1_np}) + else: + update2.run(feed_dict={grads0: grads0_np, grads1: grads1_np}) + + var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0) + var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1) + + # Validate updated params + self.assertAllCloseAccordingToType(var0_np, var0.eval()) + self.assertAllCloseAccordingToType(var1_np, var1.eval()) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py index 852c80db1fe..7221a0a3c74 100644 --- a/tensorflow/compiler/tests/binary_ops_test.py +++ b/tensorflow/compiler/tests/binary_ops_test.py @@ -107,6 +107,12 @@ class BinaryOpsTest(XLATestCase): np.array([5, 6, 7, 8], dtype=dtype), expected=np.array([-75, -48, -21, 0], dtype=dtype)) + self._testBinary( + gen_nn_ops._elu_grad, + np.array([1, 2, 3, 4, 5, 6], dtype=dtype), + np.array([-.6, -.4, -.2, 0, .2, .4], dtype=dtype), + expected=np.array([0.4, 1.2, 2.4, 4, 5, 6], dtype=dtype)) + self._testBinary( gen_nn_ops._relu_grad, np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=dtype), @@ -132,6 +138,20 @@ class BinaryOpsTest(XLATestCase): ], equality_test=self.ListsAreClose) + self._testBinary( + gen_nn_ops._sparse_softmax_cross_entropy_with_logits, + np.array([[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8], + [0.9, 1.0, 1.1, 1.2]], dtype=dtype), + np.array([2, 1, 7], dtype=np.int32), + expected=[ + np.array([1.342536, 1.442536, np.nan], dtype=dtype), + np.array([[0.213838, 0.236328, -0.738817, 0.288651], + [0.213838, -0.763672, 0.261183, 0.288651], + [np.nan, np.nan, np.nan, np.nan]], + dtype=dtype), + ], + equality_test=self.ListsAreClose) + def testIntOps(self): for dtype in self.int_types: self._testBinary( diff --git a/tensorflow/compiler/tests/build_defs.bzl b/tensorflow/compiler/tests/build_defs.bzl index 820db13d0b1..0bde616521a 100644 --- a/tensorflow/compiler/tests/build_defs.bzl +++ b/tensorflow/compiler/tests/build_defs.bzl @@ -1,12 +1,14 @@ """Build rules for Tensorflow/XLA testing.""" load("@local_config_cuda//cuda:build_defs.bzl", "cuda_is_configured") +load("//tensorflow/compiler/tests:plugin.bzl", "plugins") def all_backends(): + b = ["cpu"] + plugins.keys() if cuda_is_configured(): - return ["cpu", "gpu"] + return b + ["gpu"] else: - return ["cpu"] + return b def tf_xla_py_test(name, srcs=[], deps=[], tags=[], data=[], main=None, disabled_backends=None, **kwargs): @@ -53,6 
+55,10 @@ def tf_xla_py_test(name, srcs=[], deps=[], tags=[], data=[], main=None, backend_args += ["--test_device=XLA_GPU", "--types=DT_FLOAT,DT_DOUBLE,DT_INT32,DT_INT64,DT_BOOL"] backend_tags += ["requires-gpu-sm35"] + elif backend in plugins: + backend_args += ["--test_device=" + plugins[backend]["device"], + "--types=" + plugins[backend]["types"]] + backend_tags += plugins[backend]["tags"] else: fail("Unknown backend {}".format(backend)) diff --git a/tensorflow/compiler/tests/conv2d_test.py b/tensorflow/compiler/tests/conv2d_test.py index 01cfbd9f7c0..4bc118b5bdb 100644 --- a/tensorflow/compiler/tests/conv2d_test.py +++ b/tensorflow/compiler/tests/conv2d_test.py @@ -310,7 +310,7 @@ class Conv2DBackpropFilterTest(XLATestCase): data_format="NHWC") value = sess.run(tensor, {t1: x1, t2: x2}) - self.assertArrayNear(expected, np.ravel(value), 1e-5) + self.assertArrayNear(expected, np.ravel(value), 1e-3) def testConv2D1x1Filter(self): expected_output = [8056, 8432, 8312, 8704, 8568, 8976] diff --git a/tensorflow/compiler/tests/conv3d_test.py b/tensorflow/compiler/tests/conv3d_test.py new file mode 100644 index 00000000000..3bebf46511c --- /dev/null +++ b/tensorflow/compiler/tests/conv3d_test.py @@ -0,0 +1,233 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for 3D convolutions using the XLA JIT.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +from six.moves import xrange # pylint: disable=redefined-builtin + +from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import gradient_checker +from tensorflow.python.ops import nn_ops +import tensorflow.python.ops.nn_grad # pylint: disable=unused-import +from tensorflow.python.platform import googletest + + +# Test cloned from +# tensorflow/python/kernel_tests/conv3d_backprop_filter_v2_grad_test.py +class Conv3DBackpropFilterV2GradTest(XLATestCase): + + def testGradient(self): + with self.test_session(), self.test_scope(): + for padding in ["SAME", "VALID"]: + for stride in [1, 2]: + np.random.seed(1) + in_shape = [2, 4, 3, 3, 2] + in_val = constant_op.constant( + 2 * np.random.random_sample(in_shape) - 1, dtype=dtypes.float32) + filter_shape = [3, 3, 3, 2, 3] + strides = [1, stride, stride, stride, 1] + # Make a convolution op with the current settings, just to easily get + # the shape of the output. 
+ conv_out = nn_ops.conv3d(in_val, + array_ops.zeros(filter_shape), strides, + padding) + out_backprop_shape = conv_out.get_shape().as_list() + out_backprop_val = constant_op.constant( + 2 * np.random.random_sample(out_backprop_shape) - 1, + dtype=dtypes.float32) + output = nn_ops.conv3d_backprop_filter_v2(in_val, filter_shape, + out_backprop_val, strides, + padding) + err = gradient_checker.compute_gradient_error( + [in_val, out_backprop_val], [in_shape, out_backprop_shape], + output, filter_shape) + print("conv3d_backprop_filter gradient err = %g " % err) + err_tolerance = 1e-3 + self.assertLess(err, err_tolerance) + + +# Test cloned from tensorflow/python/kernel_tests/conv3d_transpose_test.py +class Conv3DTransposeTest(XLATestCase): + + def testConv3DTransposeSingleStride(self): + with self.test_session(), self.test_scope(): + strides = [1, 1, 1, 1, 1] + + # Input, output: [batch, depth, height, width, channel] + x_shape = [2, 5, 6, 4, 3] + y_shape = [2, 5, 6, 4, 2] + + # Filter: [kernel_depth, kernel_height, kernel_width, out_depth, in_depth] + f_shape = [3, 3, 3, 2, 3] + + x = constant_op.constant( + 1.0, shape=x_shape, name="x", dtype=dtypes.float32) + f = constant_op.constant( + 1.0, shape=f_shape, name="filter", dtype=dtypes.float32) + output = nn_ops.conv3d_transpose( + x, f, y_shape, strides=strides, padding="SAME") + value = output.eval() + + # We count the number of cells being added at the locations in the output. + # At the center, #cells = kernel_depth * kernel_height * kernel_width + # At the corners, #cells = ceil(kernel_depth/2) * ceil(kernel_height/2) + # * ceil(kernel_width/2) + # At the edges, #cells = + # kernel_depth * ceil(kernel_height/2) * ceil(kernel_width/2) or + # ceil(kernel_depth/2) * kernel_height * ceil(kernel_width/2) or + # ceil(kernel_depth/2) * ceil(kernel_height/2) * kernel_width + # At the borders, #cells = + # ceil(kernel_depth/2) * kernel_height * kernel_width or + # kernel_depth * ceil(kernel_height/2) * kernel_width or + # kernel_depth * kernel_height * ceil(kernel_width/2) + + for n in xrange(x_shape[0]): + for k in xrange(f_shape[3]): + for w in xrange(y_shape[3]): + for h in xrange(y_shape[2]): + for d in xrange(y_shape[1]): + d_in = d > 0 and d < y_shape[1] - 1 + h_in = h > 0 and h < y_shape[2] - 1 + w_in = w > 0 and w < y_shape[3] - 1 + if d_in + h_in + w_in == 3: + target = 27 * 3.0 + elif d_in + h_in + w_in == 2: + target = 18 * 3.0 + elif d_in or h_in or w_in: + target = 12 * 3.0 + else: + target = 8 * 3.0 + self.assertAllClose(target, value[n, d, h, w, k]) + + def testConv3DTransposeSame(self): + with self.test_session(), self.test_scope(): + strides = [1, 2, 2, 2, 1] + + # Input, output: [batch, depth, height, width, depth] + x_shape = [2, 5, 6, 4, 3] + y_shape = [2, 10, 12, 8, 2] + + # Filter: [kernel_depth, kernel_height, kernel_width, out_depth, in_depth] + f_shape = [3, 3, 3, 2, 3] + + x = constant_op.constant( + 1.0, shape=x_shape, name="x", dtype=dtypes.float32) + f = constant_op.constant( + 1.0, shape=f_shape, name="filter", dtype=dtypes.float32) + output = nn_ops.conv3d_transpose( + x, f, y_shape, strides=strides, padding="SAME") + value = output.eval() + + for n in xrange(x_shape[0]): + for k in xrange(f_shape[3]): + for w in xrange(y_shape[3]): + for h in xrange(y_shape[2]): + for d in xrange(y_shape[1]): + # We add a case for locations divisible by the stride. 
+ d_in = d % strides[1] == 0 and 0 < d < y_shape[1] - 1 + h_in = h % strides[2] == 0 and 0 < h < y_shape[2] - 1 + w_in = w % strides[3] == 0 and 0 < w < y_shape[3] - 1 + if d_in + h_in + w_in == 3: + target = 8 * 3.0 + elif d_in + h_in + w_in == 2: + target = 4 * 3.0 + elif d_in or h_in or w_in: + target = 2 * 3.0 + else: + target = 3.0 + self.assertAllClose(target, value[n, d, h, w, k]) + + def testConv3DTransposeValid(self): + with self.test_session(), self.test_scope(): + strides = [1, 2, 2, 2, 1] + + # Input, output: [batch, depth, height, width, depth] + x_shape = [2, 5, 6, 4, 3] + y_shape = [2, 11, 13, 9, 2] + + # Filter: [kernel_depth, kernel_height, kernel_width, out_depth, in_depth] + f_shape = [3, 3, 3, 2, 3] + + x = constant_op.constant( + 1.0, shape=x_shape, name="x", dtype=dtypes.float32) + f = constant_op.constant( + 1.0, shape=f_shape, name="filter", dtype=dtypes.float32) + output = nn_ops.conv3d_transpose( + x, f, y_shape, strides=strides, padding="VALID") + value = output.eval() + + cache_values = np.zeros(y_shape, dtype=np.float32) + + # The amount of padding added + pad = 1 + + for n in xrange(x_shape[0]): + for k in xrange(f_shape[3]): + for w in xrange(y_shape[3]): + for h in xrange(y_shape[2]): + for d in xrange(y_shape[1]): + # We add a case for locations divisible by the stride. + d_in = d % strides[1] == 0 and pad < d < y_shape[1] - 1 - pad + h_in = h % strides[2] == 0 and pad < h < y_shape[2] - 1 - pad + w_in = w % strides[3] == 0 and pad < w < y_shape[3] - 1 - pad + if d_in + h_in + w_in == 3: + target = 8 * 3.0 + elif d_in + h_in + w_in == 2: + target = 4 * 3.0 + elif d_in or h_in or w_in: + target = 2 * 3.0 + else: + target = 3.0 + cache_values[n, d, h, w, k] = target + + # copy values in the border + cache_values[n, :, :, 0, k] = cache_values[n, :, :, 1, k] + cache_values[n, :, :, -1, k] = cache_values[n, :, :, -2, k] + cache_values[n, :, 0, :, k] = cache_values[n, :, 1, :, k] + cache_values[n, :, -1, :, k] = cache_values[n, :, -2, :, k] + cache_values[n, 0, :, :, k] = cache_values[n, 1, :, :, k] + cache_values[n, -1, :, :, k] = cache_values[n, -2, :, :, k] + + self.assertAllClose(cache_values, value) + + def testGradient(self): + x_shape = [2, 3, 4, 3, 2] + f_shape = [3, 3, 3, 2, 2] + y_shape = [2, 6, 8, 6, 2] + strides = [1, 2, 2, 2, 1] + np.random.seed(1) # Make it reproducible. + x_val = np.random.random_sample(x_shape).astype(np.float64) + f_val = np.random.random_sample(f_shape).astype(np.float64) + with self.test_session(), self.test_scope(): + x = constant_op.constant(x_val, name="x", dtype=dtypes.float32) + f = constant_op.constant(f_val, name="f", dtype=dtypes.float32) + output = nn_ops.conv3d_transpose( + x, f, y_shape, strides=strides, padding="SAME") + err = gradient_checker.compute_gradient_error([x, f], [x_shape, f_shape], + output, y_shape) + print("conv3d_transpose gradient err = %g " % err) + err_tolerance = 0.0005 + self.assertLess(err, err_tolerance) + + +if __name__ == "__main__": + googletest.main() diff --git a/tensorflow/compiler/tests/ftrl_test.py b/tensorflow/compiler/tests/ftrl_test.py new file mode 100644 index 00000000000..6b328fb618b --- /dev/null +++ b/tensorflow/compiler/tests/ftrl_test.py @@ -0,0 +1,253 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for Ftrl optimizer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.python.framework import constant_op +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.ops import variables +from tensorflow.python.platform import test +from tensorflow.python.training import adagrad +from tensorflow.python.training import ftrl +from tensorflow.python.training import gradient_descent + + +class FtrlOptimizerTest(XLATestCase): + + def initVariableAndGradient(self, dtype): + var0 = resource_variable_ops.ResourceVariable([0.0, 0.0], dtype=dtype) + var1 = resource_variable_ops.ResourceVariable([0.0, 0.0], dtype=dtype) + grads0 = constant_op.constant([0.1, 0.2], dtype=dtype) + grads1 = constant_op.constant([0.02, 0.04], dtype=dtype) + + return var0, var1, grads0, grads1 + + def equivAdagradTest_FtrlPart(self, steps, dtype): + var0, var1, grads0, grads1 = self.initVariableAndGradient(dtype) + opt = ftrl.FtrlOptimizer( + 3.0, + learning_rate_power=-0.5, # using Adagrad learning rate + initial_accumulator_value=0.1, + l1_regularization_strength=0.0, + l2_regularization_strength=0.0) + ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + variables.global_variables_initializer().run() + # Fetch params to validate initial values + self.assertAllClose([0.0, 0.0], var0.eval()) + self.assertAllClose([0.0, 0.0], var1.eval()) + + # Run Ftrl for a few steps + for _ in range(steps): + ftrl_update.run() + + return var0.eval(), var1.eval() + + def equivAdagradTest_AdagradPart(self, steps, dtype): + var0, var1, grads0, grads1 = self.initVariableAndGradient(dtype) + opt = adagrad.AdagradOptimizer(3.0, initial_accumulator_value=0.1) + adagrad_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + variables.global_variables_initializer().run() + # Fetch params to validate initial values + self.assertAllClose([0.0, 0.0], var0.eval()) + self.assertAllClose([0.0, 0.0], var1.eval()) + + # Run Adagrad for a few steps + for _ in range(steps): + adagrad_update.run() + + return var0.eval(), var1.eval() + + def equivGradientDescentTest_FtrlPart(self, steps, dtype): + var0, var1, grads0, grads1 = self.initVariableAndGradient(dtype) + opt = ftrl.FtrlOptimizer( + 3.0, + learning_rate_power=-0.0, # using Fixed learning rate + initial_accumulator_value=0.1, + l1_regularization_strength=0.0, + l2_regularization_strength=0.0) + ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + variables.global_variables_initializer().run() + # Fetch params to validate initial values + self.assertAllClose([0.0, 0.0], var0.eval()) + self.assertAllClose([0.0, 0.0], var1.eval()) + + # Run Ftrl for a few steps + for _ in range(steps): + ftrl_update.run() + + return var0.eval(), var1.eval() + + def equivGradientDescentTest_GradientDescentPart(self, steps, dtype): + var0, var1, grads0, grads1 = 
self.initVariableAndGradient(dtype) + opt = gradient_descent.GradientDescentOptimizer(3.0, name="sgd") + sgd_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + variables.global_variables_initializer().run() + # Fetch params to validate initial values + self.assertAllClose([0.0, 0.0], var0.eval()) + self.assertAllClose([0.0, 0.0], var1.eval()) + + # Run GradientDescent for a few steps + for _ in range(steps): + sgd_update.run() + + return var0.eval(), var1.eval() + + def testFtrlwithoutRegularization(self): + for dtype in self.float_types: + with self.test_session(), self.test_scope(): + var0 = resource_variable_ops.ResourceVariable([0.0, 0.0], dtype=dtype) + var1 = resource_variable_ops.ResourceVariable([0.0, 0.0], dtype=dtype) + grads0 = constant_op.constant([0.1, 0.2], dtype=dtype) + grads1 = constant_op.constant([0.01, 0.02], dtype=dtype) + opt = ftrl.FtrlOptimizer( + 3.0, + initial_accumulator_value=0.1, + l1_regularization_strength=0.0, + l2_regularization_strength=0.0) + ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + variables.global_variables_initializer().run() + # Fetch params to validate initial values + self.assertAllClose([0.0, 0.0], var0.eval()) + self.assertAllClose([0.0, 0.0], var1.eval()) + + # Run 3 steps FTRL + for _ in range(3): + ftrl_update.run() + + # Validate updated params + self.assertAllCloseAccordingToType( + np.array([-2.60260963, -4.29698515]), var0.eval()) + self.assertAllCloseAccordingToType( + np.array([-0.28432083, -0.56694895]), var1.eval()) + + def testFtrlwithoutRegularization2(self): + for dtype in self.float_types: + with self.test_session(), self.test_scope(): + var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype) + var1 = resource_variable_ops.ResourceVariable([4.0, 3.0], dtype=dtype) + grads0 = constant_op.constant([0.1, 0.2], dtype=dtype) + grads1 = constant_op.constant([0.01, 0.02], dtype=dtype) + opt = ftrl.FtrlOptimizer( + 3.0, + initial_accumulator_value=0.1, + l1_regularization_strength=0.0, + l2_regularization_strength=0.0) + ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + variables.global_variables_initializer().run() + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([4.0, 3.0], var1.eval()) + + # Run 3 steps FTRL + for _ in range(3): + ftrl_update.run() + + # Validate updated params + self.assertAllClose( + np.array([-2.55607247, -3.98729396]), var0.eval(), 1e-5, 1e-5) + self.assertAllClose( + np.array([-0.28232238, -0.56096673]), var1.eval(), 1e-5, 1e-5) + + def testFtrlWithL1(self): + for dtype in self.float_types: + with self.test_session(), self.test_scope(): + var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype) + var1 = resource_variable_ops.ResourceVariable([4.0, 3.0], dtype=dtype) + grads0 = constant_op.constant([0.1, 0.2], dtype=dtype) + grads1 = constant_op.constant([0.01, 0.02], dtype=dtype) + opt = ftrl.FtrlOptimizer( + 3.0, + initial_accumulator_value=0.1, + l1_regularization_strength=0.001, + l2_regularization_strength=0.0) + ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + variables.global_variables_initializer().run() + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([4.0, 3.0], var1.eval()) + + # Run 10 steps FTRL + for _ in range(10): + ftrl_update.run() + + # Validate updated params + self.assertAllClose(np.array([-7.66718769, -10.91273689]), var0.eval()) + 
self.assertAllClose(np.array([-0.93460727, -1.86147261]), var1.eval()) + + def testFtrlWithL1_L2(self): + for dtype in self.float_types: + with self.test_session(), self.test_scope(): + var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype) + var1 = resource_variable_ops.ResourceVariable([4.0, 3.0], dtype=dtype) + grads0 = constant_op.constant([0.1, 0.2], dtype=dtype) + grads1 = constant_op.constant([0.01, 0.02], dtype=dtype) + opt = ftrl.FtrlOptimizer( + 3.0, + initial_accumulator_value=0.1, + l1_regularization_strength=0.001, + l2_regularization_strength=2.0) + ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + variables.global_variables_initializer().run() + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([4.0, 3.0], var1.eval()) + + # Run 10 steps FTRL + for _ in range(10): + ftrl_update.run() + + # Validate updated params + self.assertAllClose(np.array([-0.24059935, -0.46829352]), var0.eval()) + self.assertAllClose(np.array([-0.02406147, -0.04830509]), var1.eval()) + + # When variables are initialized to zero, FTRL-Proximal has two properties: + # 1. Without L1&L2 but with a fixed learning rate, FTRL-Proximal is identical + # to GradientDescent. + # 2. Without L1&L2 but with an adaptive learning rate, FTRL-Proximal is identical + # to Adagrad. + # (The helpers above select these regimes via learning_rate_power: -0.5 gives + # the Adagrad-style adaptive rate and 0.0 a fixed rate.) + # So, based on these two properties, we test whether our implementation of + # FTRL-Proximal performs the same updates as Adagrad or GradientDescent. + def testEquivAdagradwithoutRegularization(self): + steps = 5 + for dtype in self.float_types: + with self.test_session(), self.test_scope(): + val0, val1 = self.equivAdagradTest_FtrlPart(steps, dtype) + with self.test_session(), self.test_scope(): + val2, val3 = self.equivAdagradTest_AdagradPart(steps, dtype) + + self.assertAllClose(val0, val2) + self.assertAllClose(val1, val3) + + def testEquivGradientDescentwithoutRegularization(self): + steps = 5 + for dtype in self.float_types: + with self.test_session(), self.test_scope(): + val0, val1 = self.equivGradientDescentTest_FtrlPart(steps, dtype) + with self.test_session(), self.test_scope(): + val2, val3 = self.equivGradientDescentTest_GradientDescentPart( + steps, dtype) + + self.assertAllClose(val0, val2) + self.assertAllClose(val1, val3) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/compiler/tests/function_test.py b/tensorflow/compiler/tests/function_test.py index 40cc7a5d600..cbe2888696c 100644 --- a/tensorflow/compiler/tests/function_test.py +++ b/tensorflow/compiler/tests/function_test.py @@ -103,7 +103,8 @@ class FunctionTest(XLATestCase): result = sess.run(call_f) self.assertAllClose(result, expected, rtol=1e-3) - def testFunctionsNoInline(self): + # TODO(b/36139787): Re-enable this test when noinline works again. + def DISABLED_testFunctionsNoInline(self): @function.Defun(dtypes.float32, noinline=True) def TimesTwo(x): diff --git a/tensorflow/compiler/tests/jit_test.py b/tensorflow/compiler/tests/jit_test.py index 8a568d6d58d..11914080ecc 100644 --- a/tensorflow/compiler/tests/jit_test.py +++ b/tensorflow/compiler/tests/jit_test.py @@ -160,12 +160,14 @@ class JitLaunchTest(test.TestCase): # function (say, Bar) which is not inlined. When the compiler compiles # Foo, it needs to symbolic execute Bar correctly regardless whether # Bar is inlined or not. - # + + # TODO(b/36139787): Re-enable this test when noinline works again. # Tests compiled=True and noinline=True.
- self._compare( - AddOnceReturnTwice, [np.array( - [[[0.5, -1.0]]], dtype=np.float32)], - noinline=True) + # self._compare( + # AddOnceReturnTwice, [np.array( + # [[[0.5, -1.0]]], dtype=np.float32)], + # noinline=True) + + # Tests compiled=True and noinline=False. self._compare( AddOnceReturnTwice, [np.array( diff --git a/tensorflow/compiler/tests/momentum_test.py b/tensorflow/compiler/tests/momentum_test.py new file mode 100644 index 00000000000..c00e3035a09 --- /dev/null +++ b/tensorflow/compiler/tests/momentum_test.py @@ -0,0 +1,179 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for Momentum.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.ops import variables +from tensorflow.python.platform import test +from tensorflow.python.training import momentum as momentum_lib + + +class MomentumOptimizerTest(XLATestCase): + + def _update_nesterov_momentum_numpy(self, var, accum, g, lr, momentum): + var += accum * lr * momentum + accum = accum * momentum + g + var -= lr * accum + var -= accum * lr * momentum + return var, accum + + def testBasic(self): + for dtype in self.float_types: + with self.test_session(), self.test_scope(): + var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype) + var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype) + grads0 = constant_op.constant([0.1, 0.1], dtype=dtype) + grads1 = constant_op.constant([0.01, 0.01], dtype=dtype) + mom_opt = momentum_lib.MomentumOptimizer( + learning_rate=2.0, momentum=0.9) + mom_update = mom_opt.apply_gradients( + zip([grads0, grads1], [var0, var1])) + variables.global_variables_initializer().run() + # Check we have slots + self.assertEqual(["momentum"], mom_opt.get_slot_names()) + slot0 = mom_opt.get_slot(var0, "momentum") + self.assertEquals(slot0.get_shape(), var0.get_shape()) + self.assertFalse(slot0 in variables.trainable_variables()) + slot1 = mom_opt.get_slot(var1, "momentum") + self.assertEquals(slot1.get_shape(), var1.get_shape()) + self.assertFalse(slot1 in variables.trainable_variables()) + + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([3.0, 4.0], var1.eval()) + # Step 1: the momentum accumulators were 0. So we should see a normal + # update: v -= grad * learning_rate + mom_update.run() + # Check that the momentum accumulators have been updated.
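+ # (With momentum 0.9 and zero-initialized slots, the first update gives + # accum = 0.9 * 0 + grad, i.e. 0.1 for var0 and 0.01 for var1, which is + # what the checks below expect.)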
+ self.assertAllCloseAccordingToType(np.array([0.1, 0.1]), slot0.eval()) + self.assertAllCloseAccordingToType(np.array([0.01, 0.01]), slot1.eval()) + # Check that the parameters have been updated. + self.assertAllCloseAccordingToType( + np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]), var0.eval()) + self.assertAllCloseAccordingToType( + np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]), var1.eval()) + # Step 2: the momentum accumulators contain the previous update. + mom_update.run() + # Check that the momentum accumulators have been updated. + self.assertAllCloseAccordingToType( + np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]), slot0.eval()) + self.assertAllCloseAccordingToType( + np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]), slot1.eval()) + # Check that the parameters have been updated. + self.assertAllCloseAccordingToType( + np.array([ + 1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0), + 2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0) + ]), var0.eval()) + self.assertAllCloseAccordingToType( + np.array([ + 2.98 - ((0.9 * 0.01 + 0.01) * 2.0), 3.98 - ( + (0.9 * 0.01 + 0.01) * 2.0) + ]), var1.eval()) + + def testNesterovMomentum(self): + for dtype in self.float_types: + with self.test_session(), self.test_scope(): + var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype) + var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype) + var0_np = np.array([1.0, 2.0], dtype=dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype) + accum0_np = np.array([0.0, 0.0], dtype=dtype) + accum1_np = np.array([0.0, 0.0], dtype=dtype) + cost = 5 * var0 * var0 + 3 * var1 + global_step = resource_variable_ops.ResourceVariable( + array_ops.zeros([], dtypes.int32), name="global_step") + mom_op = momentum_lib.MomentumOptimizer( + learning_rate=2.0, momentum=0.9, use_nesterov=True) + opt_op = mom_op.minimize(cost, global_step, [var0, var1]) + variables.global_variables_initializer().run() + for _ in range(1, 5): + opt_op.run() + var0_np, accum0_np = self._update_nesterov_momentum_numpy( + var0_np, accum0_np, var0_np * 10, 2.0, 0.9) + var1_np, accum1_np = self._update_nesterov_momentum_numpy(var1_np, + accum1_np, + 3, 2.0, 0.9) + self.assertAllClose(var0_np, var0.eval()) + self.assertAllClose(var1_np, var1.eval()) + + def testTensorLearningRateAndMomentum(self): + for dtype in self.float_types: + with self.test_session(), self.test_scope(): + var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype) + var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype) + grads0 = constant_op.constant([0.1, 0.1], dtype=dtype) + grads1 = constant_op.constant([0.01, 0.01], dtype=dtype) + mom_opt = momentum_lib.MomentumOptimizer( + learning_rate=constant_op.constant(2.0), + momentum=constant_op.constant(0.9)) + mom_update = mom_opt.apply_gradients( + zip([grads0, grads1], [var0, var1])) + variables.global_variables_initializer().run() + # Check we have slots + self.assertEqual(["momentum"], mom_opt.get_slot_names()) + slot0 = mom_opt.get_slot(var0, "momentum") + self.assertEquals(slot0.get_shape(), var0.get_shape()) + self.assertFalse(slot0 in variables.trainable_variables()) + slot1 = mom_opt.get_slot(var1, "momentum") + self.assertEquals(slot1.get_shape(), var1.get_shape()) + self.assertFalse(slot1 in variables.trainable_variables()) + + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([3.0, 4.0], var1.eval()) + # Step 1: the momentum accumulators were 0.
So we should see a normal + # update: v -= grad * learning_rate + mom_update.run() + # Check that the momentum accumulators have been updated. + self.assertAllCloseAccordingToType(np.array([0.1, 0.1]), slot0.eval()) + self.assertAllCloseAccordingToType(np.array([0.01, 0.01]), slot1.eval()) + # Check that the parameters have been updated. + self.assertAllCloseAccordingToType( + np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]), var0.eval()) + self.assertAllCloseAccordingToType( + np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]), var1.eval()) + # Step 2: the momentum accumulators contain the previous update. + mom_update.run() + # Check that the momentum accumulators have been updated. + self.assertAllCloseAccordingToType( + np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]), slot0.eval()) + self.assertAllCloseAccordingToType( + np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]), slot1.eval()) + # Check that the parameters have been updated. + self.assertAllCloseAccordingToType( + np.array([ + 1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0), + 2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0) + ]), var0.eval()) + self.assertAllCloseAccordingToType( + np.array([ + 2.98 - ((0.9 * 0.01 + 0.01) * 2.0), 3.98 - ( + (0.9 * 0.01 + 0.01) * 2.0) + ]), var1.eval()) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/compiler/tests/nary_ops_test.py b/tensorflow/compiler/tests/nary_ops_test.py index d94e11b0789..2660e1d5728 100644 --- a/tensorflow/compiler/tests/nary_ops_test.py +++ b/tensorflow/compiler/tests/nary_ops_test.py @@ -75,6 +75,28 @@ class NAryOpsTest(XLATestCase): expected=np.array( [[1, 2, 3, 7, 8, 9], [4, 5, 6, 10, 11, 12]], dtype=np.float32)) + def testOneHot(self): + with self.test_session() as session, self.test_scope(): + indices = array_ops.constant(np.array([[2, 3], [0, 1]], dtype=np.int32)) + op = array_ops.one_hot(indices, + np.int32(4), + on_value=np.float32(7), off_value=np.float32(3)) + output = session.run(op) + expected = np.array([[[3, 3, 7, 3], [3, 3, 3, 7]], + [[7, 3, 3, 3], [3, 7, 3, 3]]], + dtype=np.float32) + self.assertAllEqual(output, expected) + + op = array_ops.one_hot(indices, + np.int32(4), + on_value=np.int32(2), off_value=np.int32(1), + axis=1) + output = session.run(op) + expected = np.array([[[1, 1], [1, 1], [2, 1], [1, 2]], + [[2, 1], [1, 2], [1, 1], [1, 1]]], + dtype=np.int32) + self.assertAllEqual(output, expected) + def testSplitV(self): with self.test_session() as session: with self.test_scope(): @@ -94,12 +116,14 @@ class NAryOpsTest(XLATestCase): np.array([1, 1], dtype=np.int32)], expected=np.array([[], []], dtype=np.float32)) - self._testNAry(lambda x: array_ops.strided_slice(*x), - [np.array([[], [], []], dtype=np.float32), - np.array([1, 0], dtype=np.int64), - np.array([3, 0], dtype=np.int64), - np.array([1, 1], dtype=np.int64)], - expected=np.array([[], []], dtype=np.float32)) + if np.int64 in self.int_types: + self._testNAry( + lambda x: array_ops.strided_slice(*x), [ + np.array([[], [], []], dtype=np.float32), np.array( + [1, 0], dtype=np.int64), np.array([3, 0], dtype=np.int64), + np.array([1, 1], dtype=np.int64) + ], + expected=np.array([[], []], dtype=np.float32)) self._testNAry(lambda x: array_ops.strided_slice(*x), [np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], diff --git a/tensorflow/compiler/tests/plugin.bzl b/tensorflow/compiler/tests/plugin.bzl new file mode 100644 index 00000000000..b6eb7a9e395 --- /dev/null +++ b/tensorflow/compiler/tests/plugin.bzl @@ -0,0 +1,23 @@ +# Copyright 2017 The TensorFlow Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Additional XLA devices to be included in the unit test suite.""" + +# If you wish to edit this file without checking it into the repo, consider: +# git update-index --assume-unchanged tensorflow/compiler/tests/plugin.bzl + +plugins = { + #"poplar": {"device":"XLA_IPU", "types":"DT_FLOAT,DT_INT32", "tags":[]}, +} + diff --git a/tensorflow/compiler/tests/pooling_ops_3d_test.py b/tensorflow/compiler/tests/pooling_ops_3d_test.py new file mode 100644 index 00000000000..eb48fe555a0 --- /dev/null +++ b/tensorflow/compiler/tests/pooling_ops_3d_test.py @@ -0,0 +1,400 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Functional tests for 3d pooling operations.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import gen_nn_ops +from tensorflow.python.ops import nn_ops +from tensorflow.python.platform import test + + +# Wrapper around AvgPoolGrad that ignores extra arguments needed by +# MaxPoolGrad. +def _AvgPoolGrad(inputs, outputs, output_gradients, ksize, strides, padding): + del outputs # Unused by average-pooling gradients. + return gen_nn_ops._avg_pool3d_grad( + inputs.get_shape().as_list(), + output_gradients, + ksize=ksize, + strides=strides, + padding=padding) + + +class Pooling3DTest(XLATestCase): + + def _VerifyValues(self, pool_func, input_sizes, window, strides, padding, + expected): + """Verifies the output values of the pooling function. + + Args: + pool_func: Function to be called: co.MaxPool, co.AvgPool. + input_sizes: Input tensor dimensions. + window: Tuple of kernel dims: planes, rows, cols. + strides: Tuple of strides for dims: planes, rows, cols. + padding: Padding type. + expected: An array containing the expected operation outputs. + """ + total_size = 1 + for s in input_sizes: + total_size *= s + # Initializes the input tensor with array containing incrementing + # numbers from 1. 
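+ # (For example, for input_sizes=[1, 3, 3, 3, 3], channel 0 of the single + # VALID 2x2x2 window holds 1, 4, 10, 13, 28, 31, 37, 40, whose average is + # 20.5; this is how the expected values in the tests below were derived.)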
+ x = np.arange(1.0, total_size + 1, dtype=np.float32) + x = x.reshape(input_sizes) + with self.test_session() as sess, self.test_scope(): + inputs = array_ops.placeholder(dtypes.float32) + t = pool_func( + inputs, + ksize=[1] + window + [1], + strides=[1] + strides + [1], + padding=padding) + vals = sess.run(t, {inputs: x}) + # Verifies values. + actual = vals.flatten() + self.assertAllClose(expected, actual) + + def testAvgPool3dValidPadding(self): + expected_output = [20.5, 21.5, 22.5] + self._VerifyValues( + nn_ops.avg_pool3d, + input_sizes=[1, 3, 3, 3, 3], + window=[2, 2, 2], + strides=[2, 2, 2], + padding="VALID", + expected=expected_output) + + def testAvgPool3dSamePadding(self): + expected_output = [20.5, 21.5, 22.5, 26.5, 27.5, 28.5] + self._VerifyValues( + nn_ops.avg_pool3d, + input_sizes=[1, 2, 2, 4, 3], + window=[2, 2, 2], + strides=[2, 2, 2], + padding="SAME", + expected=expected_output) + + def testAvgPool3dSamePaddingDifferentStrides(self): + expected_output = [1.5, 4.5, 7.5, 17.5, 20.5, 23.5, 33.5, 36.5, 39.5] + self._VerifyValues( + nn_ops.avg_pool3d, + input_sizes=[1, 5, 8, 1, 1], + window=[1, 2, 3], + strides=[2, 3, 1], + padding="SAME", + expected=expected_output) + + def testMaxPool3dValidPadding(self): + expected_output = [40.0, 41.0, 42.0] + self._VerifyValues( + nn_ops.max_pool3d, + input_sizes=[1, 3, 3, 3, 3], + window=[2, 2, 2], + strides=[2, 2, 2], + padding="VALID", + expected=expected_output) + + def testMaxPool3dSamePadding(self): + expected_output = [31., 32., 33., 34., 35., 36.] + self._VerifyValues( + nn_ops.max_pool3d, + input_sizes=[1, 2, 2, 3, 3], + window=[2, 2, 2], + strides=[2, 2, 2], + padding="SAME", + expected=expected_output) + + def testMaxPool3dSamePaddingDifferentStrides(self): + expected_output = [2., 5., 8., 18., 21., 24., 34., 37., 40.] + self._VerifyValues( + nn_ops.max_pool3d, + input_sizes=[1, 5, 8, 1, 1], + window=[1, 2, 3], + strides=[2, 3, 1], + padding="SAME", + expected=expected_output) + + # Test pooling on a larger input, with different stride and kernel + # size for the 'z' dimension. + + # Simulate max pooling in numpy to get the expected output. 
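+ # Because the values increase with index, the max of every 2x2 window is + # its highest-index element, so stride-2 SAME pooling reduces to the + # strided slice taken below; only the windows that extend into the zero + # padding need fixing up afterwards.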
+ input_data = np.arange(1, 5 * 27 * 27 * 64 + 1).reshape((5, 27, 27, 64)) + input_data = np.pad(input_data, [[0, 0], [0, 1], [0, 1], [0, 0]], + mode="constant") + expected_output = input_data[:, 1::2, 1::2, :] + expected_output[:, -1, :, :] = input_data[:, -2, 1::2, :] + expected_output[:, :, -1, :] = input_data[:, 1::2, -2, :] + expected_output[:, -1, -1, :] = input_data[:, -2, -2, :] + + self._VerifyValues( + nn_ops.max_pool3d, + input_sizes=[1, 5, 27, 27, 64], + window=[1, 2, 2], + strides=[1, 2, 2], + padding="SAME", + expected=expected_output.flatten()) + + def testKernelSmallerThanStride(self): + self._VerifyValues( + nn_ops.max_pool3d, + input_sizes=[1, 3, 3, 3, 1], + window=[1, 1, 1], + strides=[2, 2, 2], + padding="SAME", + expected=[1, 3, 7, 9, 19, 21, 25, 27]) + + self._VerifyValues( + nn_ops.max_pool3d, + input_sizes=[1, 7, 7, 7, 1], + window=[2, 2, 2], + strides=[3, 3, 3], + padding="VALID", + expected=[58, 61, 79, 82, 205, 208, 226, 229]) + + self._VerifyValues( + nn_ops.avg_pool3d, + input_sizes=[1, 3, 3, 3, 1], + window=[1, 1, 1], + strides=[2, 2, 2], + padding="SAME", + expected=[1, 3, 7, 9, 19, 21, 25, 27]) + + self._VerifyValues( + nn_ops.avg_pool3d, + input_sizes=[1, 7, 7, 7, 1], + window=[2, 2, 2], + strides=[3, 3, 3], + padding="VALID", + expected=[29.5, 32.5, 50.5, 53.5, 176.5, 179.5, 197.5, 200.5]) + + def _VerifyGradient(self, pool_func, pool_grad_func, input_sizes, ksize, + strides, padding): + """Verifies the output values of the pooling gradient function. + + Args: + pool_func: Forward pooling function. + pool_grad_func: Pooling gradient function corresponding to pool_func. + input_sizes: Input tensor dimensions. + ksize: The kernel size dimensions. + strides: The stride dimensions. + padding: Padding type. + """ + ksize = [1] + ksize + [1] + strides = [1] + strides + [1] + total_size = np.prod(input_sizes) + x = np.arange(1, total_size + 1, dtype=np.float32).reshape(input_sizes) + with self.test_session() as sess: + # Use the forward pool function to compute some corresponding outputs + # (needed for the CPU device, and we need the shape in both cases). + with ops.device("CPU"): + inputs = array_ops.placeholder(dtypes.float32, shape=input_sizes) + outputs = pool_func( + inputs, + ksize=ksize, + strides=strides, + padding=padding) + + output_vals = np.array(sess.run(outputs, {inputs: x})) + output_gradient_vals = np.arange( + 1, output_vals.size + 1, dtype=np.float32) + output_gradient_vals = output_gradient_vals.reshape(output_vals.shape) + + # Use the TensorFlow CPU pooling gradient to compute the expected input + # gradients. + with ops.device("CPU"): + output_gradients = array_ops.placeholder( + dtypes.float32, shape=output_vals.shape) + expected_input_gradients = pool_grad_func( + inputs, + outputs, + output_gradients, + ksize=ksize, + strides=strides, + padding=padding) + expected_input_gradient_vals = sess.run( + expected_input_gradients, + {inputs: x, + output_gradients: output_gradient_vals}) + + # Run the gradient op on the XLA device + with self.test_scope(): + outputs = array_ops.placeholder(dtypes.float32, shape=output_vals.shape) + actual_input_gradients = pool_grad_func( + inputs, + outputs, + output_gradients, + ksize=ksize, + strides=strides, + padding=padding) + actual = sess.run(actual_input_gradients, { + inputs: x, + outputs: output_vals, + output_gradients: output_gradient_vals + }) + + # Compare the TensorFlow and XLA results.
+ self.assertAllClose( + expected_input_gradient_vals.flatten(), + actual.flatten(), + rtol=1e-5, + atol=1e-6) + self.assertShapeEqual(actual, inputs) + + def testMaxPoolGradValidPadding1_1_3d(self): + self._VerifyGradient( + nn_ops.max_pool3d, + gen_nn_ops._max_pool3d_grad, + input_sizes=[1, 3, 3, 3, 1], + ksize=[1, 1, 1], + strides=[1, 1, 1], + padding="VALID") + + def testMaxPoolGradValidPadding2_1_6_3d(self): + self._VerifyGradient( + nn_ops.max_pool3d, + gen_nn_ops._max_pool3d_grad, + input_sizes=[2, 3, 3, 6, 3], + ksize=[2, 2, 2], + strides=[1, 1, 1], + padding="VALID") + + def testMaxPoolGradValidPadding2_1_7_3d(self): + self._VerifyGradient( + nn_ops.max_pool3d, + gen_nn_ops._max_pool3d_grad, + input_sizes=[2, 3, 5, 7, 3], + ksize=[2, 2, 2], + strides=[1, 1, 1], + padding="VALID") + + def testMaxPoolGradValidPadding2_2_3d(self): + self._VerifyGradient( + nn_ops.max_pool3d, + gen_nn_ops._max_pool3d_grad, + input_sizes=[2, 2, 2, 2, 3], + ksize=[2, 2, 2], + strides=[2, 2, 2], + padding="VALID") + + def testMaxPoolGradSamePadding1_1_3d(self): + self._VerifyGradient( + nn_ops.max_pool3d, + gen_nn_ops._max_pool3d_grad, + input_sizes=[2, 3, 2, 4, 1], + ksize=[1, 1, 1], + strides=[1, 1, 1], + padding="SAME") + + def testMaxPoolGradSamePadding2_1_3d(self): + self._VerifyGradient( + nn_ops.max_pool3d, + gen_nn_ops._max_pool3d_grad, + input_sizes=[2, 3, 2, 4, 1], + ksize=[2, 2, 2], + strides=[1, 1, 1], + padding="SAME") + + def testMaxPoolGradSamePadding2_2_3d(self): + self._VerifyGradient( + nn_ops.max_pool3d, + gen_nn_ops._max_pool3d_grad, + input_sizes=[2, 5, 2, 4, 3], + ksize=[2, 2, 2], + strides=[2, 2, 2], + padding="SAME") + + def testMaxPoolGradSamePadding3_1_3d(self): + self._VerifyGradient( + nn_ops.max_pool3d, + gen_nn_ops._max_pool3d_grad, + input_sizes=[1, 3, 3, 7, 1], + ksize=[3, 3, 3], + strides=[1, 1, 1], + padding="SAME") + + def testAvgPoolGradValidPadding1_1_3d(self): + self._VerifyGradient( + nn_ops.avg_pool3d, + _AvgPoolGrad, + input_sizes=[2, 3, 3, 3, 3], + ksize=[1, 1, 1], + strides=[1, 1, 1], + padding="VALID") + + def testAvgPoolGradValidPadding2_1_3d(self): + self._VerifyGradient( + nn_ops.avg_pool3d, + _AvgPoolGrad, + input_sizes=[2, 3, 3, 3, 3], + ksize=[2, 2, 2], + strides=[1, 1, 1], + padding="VALID") + + def testAvgPoolGradValidPadding2_2_3d(self): + self._VerifyGradient( + nn_ops.avg_pool3d, + _AvgPoolGrad, + input_sizes=[2, 2, 2, 2, 3], + ksize=[2, 2, 2], + strides=[2, 2, 2], + padding="VALID") + + def testAvgPoolGradSamePadding1_1_3d(self): + self._VerifyGradient( + nn_ops.avg_pool3d, + _AvgPoolGrad, + input_sizes=[2, 3, 2, 4, 3], + ksize=[1, 1, 1], + strides=[1, 1, 1], + padding="SAME") + + def testAvgPoolGradSamePadding2_1_3d(self): + self._VerifyGradient( + nn_ops.avg_pool3d, + _AvgPoolGrad, + input_sizes=[1, 2, 2, 2, 1], + ksize=[2, 2, 2], + strides=[1, 1, 1], + padding="SAME") + + def testAvgPoolGradSamePadding2_2_3d(self): + self._VerifyGradient( + nn_ops.avg_pool3d, + _AvgPoolGrad, + input_sizes=[2, 5, 2, 4, 3], + ksize=[2, 2, 2], + strides=[2, 2, 2], + padding="SAME") + + def testAvgPoolGradSamePadding3_1_3d(self): + self._VerifyGradient( + nn_ops.avg_pool3d, + _AvgPoolGrad, + input_sizes=[1, 3, 6, 7, 1], + ksize=[3, 3, 3], + strides=[1, 1, 1], + padding="SAME") + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/compiler/tests/random_ops_test.py b/tensorflow/compiler/tests/random_ops_test.py index b7c8b3f5980..a17a3f3d653 100644 --- a/tensorflow/compiler/tests/random_ops_test.py +++ b/tensorflow/compiler/tests/random_ops_test.py @@ 
-72,6 +72,17 @@ class RandomOpsTest(XLATestCase): self.assertTrue((y >= -2).sum() == 1000) self.assertTrue((y < 33).sum() == 1000) + def testTruncatedNormalIsInRange(self): + count = 10000 + # TODO(b/34339814): implement inverse erf support for non-F32 types. + for dtype in [dtypes.float32]: + with self.test_session() as sess: + with self.test_scope(): + x = random_ops.truncated_normal(shape=[count], dtype=dtype, seed=42) + y = sess.run(x) + self.assertTrue((y >= -2).sum() == count) + self.assertTrue((y <= 2).sum() == count) + if __name__ == '__main__': googletest.main() diff --git a/tensorflow/compiler/tests/randomized_tests.cc b/tensorflow/compiler/tests/randomized_tests.cc index abc0cb2cce7..d3821ad02e5 100644 --- a/tensorflow/compiler/tests/randomized_tests.cc +++ b/tensorflow/compiler/tests/randomized_tests.cc @@ -68,6 +68,7 @@ limitations under the License. #include "tensorflow/core/public/session_options.h" #include "tensorflow/core/util/command_line_flags.h" #include "tensorflow/core/util/device_name_utils.h" +#include "tensorflow/core/util/tensor_format.h" namespace tensorflow { namespace { @@ -75,6 +76,7 @@ namespace { // Command line flags: see main() below. int64 tf_xla_random_seed = 0; int32 tf_xla_test_repetitions = 20; +int64 tf_xla_max_tensor_size = 100000LL; string* tf_xla_test_device_ptr; // initial value set in main() bool tf_xla_test_use_jit = true; @@ -93,7 +95,12 @@ class OpTestBuilder { explicit OpTestBuilder(const string& op_name); // Adds an input 'tensor'. - OpTestBuilder& Input(Tensor tensor); + OpTestBuilder& Input(const Tensor& tensor); + + // Adds a random input tensor with 'type'. If 'dims' is not provided, + // RandomDims() is used. + OpTestBuilder& RandomInput(DataType type); + OpTestBuilder& RandomInput(DataType type, std::vector dims); // Sets an attribute. template @@ -110,25 +117,54 @@ class OpTestBuilder { // sets it to the NodeDef of the operator under test. Fills 'inputs' and // 'outputs' with the names of the input placeholder nodes and the output // identity nodes, respectively. 
- Status BuildGraph(string name_prefix, string device, bool use_jit, - GraphDef* graphdef, NodeDef** test_node_def, + Status BuildGraph(const string& name_prefix, const string& device, + bool use_jit, GraphDef* graphdef, NodeDef** test_node_def, std::vector* inputs, std::vector* outputs) const; - const std::vector& inputs() const { return inputs_; } + struct InputDescription { + Tensor tensor; + + DataType type = DT_INVALID; + bool has_dims = false; + std::vector dims; + }; + + const std::vector& inputs() const { return inputs_; } private: NodeDef node_def_; - std::vector inputs_; + std::vector inputs_; }; OpTestBuilder::OpTestBuilder(const string& op_name) { node_def_.set_op(op_name); } -OpTestBuilder& OpTestBuilder::Input(Tensor tensor) { +OpTestBuilder& OpTestBuilder::Input(const Tensor& tensor) { VLOG(1) << "Adding input: " << tensor.DebugString(); - inputs_.push_back(tensor); + InputDescription input; + input.tensor = tensor; + inputs_.push_back(input); + return *this; +} + +OpTestBuilder& OpTestBuilder::RandomInput(DataType type) { + VLOG(1) << "Adding random input: " << type; + InputDescription input; + input.type = type; + inputs_.push_back(input); + return *this; +} + +OpTestBuilder& OpTestBuilder::RandomInput(DataType type, + std::vector dims) { + VLOG(1) << "Adding input: " << type << " " << TensorShape(dims).DebugString(); + InputDescription input; + input.type = type; + input.has_dims = true; + input.dims = std::move(dims); + inputs_.push_back(input); return *this; } @@ -145,9 +181,9 @@ OpTestBuilder& OpTestBuilder::Attr(StringPiece attr_name, return *this; } -Status OpTestBuilder::BuildGraph(string name_prefix, string device, - bool use_jit, GraphDef* graphdef, - NodeDef** test_node_def, +Status OpTestBuilder::BuildGraph(const string& name_prefix, + const string& device, bool use_jit, + GraphDef* graphdef, NodeDef** test_node_def, std::vector* inputs, std::vector* outputs) const { OpRegistryInterface* op_registry = OpRegistry::Global(); @@ -206,23 +242,36 @@ class OpTest : public ::testing::Test { public: OpTest(); - // Runs 'fn' up to --tf_xla_test_repetitions times, or until a failure occurs; - // whichever happens first. - void Repeatedly(std::function fn); + enum TestResult { + // The test saw an unrecoverable error. Don't try any more runs. + kFatalError, + // The parameters of the test were invalid (e.g., the "golden" + // implementation failed, or the parameters are oversize). Reruns are ok. + kInvalid, + // The test ran successfully, and we have a verdict. Does *not* mean the + // test passed. + kOk, + }; + + // Runs 'fn' up to --tf_xla_test_repetitions times, or until a test failure + // occurs; whichever happens first. Reruns if the TestResult is kInvalid. + void Repeatedly(const std::function& fn); // Select a random element from 'candidates'. template T Choose(gtl::ArraySlice candidates); static constexpr int kDefaultMaxRank = 5; - static constexpr int64 kDefaultMaxDimensionSize = 20LL; + static constexpr int64 kDefaultMaxDimensionSize = 256LL; - // Returns a random dimension size. + // Returns true if 'dims' have a size less than tf_xla_max_tensor_size. + bool TensorSizeIsOk(gtl::ArraySlice dims); + + // Returns a random dimension size, in the range [min, max). int64 RandomDim(int64 min = 0, int64 max = kDefaultMaxDimensionSize); // Returns a random shape. The tensor has rank in the range [min_rank, - // max_rank). - // Each dimension has size [0, kDefaultMaxDimensionSize]. + // max_rank). Each dimension has size [min_size, max_size). 
std::vector RandomDims(int min_rank = 0, int max_rank = kDefaultMaxRank, int64 min_size = 0, @@ -252,17 +301,23 @@ class OpTest : public ::testing::Test { // for use as reduction indices. Tensor RandomReductionIndices(int rank); - struct WindowedDims { + struct WindowedSpatialDims { Padding padding; - int kernel_rows, kernel_cols; - int stride_rows, stride_cols; - int input_rows, input_cols; - int64 output_rows, output_cols; + std::vector kernel_dims; + std::vector stride_dims; + std::vector input_dims; + std::vector output_dims; }; - // Choose dimensions for a 2D windowed op such as pooling or convolution. - // TODO(phawkins): currently this only produces spatial windows, in NHWC - // format. - WindowedDims ChooseWindowedDims(); + // Choose spatial dimensions for a windowed op such as pooling or convolution. + WindowedSpatialDims ChooseWindowedSpatialDims(int num_spatial_dims); + + // Builds dimensions for a windowed op such as pooling or convolution, + // including a batch and feature dimension. + std::vector ImageDims(TensorFormat format, int batch, int feature, + const std::vector& spatial_dims); + + // Converts an int64 vector to an int32 vector. + std::vector AsInt32s(const std::vector& int64s); std::mt19937& generator() { return *generator_; } @@ -272,8 +327,9 @@ class OpTest : public ::testing::Test { // element-wise difference between x and y must no more than // atol + rtol * abs(x); or both elements may be NaN or infinity. For // non-floating-point tensors the element values must match exactly. - void ExpectTfAndXlaOutputsAreClose(const OpTestBuilder& builder, - double atol = 1e-2, double rtol = 1e-2); + TestResult ExpectTfAndXlaOutputsAreClose(const OpTestBuilder& builder, + double atol = 1e-2, + double rtol = 1e-2); protected: // Per-test state: @@ -309,10 +365,35 @@ OpTest::OpTest() { TF_CHECK_OK(session_->Create(def)); } -void OpTest::Repeatedly(std::function fn) { +void OpTest::Repeatedly(const std::function& fn) { int const max_repetitions = tf_xla_test_repetitions; - for (int i = 0; !HasFailure() && i < max_repetitions; ++i) { - fn(); + int valid_test_runs = 0; + // We run up to 10 * max_repetitions times; the idea is that if we roll the + // dice enough times we will find some valid parameters. We want to put an + // upper limit on the number iterations just in case the probability of + // finding feasible parameters is very low. 
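+ // With the default --tf_xla_test_repetitions=20 this means at most 200 + // attempts to collect 20 valid runs.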
+ for (int i = 0; !HasFailure() && i < max_repetitions * 10 && + valid_test_runs < max_repetitions; + ++i) { + TestResult result = fn(); + switch (result) { + case kOk: + ++valid_test_runs; + break; + + case kFatalError: + ASSERT_TRUE(false) << "Test had fatal failure"; + return; + + case kInvalid: + break; + } + } + if (!HasFailure()) { + EXPECT_GE(valid_test_runs, max_repetitions) + << "Not enough test instances passed; this means that either the " + "golden implementation is buggy or the operator harness is not " + "producing well-formed test cases with a high probability."; } } @@ -327,6 +408,14 @@ int64 OpTest::RandomDim(int64 min, int64 max) { return size_distribution(generator()); } +bool OpTest::TensorSizeIsOk(gtl::ArraySlice dims) { + int64 size = 1LL; + for (int64 dim : dims) { + size *= dim; + } + return size < tf_xla_max_tensor_size; +} + std::vector OpTest::RandomDims(int min_rank, int max_rank, int64 min_size, int64 max_size) { CHECK_LE(0, min_rank); @@ -334,9 +423,13 @@ std::vector OpTest::RandomDims(int min_rank, int max_rank, std::uniform_int_distribution rank_distribution(min_rank, max_rank); int rank = rank_distribution(generator()); std::vector dims(rank); - std::generate(dims.begin(), dims.end(), [this, min_size, max_size]() { - return RandomDim(min_size, max_size); - }); + // TODO(phawkins): too small a maximum tensor size could lead to an infinite + // loop here. + do { + std::generate(dims.begin(), dims.end(), [this, min_size, max_size]() { + return RandomDim(min_size, max_size); + }); + } while (!TensorSizeIsOk(dims)); return dims; } @@ -473,35 +566,63 @@ Tensor OpTest::RandomReductionIndices(int rank) { return test::AsTensor(indices); } -OpTest::WindowedDims OpTest::ChooseWindowedDims() { - WindowedDims d; +OpTest::WindowedSpatialDims OpTest::ChooseWindowedSpatialDims( + int num_spatial_dims) { + WindowedSpatialDims d; d.padding = Choose({SAME, VALID}); std::uniform_int_distribution random_int(1, 5); - Status s; - // Repeatedly try different filter/stride sizes until we find a valid - // combination. - do { - // CPU implementations require stride <= kernel size. - d.kernel_rows = random_int(generator()), - d.input_rows = RandomDim(d.kernel_rows); - d.stride_rows = - std::uniform_int_distribution(1, d.kernel_rows)(generator()); - int64 pad_dummy; - s = GetWindowedOutputSize(d.input_rows, d.kernel_rows, d.stride_rows, - d.padding, &d.output_rows, &pad_dummy); - } while (!s.ok()); - do { - d.kernel_cols = random_int(generator()); - d.input_cols = RandomDim(d.kernel_cols); - d.stride_cols = - std::uniform_int_distribution(1, d.kernel_cols)(generator()); - int64 pad_dummy; - s = GetWindowedOutputSize(d.input_cols, d.kernel_cols, d.stride_cols, - d.padding, &d.output_cols, &pad_dummy); - } while (!s.ok()); + d.kernel_dims.resize(num_spatial_dims); + d.input_dims.resize(num_spatial_dims); + d.output_dims.resize(num_spatial_dims); + d.stride_dims.resize(num_spatial_dims); + for (int i = 0; i < num_spatial_dims; ++i) { + Status s; + // Repeatedly try different filter/stride sizes until we find a valid + // combination. + do { + // CPU implementations require stride <= kernel size. 
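+ // Drawing the input dimension with RandomDim(d.kernel_dims[i]) keeps it + // at least as large as the kernel, so a VALID window always fits.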
+ d.kernel_dims[i] = random_int(generator()), + d.input_dims[i] = RandomDim(d.kernel_dims[i]); + d.stride_dims[i] = + std::uniform_int_distribution(1, d.kernel_dims[i])(generator()); + int64 pad_dummy; + s = GetWindowedOutputSize(d.input_dims[i], d.kernel_dims[i], + d.stride_dims[i], d.padding, &d.output_dims[i], + &pad_dummy); + } while (!s.ok()); + } return d; } +std::vector OpTest::ImageDims(TensorFormat format, int batch, + int feature, + const std::vector& spatial_dims) { + std::vector dims; + switch (format) { + case FORMAT_NHWC: + dims.push_back(batch); + for (int dim : spatial_dims) { + dims.push_back(dim); + } + dims.push_back(feature); + break; + case FORMAT_NCHW: + dims.push_back(batch); + dims.push_back(feature); + for (int dim : spatial_dims) { + dims.push_back(dim); + } + break; + case FORMAT_NCHW_VECT_C: + LOG(FATAL) << "FORMAT_NCHW_VECT_C not supported."; + } + return dims; +} + +std::vector OpTest::AsInt32s(const std::vector& int64s) { + return std::vector(int64s.begin(), int64s.end()); +} + // Functions for comparing tensors. template @@ -574,53 +695,84 @@ Status TensorsAreClose(const Tensor& a, const Tensor& b, double atol, } } -void OpTest::ExpectTfAndXlaOutputsAreClose(const OpTestBuilder& builder, - double atol, double rtol) { +OpTest::TestResult OpTest::ExpectTfAndXlaOutputsAreClose( + const OpTestBuilder& builder, double atol, double rtol) { + const std::vector& inputs = builder.inputs(); + std::vector input_tensors; + input_tensors.reserve(inputs.size()); + for (const OpTestBuilder::InputDescription& input : inputs) { + if (input.type == DT_INVALID) { + VLOG(1) << "Input: " << input.tensor.DebugString(); + input_tensors.push_back(input.tensor); + } else { + VLOG(1) << "Input: " << input.type << " " + << TensorShape(input.dims).DebugString(); + std::vector dims; + if (input.has_dims) { + dims = input.dims; + } else { + dims = RandomDims(); + } + if (!TensorSizeIsOk(dims)) { + VLOG(1) << "Ignoring oversize dims."; + return kInvalid; + } + input_tensors.push_back(RandomTensor(input.type, dims)); + } + } + string cpu_device = LocalDeviceToFullDeviceName(strings::StrCat(DEVICE_CPU, ":0")); string test_device = LocalDeviceToFullDeviceName(*tf_xla_test_device_ptr); DeviceNameUtils::ParsedName parsed_name; - ASSERT_TRUE( - DeviceNameUtils::ParseLocalName(*tf_xla_test_device_ptr, &parsed_name)); + if (!DeviceNameUtils::ParseLocalName(*tf_xla_test_device_ptr, &parsed_name)) { + LOG(ERROR) << "Could not parse device name: " << *tf_xla_test_device_ptr; + return kFatalError; + } DeviceType test_device_type(parsed_name.type); ++num_tests_; GraphDef graph; std::vector expected_inputs, test_inputs; std::vector expected_fetches, test_fetches; - TF_ASSERT_OK(builder.BuildGraph( + Status status = builder.BuildGraph( strings::StrCat("test", num_tests_, "_expected"), cpu_device, /* use_jit= */ false, &graph, /* test_node_def= */ nullptr, - &expected_inputs, &expected_fetches)); + &expected_inputs, &expected_fetches); + if (!status.ok()) { + LOG(ERROR) << "Expected graph construction failed: " << status; + return kFatalError; + } NodeDef* node_def; - TF_ASSERT_OK(builder.BuildGraph(strings::StrCat("test", num_tests_, "_test"), - test_device, tf_xla_test_use_jit, &graph, - &node_def, &test_inputs, &test_fetches)); + status = builder.BuildGraph(strings::StrCat("test", num_tests_, "_test"), + test_device, tf_xla_test_use_jit, &graph, + &node_def, &test_inputs, &test_fetches); + if (!status.ok()) { + LOG(ERROR) << "Test graph construction failed: " << status; + return kFatalError; + } // 
Check that there's a kernel corresponding to 'node_def' on the device under // test. - Status status = FindKernelDef(test_device_type, *node_def, nullptr, nullptr); + status = FindKernelDef(test_device_type, *node_def, nullptr, nullptr); if (!status.ok()) { VLOG(1) << "Skipping test because there is no corresponding registered " << "kernel on the test device: " << status; - return; + return kInvalid; } - TF_ASSERT_OK(session_->Extend(graph)); - - const std::vector& input_tensors = builder.inputs(); - if (VLOG_IS_ON(1)) { - for (const Tensor& input : input_tensors) { - VLOG(1) << "Input: " << input.DebugString(); - } + status = session_->Extend(graph); + if (!status.ok()) { + LOG(ERROR) << "Session::Extend() failed: " << status; + return kFatalError; } std::vector> expected_feeds(expected_inputs.size()); std::vector> test_feeds(test_inputs.size()); - ASSERT_EQ(input_tensors.size(), expected_inputs.size()); - ASSERT_EQ(input_tensors.size(), test_inputs.size()); + CHECK_EQ(input_tensors.size(), expected_inputs.size()); + CHECK_EQ(input_tensors.size(), test_inputs.size()); for (int i = 0; i < input_tensors.size(); ++i) { expected_feeds[i] = {expected_inputs[i], input_tensors[i]}; @@ -632,18 +784,27 @@ void OpTest::ExpectTfAndXlaOutputsAreClose(const OpTestBuilder& builder, Status s = session_->Run(expected_feeds, expected_fetches, {}, &expected_outputs); if (!s.ok()) { - VLOG(1) << "Expected graph failed with status: " << s << ". Skipping test"; - return; + VLOG(1) << "Expected graph failed with status: " << s << ". Ignoring test"; + return kInvalid; + } + for (const Tensor& expected : expected_outputs) { + VLOG(1) << "Expected: " << expected.DebugString(); } VLOG(1) << "Running test graph"; - TF_ASSERT_OK(session_->Run(test_feeds, test_fetches, {}, &test_outputs)); + status = session_->Run(test_feeds, test_fetches, {}, &test_outputs); + if (!status.ok()) { + LOG(ERROR) << "Test graph failed: " << status; + return kFatalError; + } - ASSERT_EQ(expected_outputs.size(), test_outputs.size()); + CHECK_EQ(expected_outputs.size(), test_outputs.size()); for (int j = 0; s.ok() && j < test_outputs.size(); ++j) { s = TensorsAreClose(expected_outputs[j], test_outputs[j], atol, rtol); } TF_EXPECT_OK(s); + + return kOk; } // Helper that converts 'values' to an int32 or int64 Tensor. 
@@ -663,8 +824,8 @@ Tensor AsIntTensor(DataType dtype, const std::vector& values) { TEST_F(OpTest, Abs) { Repeatedly([this]() { DataType type = Choose({DT_INT32, DT_FLOAT}); - ExpectTfAndXlaOutputsAreClose( - OpTestBuilder("Abs").Input(RandomTensor(type)).Attr("T", type)); + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("Abs").RandomInput(type).Attr("T", type)); }); } @@ -672,10 +833,10 @@ TEST_F(OpTest, Add) { Repeatedly([this]() { DataType type = Choose({DT_INT32, DT_FLOAT}); auto dims = BroadcastableDims(); - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Add") - .Input(RandomTensor(type, dims.first)) - .Input(RandomTensor(type, dims.second)) - .Attr("T", type)); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Add") + .RandomInput(type, dims.first) + .RandomInput(type, dims.second) + .Attr("T", type)); }); } @@ -690,47 +851,50 @@ TEST_F(OpTest, AddN) { builder.Attr("T", type); builder.Attr("N", n); for (int i = 0; i < n; ++i) { - builder.Input(RandomTensor(type, shape)); + builder.RandomInput(type, shape); } - ExpectTfAndXlaOutputsAreClose(builder); + return ExpectTfAndXlaOutputsAreClose(builder); }); } TEST_F(OpTest, All) { Repeatedly([this]() { - Tensor data = RandomTensor(DT_BOOL); - Tensor indices = RandomReductionIndices(data.dims()); + std::vector data_dims = RandomDims(); + Tensor indices = RandomReductionIndices(data_dims.size()); bool keep_dims = Choose({false, true}); - ExpectTfAndXlaOutputsAreClose( - OpTestBuilder("All").Input(data).Input(indices).Attr("keep_dims", - keep_dims)); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("All") + .RandomInput(DT_BOOL, data_dims) + .Input(indices) + .Attr("keep_dims", keep_dims)); }); } TEST_F(OpTest, Any) { Repeatedly([this]() { - Tensor data = RandomTensor(DT_BOOL); - Tensor indices = RandomReductionIndices(data.dims()); + std::vector data_dims = RandomDims(); + Tensor indices = RandomReductionIndices(data_dims.size()); bool keep_dims = Choose({false, true}); - ExpectTfAndXlaOutputsAreClose( - OpTestBuilder("Any").Input(data).Input(indices).Attr("keep_dims", - keep_dims)); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Any") + .RandomInput(DT_BOOL, data_dims) + .Input(indices) + .Attr("keep_dims", keep_dims)); }); } TEST_F(OpTest, AvgPool) { Repeatedly([this]() { std::uniform_int_distribution random_int(1, 5); - int kernel_rows = random_int(generator()), - kernel_cols = random_int(generator()); + std::vector dims = RandomDims(4, 4, 1); + int kernel_rows = + std::uniform_int_distribution(1, dims[1])(generator()); + int kernel_cols = + std::uniform_int_distribution(1, dims[2])(generator()); int stride_rows = random_int(generator()), stride_cols = random_int(generator()); string padding = Choose({"SAME", "VALID"}); - ExpectTfAndXlaOutputsAreClose( + return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("AvgPool") - .Input( - RandomTensor(DT_FLOAT, {RandomDim(1), RandomDim(kernel_rows), - RandomDim(kernel_cols), RandomDim(1)})) + .RandomInput(DT_FLOAT, dims) .Attr("T", DT_FLOAT) .Attr("ksize", {1, kernel_rows, kernel_cols, 1}) .Attr("strides", {1, stride_rows, stride_cols, 1}) @@ -741,21 +905,72 @@ TEST_F(OpTest, AvgPool) { // for batch pooling when supported. 
} +TEST_F(OpTest, AvgPool3D) { + Repeatedly([this]() { + std::uniform_int_distribution random_int(1, 5); + std::vector dims = RandomDims(5, 5, 1); + + std::vector input_dims, kernel_dims, stride_dims; + for (int i = 0; i < 3; ++i) { + kernel_dims.push_back( + std::uniform_int_distribution(1, dims[i])(generator())); + input_dims.push_back(dims[i]); + stride_dims.push_back(random_int(generator())); + } + int64 batch = dims[3]; + int64 feature = dims[4]; + + string padding = Choose({"SAME", "VALID"}); + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("AvgPool3D") + .RandomInput(DT_FLOAT, + ImageDims(FORMAT_NHWC, batch, feature, input_dims)) + .Attr("T", DT_FLOAT) + .Attr("ksize", ImageDims(FORMAT_NHWC, 1, 1, kernel_dims)) + .Attr("strides", ImageDims(FORMAT_NHWC, 1, 1, stride_dims)) + .Attr("padding", padding) + .Attr("data_format", "NDHWC")); + }); + // TODO(phawkins): test NCHW format (not supported by CPU) +} + TEST_F(OpTest, AvgPoolGrad) { Repeatedly([this]() { int batch = RandomDim(1), features = RandomDim(1); - WindowedDims d = ChooseWindowedDims(); - ExpectTfAndXlaOutputsAreClose( + WindowedSpatialDims d = ChooseWindowedSpatialDims(2); + std::vector input_dims = + AsInt32s(ImageDims(FORMAT_NHWC, batch, features, d.input_dims)); + std::vector output_dims = + ImageDims(FORMAT_NHWC, batch, features, d.output_dims); + return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("AvgPoolGrad") - .Input(test::AsTensor( - {batch, d.input_rows, d.input_cols, features})) - .Input(RandomTensor( - DT_FLOAT, {batch, d.output_rows, d.output_cols, features})) + .Input(test::AsTensor(input_dims)) + .RandomInput(DT_FLOAT, output_dims) .Attr("T", DT_FLOAT) - .Attr("ksize", {1, d.kernel_rows, d.kernel_cols, 1}) - .Attr("strides", {1, d.stride_rows, d.stride_cols, 1}) + .Attr("ksize", ImageDims(FORMAT_NHWC, 1, 1, d.kernel_dims)) + .Attr("strides", ImageDims(FORMAT_NHWC, 1, 1, d.stride_dims)) .Attr("padding", d.padding == SAME ? "SAME" : "VALID") - .Attr("data_format", "NHWC")); + .Attr("data_format", "NDHWC")); + }); +} + +TEST_F(OpTest, AvgPool3DGrad) { + Repeatedly([this]() { + int batch = RandomDim(1), features = RandomDim(1); + WindowedSpatialDims d = ChooseWindowedSpatialDims(3); + std::vector input_dims = + AsInt32s(ImageDims(FORMAT_NHWC, batch, features, d.input_dims)); + std::vector output_dims = + ImageDims(FORMAT_NHWC, batch, features, d.output_dims); + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("AvgPool3DGrad") + .Input(test::AsTensor(input_dims)) + .RandomInput(DT_FLOAT, output_dims) + .Attr("T", DT_FLOAT) + .Attr("ksize", ImageDims(FORMAT_NHWC, 1, 1, d.kernel_dims)) + .Attr("strides", ImageDims(FORMAT_NHWC, 1, 1, d.stride_dims)) + .Attr("padding", d.padding == SAME ? 
"SAME" : "VALID") + .Attr("data_format", "NDHWC")); }); } @@ -767,60 +982,127 @@ TEST_F(OpTest, BatchMatMul) { std::vector x_dims(output_dims), y_dims(output_dims); x_dims[ndims - 1] = inner_dim; y_dims[ndims - 2] = inner_dim; - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("BatchMatMul") - .Input(RandomTensor(DT_FLOAT, x_dims)) - .Input(RandomTensor(DT_FLOAT, y_dims)) - .Attr("T", DT_FLOAT)); - std::swap(x_dims[ndims - 1], x_dims[ndims - 2]); - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("BatchMatMul") - .Input(RandomTensor(DT_FLOAT, x_dims)) - .Input(RandomTensor(DT_FLOAT, y_dims)) - .Attr("T", DT_FLOAT) - .Attr("adj_x", true)); + std::bernoulli_distribution random_bool; + bool adj_x = random_bool(generator()); + bool adj_y = random_bool(generator()); + if (adj_x) { + std::swap(x_dims[ndims - 1], x_dims[ndims - 2]); + } + if (adj_y) { + std::swap(y_dims[ndims - 1], y_dims[ndims - 2]); + } - std::swap(y_dims[ndims - 1], y_dims[ndims - 2]); - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("BatchMatMul") - .Input(RandomTensor(DT_FLOAT, x_dims)) - .Input(RandomTensor(DT_FLOAT, y_dims)) - .Attr("T", DT_FLOAT) - .Attr("adj_x", true) - .Attr("adj_y", true)); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("BatchMatMul") + .RandomInput(DT_FLOAT, x_dims) + .RandomInput(DT_FLOAT, y_dims) + .Attr("T", DT_FLOAT) + .Attr("adj_x", adj_x) + .Attr("adj_y", adj_y)); + }); +} - std::swap(x_dims[ndims - 1], x_dims[ndims - 2]); - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("BatchMatMul") - .Input(RandomTensor(DT_FLOAT, x_dims)) - .Input(RandomTensor(DT_FLOAT, y_dims)) - .Attr("T", DT_FLOAT) - .Attr("adj_y", true)); +TEST_F(OpTest, BatchToSpace) { + Repeatedly([this]() { + const int num_block_dims = 2; + std::vector block_dims = + RandomDims(num_block_dims, num_block_dims, 0, 5); + int64 block_size = RandomDim(0, 4); + + std::vector input_dims(1 + num_block_dims + 1); + input_dims[0] = RandomDim(); + for (int i = 0; i < num_block_dims; ++i) { + input_dims[0] *= block_size; + input_dims[1 + i] = block_dims[i]; + } + input_dims[1 + num_block_dims] = RandomDim(); + + std::vector crop_vals; + std::uniform_int_distribution distribution(0, 4); + for (int i = 0; i < num_block_dims; ++i) { + // Chooses crop values; does not always choose legal values. + crop_vals.push_back(distribution(generator())); + crop_vals.push_back(distribution(generator())); + } + Tensor crops; + CHECK(crops.CopyFrom(AsIntTensor(DT_INT32, crop_vals), + TensorShape({num_block_dims, 2}))); + + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("BatchToSpace") + .RandomInput(DT_FLOAT, input_dims) + .Input(crops) + .Attr("T", DT_FLOAT) + .Attr("block_size", block_size)); + }); +} + +TEST_F(OpTest, BatchToSpaceND) { + Repeatedly([this]() { + std::vector block_dims = RandomDims(1, 3, 0, 5); + int num_block_dims = block_dims.size(); + std::vector remaining_dims = RandomDims(0, 3); + std::vector block_multipliers = + RandomDims(block_dims.size(), block_dims.size(), 0, 4); + + std::vector input_dims(1 + num_block_dims + remaining_dims.size()); + input_dims[0] = RandomDim(); + for (int i = 0; i < num_block_dims; ++i) { + input_dims[0] *= block_dims[i]; + } + std::copy(block_multipliers.begin(), block_multipliers.end(), + input_dims.begin() + 1); + std::copy(remaining_dims.begin(), remaining_dims.end(), + input_dims.begin() + 1 + num_block_dims); + + std::vector crop_vals; + std::uniform_int_distribution distribution(0, 3); + for (int i = 0; i < num_block_dims; ++i) { + // Chooses crop values; does not always choose legal values. 
+ crop_vals.push_back(distribution(generator())); + crop_vals.push_back(distribution(generator())); + } + Tensor crops; + CHECK(crops.CopyFrom(AsIntTensor(DT_INT32, crop_vals), + TensorShape({num_block_dims, 2}))); + + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("BatchToSpaceND") + .RandomInput(DT_FLOAT, input_dims) + .Input(test::AsTensor( + std::vector(block_dims.begin(), block_dims.end()))) + .Input(crops) + .Attr("T", DT_FLOAT)); }); } TEST_F(OpTest, BiasAdd) { Repeatedly([this]() { - auto x = RandomTensor(DT_FLOAT, RandomDims(2, kDefaultMaxRank)); - auto y = RandomTensor(DT_FLOAT, {x.dim_size(x.dims() - 1)}); + auto x_dims = RandomDims(2, kDefaultMaxRank); + auto y_dims = {x_dims[x_dims.size() - 1]}; // TODO(phawkins): test both data formats. - ExpectTfAndXlaOutputsAreClose( - OpTestBuilder("BiasAdd").Input(x).Input(y).Attr("T", DT_FLOAT)); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("BiasAdd") + .RandomInput(DT_FLOAT, x_dims) + .RandomInput(DT_FLOAT, y_dims) + .Attr("T", DT_FLOAT)); }); } TEST_F(OpTest, BiasAddGrad) { Repeatedly([this]() { - auto x = RandomTensor(DT_FLOAT); // TODO(phawkins): test both data formats. - ExpectTfAndXlaOutputsAreClose( - OpTestBuilder("BiasAddGrad").Input(x).Attr("T", DT_FLOAT)); + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("BiasAddGrad").RandomInput(DT_FLOAT).Attr("T", DT_FLOAT)); }); } TEST_F(OpTest, BiasAddV1) { Repeatedly([this]() { - auto x = RandomTensor(DT_FLOAT, RandomDims(2, kDefaultMaxRank)); - auto y = RandomTensor(DT_FLOAT, {x.dim_size(x.dims() - 1)}); - ExpectTfAndXlaOutputsAreClose( - OpTestBuilder("BiasAddV1").Input(x).Input(y).Attr("T", DT_FLOAT)); + auto x_dims = RandomDims(2, kDefaultMaxRank); + auto y_dims = {x_dims[x_dims.size() - 1]}; + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("BiasAddV1") + .RandomInput(DT_FLOAT, x_dims) + .RandomInput(DT_FLOAT, y_dims) + .Attr("T", DT_FLOAT)); }); } @@ -830,10 +1112,11 @@ TEST_F(OpTest, BroadcastGradientArgs) { // DataType type = Choose({DT_INT32, DT_INT64}); DataType type = DT_INT32; auto dims = BroadcastableDims(); - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("BroadcastGradientArgs") - .Input(AsIntTensor(type, dims.first)) - .Input(AsIntTensor(type, dims.second)) - .Attr("T", type)); + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("BroadcastGradientArgs") + .Input(AsIntTensor(type, dims.first)) + .Input(AsIntTensor(type, dims.second)) + .Attr("T", type)); }); } @@ -842,18 +1125,17 @@ TEST_F(OpTest, Cast) { DataType src_type, dst_type; src_type = Choose({DT_INT32, DT_FLOAT, DT_BOOL}); dst_type = Choose({DT_INT32, DT_FLOAT, DT_BOOL}); - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Cast") - .Input(RandomTensor(src_type)) - .Attr("SrcT", src_type) - .Attr("DstT", dst_type)); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Cast") + .RandomInput(src_type) + .Attr("SrcT", src_type) + .Attr("DstT", dst_type)); }); } TEST_F(OpTest, Ceil) { Repeatedly([this]() { - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Ceil") - .Input(RandomTensor(DT_FLOAT)) - .Attr("T", DT_FLOAT)); + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("Ceil").RandomInput(DT_FLOAT).Attr("T", DT_FLOAT)); }); } @@ -873,9 +1155,9 @@ TEST_F(OpTest, Concat) { for (int i = 0; i < n; ++i) { std::vector shape = dims; shape[concat_dim] = RandomDim(); - builder.Input(RandomTensor(type, shape)); + builder.RandomInput(type, shape); } - ExpectTfAndXlaOutputsAreClose(builder); + return ExpectTfAndXlaOutputsAreClose(builder); }); } @@ -895,27 +1177,30 @@ TEST_F(OpTest, ConcatOffset) { 
shape[concat_dim] = RandomDim(); builder.Input(test::AsTensor(shape)); } - ExpectTfAndXlaOutputsAreClose(builder); + return ExpectTfAndXlaOutputsAreClose(builder); }); } TEST_F(OpTest, Conv2D) { Repeatedly([this]() { - WindowedDims d = ChooseWindowedDims(); + WindowedSpatialDims d = ChooseWindowedSpatialDims(2); std::uniform_int_distribution random_int(1, 5); int features_in = random_int(generator()); int features_out = random_int(generator()); - Tensor data = RandomTensor( - DT_FLOAT, {RandomDim(), d.input_rows, d.input_cols, features_in}); - Tensor kernel = RandomTensor( - DT_FLOAT, {d.kernel_rows, d.kernel_cols, features_in, features_out}); - ExpectTfAndXlaOutputsAreClose( + int64 batch = RandomDim(); + + std::vector data_dims = + ImageDims(FORMAT_NHWC, batch, features_in, d.input_dims); + + std::vector kernel_dims = {d.kernel_dims[0], d.kernel_dims[1], + features_in, features_out}; + return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("Conv2D") - .Input(data) - .Input(kernel) + .RandomInput(DT_FLOAT, data_dims) + .RandomInput(DT_FLOAT, kernel_dims) .Attr("T", DT_FLOAT) - .Attr("strides", {1, d.stride_rows, d.stride_cols, 1}) + .Attr("strides", ImageDims(FORMAT_NHWC, 1, 1, d.stride_dims)) .Attr("padding", d.padding == SAME ? "SAME" : "VALID") .Attr("data_format", "NHWC")); }); @@ -923,24 +1208,24 @@ TEST_F(OpTest, Conv2D) { TEST_F(OpTest, Conv2DBackpropFilter) { Repeatedly([this]() { - WindowedDims d = ChooseWindowedDims(); + WindowedSpatialDims d = ChooseWindowedSpatialDims(2); std::uniform_int_distribution random_int(1, 5); int features_in = random_int(generator()); int features_out = random_int(generator()); int32 batch = RandomDim(); - Tensor activations = RandomTensor( - DT_FLOAT, {batch, d.input_rows, d.input_cols, features_in}); - Tensor backprop = RandomTensor( - DT_FLOAT, {batch, d.output_rows, d.output_cols, features_out}); - Tensor kernel_shape = test::AsTensor( - {d.kernel_rows, d.kernel_cols, features_in, features_out}); - ExpectTfAndXlaOutputsAreClose( + std::vector activations = + ImageDims(FORMAT_NHWC, batch, features_in, d.input_dims); + std::vector backprop = + ImageDims(FORMAT_NHWC, batch, features_out, d.output_dims); + Tensor kernel_shape = test::AsTensor(AsInt32s( + {d.kernel_dims[0], d.kernel_dims[1], features_in, features_out})); + return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("Conv2DBackpropFilter") - .Input(activations) + .RandomInput(DT_FLOAT, activations) .Input(kernel_shape) - .Input(backprop) + .RandomInput(DT_FLOAT, backprop) .Attr("T", DT_FLOAT) - .Attr("strides", {1, d.stride_rows, d.stride_cols, 1}) + .Attr("strides", ImageDims(FORMAT_NHWC, 1, 1, d.stride_dims)) .Attr("padding", d.padding == SAME ? 
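// Illustrative sketch (standard TF windowing rules restated; the helper is
// hypothetical): ChooseWindowedSpatialDims presumably keeps input, kernel,
// stride and output dims consistent with these formulas (dilation 1):
//   SAME:  out = ceil(in / stride)
//   VALID: out = ceil((in - kernel + 1) / stride), requiring in >= kernel.
long long WindowedOutputSize(long long in, long long kernel, long long stride,
                             bool same_padding) {
  const long long span = same_padding ? in : in - kernel + 1;
  return (span + stride - 1) / stride;  // integer ceiling division
}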
"SAME" : "VALID") .Attr("data_format", "NHWC")); }); @@ -948,35 +1233,111 @@ TEST_F(OpTest, Conv2DBackpropFilter) { TEST_F(OpTest, Conv2DBackpropInput) { Repeatedly([this]() { - WindowedDims d = ChooseWindowedDims(); + WindowedSpatialDims d = ChooseWindowedSpatialDims(2); std::uniform_int_distribution random_int(1, 5); int features_in = random_int(generator()); int features_out = random_int(generator()); int32 batch = RandomDim(); - Tensor in_shape = - test::AsTensor({batch, d.input_rows, d.input_cols, features_in}); - Tensor backprop = RandomTensor( - DT_FLOAT, {batch, d.output_rows, d.output_cols, features_out}); - Tensor kernel = RandomTensor( - DT_FLOAT, {d.kernel_rows, d.kernel_cols, features_in, features_out}); - ExpectTfAndXlaOutputsAreClose( + Tensor in_shape = test::AsTensor( + AsInt32s(ImageDims(FORMAT_NHWC, batch, features_in, d.input_dims))); + std::vector backprop = + ImageDims(FORMAT_NHWC, batch, features_out, d.output_dims); + std::vector kernel = {d.kernel_dims[0], d.kernel_dims[1], + features_in, features_out}; + return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("Conv2DBackpropInput") .Input(in_shape) - .Input(kernel) - .Input(backprop) + .RandomInput(DT_FLOAT, kernel) + .RandomInput(DT_FLOAT, backprop) .Attr("T", DT_FLOAT) - .Attr("strides", {1, d.stride_rows, d.stride_cols, 1}) + .Attr("strides", ImageDims(FORMAT_NHWC, 1, 1, d.stride_dims)) .Attr("padding", d.padding == SAME ? "SAME" : "VALID") .Attr("data_format", "NHWC")); }); } +TEST_F(OpTest, Conv3D) { + Repeatedly([this]() { + WindowedSpatialDims d = ChooseWindowedSpatialDims(3); + std::uniform_int_distribution random_int(1, 5); + int features_in = random_int(generator()); + int features_out = random_int(generator()); + std::vector data = {RandomDim(), d.input_dims[0], d.input_dims[1], + d.input_dims[2], features_in}; + + std::vector kernel = {d.kernel_dims[0], d.kernel_dims[1], + d.kernel_dims[2], features_in, features_out}; + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("Conv3D") + .RandomInput(DT_FLOAT, data) + .RandomInput(DT_FLOAT, kernel) + .Attr("T", DT_FLOAT) + .Attr("strides", ImageDims(FORMAT_NHWC, 1, 1, d.stride_dims)) + .Attr("padding", d.padding == SAME ? "SAME" : "VALID")); + }); +} + +TEST_F(OpTest, Conv3DBackpropFilter) { + Repeatedly([this]() { + WindowedSpatialDims d = ChooseWindowedSpatialDims(3); + std::uniform_int_distribution random_int(1, 5); + int features_in = random_int(generator()); + int features_out = random_int(generator()); + int32 batch = RandomDim(1); + std::vector activations = + ImageDims(FORMAT_NHWC, batch, features_in, d.input_dims); + std::vector backprop = + ImageDims(FORMAT_NHWC, batch, features_out, d.output_dims); + Tensor kernel_shape = test::AsTensor( + AsInt32s({d.kernel_dims[0], d.kernel_dims[1], d.kernel_dims[2], + features_in, features_out})); + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("Conv3DBackpropFilterV2") + .RandomInput(DT_FLOAT, activations) + .Input(kernel_shape) + .RandomInput(DT_FLOAT, backprop) + .Attr("T", DT_FLOAT) + .Attr("strides", ImageDims(FORMAT_NHWC, 1, 1, d.stride_dims)) + .Attr("padding", d.padding == SAME ? 
"SAME" : "VALID")); + }); +} + +TEST_F(OpTest, Conv3DBackpropInput) { + Repeatedly([this]() { + WindowedSpatialDims d = ChooseWindowedSpatialDims(3); + std::uniform_int_distribution random_int(1, 5); + int features_in = random_int(generator()); + int features_out = random_int(generator()); + int32 batch = RandomDim(1); + Tensor in_shape = test::AsTensor( + AsInt32s(ImageDims(FORMAT_NHWC, batch, features_in, d.input_dims))); + std::vector backprop = + ImageDims(FORMAT_NHWC, batch, features_out, d.output_dims); + std::vector kernel = {d.kernel_dims[0], d.kernel_dims[1], + d.kernel_dims[2], features_in, features_out}; + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("Conv3DBackpropInputV2") + .Input(in_shape) + .RandomInput(DT_FLOAT, kernel) + .RandomInput(DT_FLOAT, backprop) + .Attr("T", DT_FLOAT) + .Attr("strides", ImageDims(FORMAT_NHWC, 1, 1, d.stride_dims)) + .Attr("padding", d.padding == SAME ? "SAME" : "VALID")); + }); +} + TEST_F(OpTest, Diag) { Repeatedly([this]() { DataType type = Choose({DT_INT32, DT_FLOAT}); - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Diag") - .Input(RandomTensor(type, RandomDims(1))) - .Attr("T", type)); + std::vector dims; + // Diag causes a quadratic blowup in output size. + int64 size; + do { + dims = RandomDims(1); + size = TensorShape(dims).num_elements(); + } while (size * size < tf_xla_max_tensor_size); + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("Diag").RandomInput(type, dims).Attr("T", type)); }); } @@ -988,9 +1349,9 @@ TEST_F(OpTest, DiagPart) { std::vector doubled_dims(dims.size() * 2); std::copy(dims.begin(), dims.end(), doubled_dims.begin()); std::copy(dims.begin(), dims.end(), doubled_dims.begin() + dims.size()); - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("DiagPart") - .Input(RandomTensor(type, doubled_dims)) - .Attr("T", type)); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("DiagPart") + .RandomInput(type, doubled_dims) + .Attr("T", type)); }); } @@ -998,10 +1359,10 @@ TEST_F(OpTest, Div) { Repeatedly([this]() { DataType type = Choose({DT_INT32, DT_FLOAT}); auto dims = BroadcastableDims(); - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Div") - .Input(RandomTensor(type, dims.first)) - .Input(RandomTensor(type, dims.second)) - .Attr("T", type)); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Div") + .RandomInput(type, dims.first) + .RandomInput(type, dims.second) + .Attr("T", type)); }); } @@ -1050,10 +1411,26 @@ TEST_F(OpTest, DynamicStitch) { std::vector dims(index_dims[i].begin(), index_dims[i].end()); std::copy(constant_dims.begin(), constant_dims.end(), std::back_inserter(dims)); - Tensor t = RandomTensor(type, dims); - builder.Input(t); + builder.RandomInput(type, dims); } - ExpectTfAndXlaOutputsAreClose(builder); + return ExpectTfAndXlaOutputsAreClose(builder); + }); +} + +TEST_F(OpTest, Elu) { + Repeatedly([this]() { + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("Elu").RandomInput(DT_FLOAT).Attr("T", DT_FLOAT)); + }); +} + +TEST_F(OpTest, EluGrad) { + Repeatedly([this]() { + auto dims = RandomDims(); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("EluGrad") + .RandomInput(DT_FLOAT, dims) + .RandomInput(DT_FLOAT, dims) + .Attr("T", DT_FLOAT)); }); } @@ -1061,50 +1438,51 @@ TEST_F(OpTest, Equal) { Repeatedly([this]() { DataType type = Choose({DT_INT32, DT_FLOAT}); auto dims = BroadcastableDims(); - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Equal") - .Input(RandomTensor(type, dims.first)) - .Input(RandomTensor(type, dims.second)) - .Attr("T", type)); + return 
ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Equal") + .RandomInput(type, dims.first) + .RandomInput(type, dims.second) + .Attr("T", type)); }); } TEST_F(OpTest, Exp) { Repeatedly([this]() { - ExpectTfAndXlaOutputsAreClose( - OpTestBuilder("Exp").Input(RandomTensor(DT_FLOAT)).Attr("T", DT_FLOAT)); + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("Exp").RandomInput(DT_FLOAT).Attr("T", DT_FLOAT)); }); } TEST_F(OpTest, ExpandDims) { Repeatedly([this]() { DataType type = Choose(kAllXlaTypes); - Tensor in = RandomTensor(type); + std::vector in_dims = RandomDims(); Tensor dim(DT_INT32, TensorShape()); - std::uniform_int_distribution d(-1 - in.dims(), in.dims()); + std::uniform_int_distribution d(-1 - in_dims.size(), in_dims.size()); dim.scalar()() = d(generator()); - ExpectTfAndXlaOutputsAreClose( - OpTestBuilder("ExpandDims").Input(in).Input(dim).Attr("T", type)); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("ExpandDims") + .RandomInput(type, in_dims) + .Input(dim) + .Attr("T", type)); }); } TEST_F(OpTest, Fill) { Repeatedly([this]() { DataType type = Choose(kAllXlaTypes); - Tensor scalar = RandomTensor(type, {}); std::vector dims = RandomDims(); std::vector shape(dims.begin(), dims.end()); - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Fill") - .Input(test::AsTensor(shape)) - .Input(scalar) - .Attr("T", type)); + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("Fill") + .Input(test::AsTensor(shape)) + .RandomInput(type, {}) + .Attr("T", type)); }); } TEST_F(OpTest, Floor) { Repeatedly([this]() { - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Floor") - .Input(RandomTensor(DT_FLOAT)) - .Attr("T", DT_FLOAT)); + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("Floor").RandomInput(DT_FLOAT).Attr("T", DT_FLOAT)); }); } @@ -1112,10 +1490,10 @@ TEST_F(OpTest, FloorDiv) { Repeatedly([this]() { DataType type = DT_INT32; auto dims = BroadcastableDims(); - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("FloorDiv") - .Input(RandomTensor(type, dims.first)) - .Input(RandomTensor(type, dims.second)) - .Attr("T", type)); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("FloorDiv") + .RandomInput(type, dims.first) + .RandomInput(type, dims.second) + .Attr("T", type)); }); } @@ -1123,10 +1501,10 @@ TEST_F(OpTest, FloorMod) { Repeatedly([this]() { DataType type = Choose({DT_INT32, DT_FLOAT}); auto dims = BroadcastableDims(); - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("FloorMod") - .Input(RandomTensor(type, dims.first)) - .Input(RandomTensor(type, dims.second)) - .Attr("T", type)); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("FloorMod") + .RandomInput(type, dims.first) + .RandomInput(type, dims.second) + .Attr("T", type)); }); } @@ -1134,10 +1512,10 @@ TEST_F(OpTest, Greater) { Repeatedly([this]() { DataType type = Choose({DT_INT32, DT_FLOAT}); auto dims = BroadcastableDims(); - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Greater") - .Input(RandomTensor(type, dims.first)) - .Input(RandomTensor(type, dims.second)) - .Attr("T", type)); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Greater") + .RandomInput(type, dims.first) + .RandomInput(type, dims.second) + .Attr("T", type)); }); } @@ -1145,18 +1523,10 @@ TEST_F(OpTest, GreaterEqual) { Repeatedly([this]() { DataType type = Choose({DT_INT32, DT_FLOAT}); auto dims = BroadcastableDims(); - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("GreaterEqual") - .Input(RandomTensor(type, dims.first)) - .Input(RandomTensor(type, dims.second)) - .Attr("T", type)); - }); -} - -TEST_F(OpTest, Reciprocal) { - Repeatedly([this]() { - 
ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Reciprocal") - .Input(RandomTensor(DT_FLOAT)) - .Attr("T", DT_FLOAT)); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("GreaterEqual") + .RandomInput(type, dims.first) + .RandomInput(type, dims.second) + .Attr("T", type)); }); } @@ -1164,9 +1534,9 @@ TEST_F(OpTest, L2Loss) { Repeatedly([this]() { DataType type = Choose({DT_INT32, DT_FLOAT}); // TODO(b/31644876): scalars currently crash. - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("L2Loss") - .Input(RandomTensor(type, RandomDims(1))) - .Attr("T", type)); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("L2Loss") + .RandomInput(type, RandomDims(1)) + .Attr("T", type)); }); } @@ -1174,10 +1544,10 @@ TEST_F(OpTest, Less) { Repeatedly([this]() { DataType type = Choose({DT_INT32, DT_FLOAT}); auto dims = BroadcastableDims(); - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Less") - .Input(RandomTensor(type, dims.first)) - .Input(RandomTensor(type, dims.second)) - .Attr("T", type)); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Less") + .RandomInput(type, dims.first) + .RandomInput(type, dims.second) + .Attr("T", type)); }); } @@ -1185,10 +1555,10 @@ TEST_F(OpTest, LessEqual) { Repeatedly([this]() { DataType type = Choose({DT_INT32, DT_FLOAT}); auto dims = BroadcastableDims(); - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("LessEqual") - .Input(RandomTensor(type, dims.first)) - .Input(RandomTensor(type, dims.second)) - .Attr("T", type)); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("LessEqual") + .RandomInput(type, dims.first) + .RandomInput(type, dims.second) + .Attr("T", type)); }); } @@ -1200,10 +1570,10 @@ TEST_F(OpTest, LinSpace) { }; std::uniform_int_distribution distribution(-50, 50); DataType type = Choose({DT_INT32, DT_INT64}); - ExpectTfAndXlaOutputsAreClose( + return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("LinSpace") - .Input(RandomTensor(DT_FLOAT, {})) - .Input(RandomTensor(DT_FLOAT, {})) + .RandomInput(DT_FLOAT, {}) + .RandomInput(DT_FLOAT, {}) .Input(ToScalar(type, distribution(generator()))) .Attr("T", DT_FLOAT) .Attr("Tidx", type)); @@ -1212,62 +1582,62 @@ TEST_F(OpTest, LinSpace) { TEST_F(OpTest, Log) { Repeatedly([this]() { - ExpectTfAndXlaOutputsAreClose( - OpTestBuilder("Log").Input(RandomTensor(DT_FLOAT)).Attr("T", DT_FLOAT)); + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("Log").RandomInput(DT_FLOAT).Attr("T", DT_FLOAT)); }); } TEST_F(OpTest, LogicalAnd) { Repeatedly([this]() { auto dims = BroadcastableDims(); - ExpectTfAndXlaOutputsAreClose( + return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("LogicalAnd") - .Input(RandomTensor(DT_BOOL, dims.first)) - .Input(RandomTensor(DT_BOOL, dims.second))); + .RandomInput(DT_BOOL, dims.first) + .RandomInput(DT_BOOL, dims.second)); }); } TEST_F(OpTest, LogicalNot) { Repeatedly([this]() { - ExpectTfAndXlaOutputsAreClose( - OpTestBuilder("LogicalNot").Input(RandomTensor(DT_BOOL))); + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("LogicalNot").RandomInput(DT_BOOL)); }); } TEST_F(OpTest, LogicalOr) { Repeatedly([this]() { auto dims = BroadcastableDims(); - ExpectTfAndXlaOutputsAreClose( + return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("LogicalOr") - .Input(RandomTensor(DT_BOOL, dims.first)) - .Input(RandomTensor(DT_BOOL, dims.second))); + .RandomInput(DT_BOOL, dims.first) + .RandomInput(DT_BOOL, dims.second)); }); } TEST_F(OpTest, LogSoftmax) { Repeatedly([this]() { - ExpectTfAndXlaOutputsAreClose( + return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("LogSoftmax") - 
.Input(RandomTensor(DT_FLOAT, RandomDims(2, 2))) + .RandomInput(DT_FLOAT, RandomDims(2, 2)) .Attr("T", DT_FLOAT)); }); } TEST_F(OpTest, LRN) { Repeatedly([this]() { - Tensor data; // TODO(b/31362467): Crashes with 0 dims on GPU. Re-enable when fixed. - data = RandomTensor(DT_FLOAT, RandomDims(4, 4, 1, 8)); + std::vector data_dims = RandomDims(4, 4, 1, 8); // CuDNN requires depth_radius > 0. - std::uniform_int_distribution radius(1, data.dim_size(3)); + std::uniform_int_distribution radius(1, data_dims[3]); std::uniform_real_distribution coeff(0.01, 2.0); - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("LRN") - .Input(data) - .Attr("T", DT_FLOAT) - .Attr("depth_radius", radius(generator())) - .Attr("bias", coeff(generator())) - .Attr("alpha", coeff(generator())) - .Attr("beta", coeff(generator()))); + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("LRN") + .RandomInput(DT_FLOAT, data_dims) + .Attr("T", DT_FLOAT) + .Attr("depth_radius", radius(generator())) + .Attr("bias", coeff(generator())) + .Attr("alpha", coeff(generator())) + .Attr("beta", coeff(generator()))); }); } @@ -1275,21 +1645,19 @@ TEST_F(OpTest, LRNGrad) { Repeatedly([this]() { // TODO(b/31362467): Crashes with 0 dims on GPU. Re-enable when fixed. std::vector dims = RandomDims(4, 4, 1, 8); - Tensor input_grads = RandomTensor(DT_FLOAT, dims); - Tensor input_image = RandomTensor(DT_FLOAT, dims); - Tensor output_image = RandomTensor(DT_FLOAT, dims); // CuDNN requires depth_radius > 0. - std::uniform_int_distribution radius(1, input_grads.dim_size(3)); + std::uniform_int_distribution radius(1, dims[3]); std::uniform_real_distribution coeff(0.0, 2.0); - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("LRNGrad") - .Input(input_grads) - .Input(input_image) - .Input(output_image) - .Attr("T", DT_FLOAT) - .Attr("depth_radius", radius(generator())) - .Attr("bias", coeff(generator())) - .Attr("alpha", coeff(generator())) - .Attr("beta", coeff(generator()))); + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("LRNGrad") + .RandomInput(DT_FLOAT, dims) + .RandomInput(DT_FLOAT, dims) + .RandomInput(DT_FLOAT, dims) + .Attr("T", DT_FLOAT) + .Attr("depth_radius", radius(generator())) + .Attr("bias", coeff(generator())) + .Attr("alpha", coeff(generator())) + .Attr("beta", coeff(generator()))); }); } @@ -1299,59 +1667,57 @@ TEST_F(OpTest, MatMul) { int64 y = RandomDim(); int64 z = RandomDim(); - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("MatMul") - .Input(RandomTensor(DT_FLOAT, {x, y})) - .Input(RandomTensor(DT_FLOAT, {y, z})) - .Attr("T", DT_FLOAT)); + std::vector a_dims = {x, y}; + std::vector b_dims = {y, z}; - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("MatMul") - .Input(RandomTensor(DT_FLOAT, {y, x})) - .Input(RandomTensor(DT_FLOAT, {y, z})) - .Attr("T", DT_FLOAT) - .Attr("transpose_a", true)); + std::bernoulli_distribution random_bool; + bool transpose_a = random_bool(generator()); + bool transpose_b = random_bool(generator()); + if (transpose_a) { + std::swap(a_dims[0], a_dims[1]); + } + if (transpose_b) { + std::swap(b_dims[0], b_dims[1]); + } - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("MatMul") - .Input(RandomTensor(DT_FLOAT, {x, y})) - .Input(RandomTensor(DT_FLOAT, {z, y})) - .Attr("T", DT_FLOAT) - .Attr("transpose_b", true)); - - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("MatMul") - .Input(RandomTensor(DT_FLOAT, {y, x})) - .Input(RandomTensor(DT_FLOAT, {z, y})) - .Attr("T", DT_FLOAT) - .Attr("transpose_a", true) - .Attr("transpose_b", true)); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("MatMul") + 
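// Illustrative sketch (standard op semantics restated): the LRN attributes
// exercised above plug into the definition, applied at each (b, h, w):
//   sqr_sum[d] = sum of in[j]^2 for j in [d - depth_radius, d + depth_radius]
//   out[d]     = in[d] / (bias + alpha * sqr_sum[d])^beta
// which is why depth_radius is capped by the depth dimension. Hypothetical
// reference over one depth vector:
#include <algorithm>
#include <cmath>
#include <vector>
std::vector<float> LrnRef(const std::vector<float>& in, int radius, float bias,
                          float alpha, float beta) {
  const int n = static_cast<int>(in.size());
  std::vector<float> out(n);
  for (int d = 0; d < n; ++d) {
    float sqr_sum = 0.f;
    for (int j = std::max(0, d - radius); j <= std::min(n - 1, d + radius); ++j)
      sqr_sum += in[j] * in[j];
    out[d] = in[d] / std::pow(bias + alpha * sqr_sum, beta);
  }
  return out;
}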
.RandomInput(DT_FLOAT, a_dims) + .RandomInput(DT_FLOAT, b_dims) + .Attr("T", DT_FLOAT) + .Attr("transpose_a", transpose_a) + .Attr("transpose_b", transpose_b)); }); } TEST_F(OpTest, MatrixDiag) { Repeatedly([this]() { - DataType type = Choose({DT_BOOL, DT_INT32, DT_FLOAT}); - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("MatrixDiag") - .Input(RandomTensor(type, RandomDims(1))) - .Attr("T", type)); + DataType type = Choose({DT_INT32, DT_FLOAT}); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("MatrixDiag") + .RandomInput(type, RandomDims(1)) + .Attr("T", type)); }); } TEST_F(OpTest, MatrixDiagPart) { Repeatedly([this]() { - DataType type = Choose({DT_BOOL, DT_INT32, DT_FLOAT}); - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("MatrixDiagPart") - .Input(RandomTensor(type, RandomDims(2))) - .Attr("T", type)); + DataType type = Choose({DT_INT32, DT_FLOAT}); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("MatrixDiagPart") + .RandomInput(type, RandomDims(2)) + .Attr("T", type)); }); } TEST_F(OpTest, Max) { Repeatedly([this]() { DataType type = Choose({DT_INT32, DT_FLOAT}); - Tensor data = RandomTensor(type); - Tensor indices = RandomReductionIndices(data.dims()); + std::vector data_dims = RandomDims(); + Tensor indices = RandomReductionIndices(data_dims.size()); bool keep_dims = Choose({false, true}); - ExpectTfAndXlaOutputsAreClose( - OpTestBuilder("Max").Input(data).Input(indices).Attr("T", type).Attr( - "keep_dims", keep_dims)); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Max") + .RandomInput(type, data_dims) + .Input(indices) + .Attr("T", type) + .Attr("keep_dims", keep_dims)); }); } @@ -1359,26 +1725,28 @@ TEST_F(OpTest, Maximum) { Repeatedly([this]() { DataType type = Choose({DT_INT32, DT_FLOAT}); auto dims = BroadcastableDims(); - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Maximum") - .Input(RandomTensor(type, dims.first)) - .Input(RandomTensor(type, dims.second)) - .Attr("T", type)); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Maximum") + .RandomInput(type, dims.first) + .RandomInput(type, dims.second) + .Attr("T", type)); }); } TEST_F(OpTest, MaxPool) { Repeatedly([this]() { std::uniform_int_distribution random_int(1, 5); - int kernel_rows = random_int(generator()), - kernel_cols = random_int(generator()); + std::vector dims = RandomDims(4, 4, 1); + int kernel_rows = + std::uniform_int_distribution(1, dims[1])(generator()); + int kernel_cols = + std::uniform_int_distribution(1, dims[2])(generator()); int stride_rows = random_int(generator()), stride_cols = random_int(generator()); + string padding = Choose({"SAME", "VALID"}); - ExpectTfAndXlaOutputsAreClose( + return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("MaxPool") - .Input( - RandomTensor(DT_FLOAT, {RandomDim(1), RandomDim(kernel_rows), - RandomDim(kernel_cols), RandomDim(1)})) + .RandomInput(DT_FLOAT, dims) .Attr("T", DT_FLOAT) .Attr("ksize", {1, kernel_rows, kernel_cols, 1}) .Attr("strides", {1, stride_rows, stride_cols, 1}) @@ -1388,29 +1756,66 @@ TEST_F(OpTest, MaxPool) { // TODO(phawkins): test NCHW format (not supported by CPU) } +TEST_F(OpTest, MaxPool3D) { + Repeatedly([this]() { + std::uniform_int_distribution random_int(1, 5); + std::vector dims = RandomDims(5, 5, 1); + + std::vector input_dims, kernel_dims, stride_dims; + kernel_dims.push_back(1); + stride_dims.push_back(1); + for (int i = 0; i < 3; ++i) { + kernel_dims.push_back( + std::uniform_int_distribution(1, dims[i])(generator())); + input_dims.push_back(dims[i]); + stride_dims.push_back(random_int(generator())); + } + 
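// Illustrative sketch (hypothetical reimplementation, for reading these
// tests; the real ImageDims helper also handles NCHW): ImageDims appears to
// assemble a full shape from batch, feature and spatial dims for a layout.
#include <vector>
std::vector<long long> ImageDimsNHWC(long long batch, long long feature,
                                     const std::vector<long long>& spatial) {
  std::vector<long long> dims = {batch};                    // N first
  dims.insert(dims.end(), spatial.begin(), spatial.end());  // spatial dims
  dims.push_back(feature);                                  // C last
  return dims;
}
// With batch = feature = 1 and spatial = strides, the same trick yields the
// {1, s..., 1} strides attribute used by the Conv and pooling tests.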
kernel_dims.push_back(1); + stride_dims.push_back(1); + int64 batch = dims[3]; + int64 feature = dims[4]; + + string padding = Choose({"SAME", "VALID"}); + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("MaxPool3D") + .RandomInput(DT_FLOAT, + ImageDims(FORMAT_NHWC, batch, feature, input_dims)) + .Attr("T", DT_FLOAT) + .Attr("ksize", kernel_dims) + .Attr("strides", stride_dims) + .Attr("padding", padding) + .Attr("data_format", "NDHWC")); + }); + // TODO(phawkins): test NCHW format (not supported by CPU) +} + TEST_F(OpTest, Mean) { Repeatedly([this]() { DataType type = Choose({DT_INT32, DT_FLOAT}); // TODO(phawkins): CPU and XLA differ output for reducing across a // size-0 dimension (nan vs 0). For now, require size >= 1. - Tensor data = RandomTensor(type, RandomDims(0, kDefaultMaxRank, 1)); - Tensor indices = RandomReductionIndices(data.dims()); + std::vector data_dims = RandomDims(0, kDefaultMaxRank, 1); + Tensor indices = RandomReductionIndices(data_dims.size()); bool keep_dims = Choose({false, true}); - ExpectTfAndXlaOutputsAreClose( - OpTestBuilder("Mean").Input(data).Input(indices).Attr("T", type).Attr( - "keep_dims", keep_dims)); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Mean") + .RandomInput(type, data_dims) + .Input(indices) + .Attr("T", type) + .Attr("keep_dims", keep_dims)); }); } TEST_F(OpTest, Min) { Repeatedly([this]() { DataType type = Choose({DT_INT32, DT_FLOAT}); - Tensor data = RandomTensor(type); - Tensor indices = RandomReductionIndices(data.dims()); + std::vector data_dims = RandomDims(); + Tensor indices = RandomReductionIndices(data_dims.size()); bool keep_dims = Choose({false, true}); - ExpectTfAndXlaOutputsAreClose( - OpTestBuilder("Min").Input(data).Input(indices).Attr("T", type).Attr( - "keep_dims", keep_dims)); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Min") + .RandomInput(type, data_dims) + .Input(indices) + .Attr("T", type) + .Attr("keep_dims", keep_dims)); }); } @@ -1418,21 +1823,20 @@ TEST_F(OpTest, Minimum) { Repeatedly([this]() { DataType type = Choose({DT_INT32, DT_FLOAT}); auto dims = BroadcastableDims(); - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Minimum") - .Input(RandomTensor(type, dims.first)) - .Input(RandomTensor(type, dims.second)) - .Attr("T", type)); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Minimum") + .RandomInput(type, dims.first) + .RandomInput(type, dims.second) + .Attr("T", type)); }); } TEST_F(OpTest, Mod) { Repeatedly([this]() { auto dims = BroadcastableDims(); - ExpectTfAndXlaOutputsAreClose( - OpTestBuilder("Mod") - .Input(RandomTensor(DT_INT32, dims.first)) - .Input(RandomTensor(DT_INT32, dims.second)) - .Attr("T", DT_INT32)); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Mod") + .RandomInput(DT_INT32, dims.first) + .RandomInput(DT_INT32, dims.second) + .Attr("T", DT_INT32)); }); } @@ -1440,18 +1844,18 @@ TEST_F(OpTest, Mul) { Repeatedly([this]() { DataType type = Choose({DT_INT32, DT_FLOAT}); auto dims = BroadcastableDims(); - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Mul") - .Input(RandomTensor(type, dims.first)) - .Input(RandomTensor(type, dims.second)) - .Attr("T", type)); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Mul") + .RandomInput(type, dims.first) + .RandomInput(type, dims.second) + .Attr("T", type)); }); } TEST_F(OpTest, Neg) { Repeatedly([this]() { DataType type = Choose({DT_INT32, DT_FLOAT}); - ExpectTfAndXlaOutputsAreClose( - OpTestBuilder("Neg").Input(RandomTensor(type)).Attr("T", type)); + return ExpectTfAndXlaOutputsAreClose( + 
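// Illustrative sketch (hypothetical helper; standard reduction rule): the
// Max/Min/Mean/Prod/Sum tests all exercise one shape rule via
// RandomReductionIndices and the keep_dims attribute:
#include <set>
#include <vector>
std::vector<long long> ReducedShape(const std::vector<long long>& dims,
                                    const std::set<int>& axes, bool keep_dims) {
  std::vector<long long> out;
  for (int i = 0; i < static_cast<int>(dims.size()); ++i) {
    if (!axes.count(i)) out.push_back(dims[i]);  // untouched axis
    else if (keep_dims) out.push_back(1);        // kept as size 1
    // else: a reduced axis is dropped entirely
  }
  return out;
}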
OpTestBuilder("Neg").RandomInput(type).Attr("T", type)); }); } @@ -1459,10 +1863,48 @@ TEST_F(OpTest, NotEqual) { Repeatedly([this]() { DataType type = Choose({DT_INT32, DT_FLOAT}); auto dims = BroadcastableDims(); - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("NotEqual") - .Input(RandomTensor(type, dims.first)) - .Input(RandomTensor(type, dims.second)) - .Attr("T", type)); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("NotEqual") + .RandomInput(type, dims.first) + .RandomInput(type, dims.second) + .Attr("T", type)); + }); +} + +TEST_F(OpTest, OneHot) { + Repeatedly([this]() { + DataType type = Choose(kAllXlaTypes); + + std::vector dims = RandomDims(); + int num_dims = dims.size(); + + int32 depth = RandomDim(); + + Tensor indices(DT_INT32, TensorShape(dims)); + std::uniform_int_distribution distribution(-depth * 2, depth * 2); + test::FillFn(&indices, [this, &distribution](int i) -> int32 { + return distribution(generator()); + }); + + int axis = std::uniform_int_distribution(-num_dims - 5, + num_dims + 5)(generator()); + + OpTestBuilder builder("OneHot"); + builder.Attr("T", type); + builder.Attr("TI", DT_INT32); + builder.Attr("axis", axis); + builder.Input(indices); + builder.Input(test::AsScalar(depth)); + builder.RandomInput(type, {}); + builder.RandomInput(type, {}); + return ExpectTfAndXlaOutputsAreClose(builder); + }); +} + +TEST_F(OpTest, OnesLike) { + Repeatedly([this]() { + DataType type = Choose({DT_INT32, DT_FLOAT}); + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("OnesLike").RandomInput(type).Attr("T", type)); }); } @@ -1481,9 +1923,9 @@ TEST_F(OpTest, Pack) { builder.Attr("N", n); builder.Attr("axis", axis); for (int i = 0; i < n; ++i) { - builder.Input(RandomTensor(type, dims)); + builder.RandomInput(type, dims); } - ExpectTfAndXlaOutputsAreClose(builder); + return ExpectTfAndXlaOutputsAreClose(builder); }); } @@ -1491,23 +1933,26 @@ TEST_F(OpTest, Pack) { TEST_F(OpTest, Pad) { Repeatedly([this]() { DataType type = Choose(kAllXlaTypes); - Tensor t = RandomTensor(type); + std::vector t_dims = RandomDims(); // TODO(b/31741996): re-enable DT_INT64 when bug is fixed. // DataType tpaddings = Choose({DT_INT32, DT_INT64}); DataType tpaddings = DT_INT32; std::vector paddings_vec; std::uniform_int_distribution distribution(0, 7); - for (int i = 0; i < t.dims(); ++i) { + for (int i = 0; i < t_dims.size(); ++i) { paddings_vec.push_back(distribution(generator())); paddings_vec.push_back(distribution(generator())); } Tensor paddings; - CHECK(paddings.CopyFrom(AsIntTensor(tpaddings, paddings_vec), - TensorShape({t.dims(), 2}))); - ExpectTfAndXlaOutputsAreClose( - OpTestBuilder("Pad").Input(t).Input(paddings).Attr("T", type).Attr( - "Tpaddings", tpaddings)); + CHECK( + paddings.CopyFrom(AsIntTensor(tpaddings, paddings_vec), + TensorShape({static_cast(t_dims.size()), 2}))); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Pad") + .RandomInput(type, t_dims) + .Input(paddings) + .Attr("T", type) + .Attr("Tpaddings", tpaddings)); }); } @@ -1516,23 +1961,24 @@ TEST_F(OpTest, Pow) { // nontermination. 
Repeatedly([this]() { auto dims = BroadcastableDims(); - ExpectTfAndXlaOutputsAreClose( - OpTestBuilder("Pow") - .Input(RandomTensor(DT_FLOAT, dims.first)) - .Input(RandomTensor(DT_FLOAT, dims.second)) - .Attr("T", DT_FLOAT)); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Pow") + .RandomInput(DT_FLOAT, dims.first) + .RandomInput(DT_FLOAT, dims.second) + .Attr("T", DT_FLOAT)); }); } TEST_F(OpTest, Prod) { Repeatedly([this]() { DataType type = Choose({DT_INT32, DT_FLOAT}); - Tensor data = RandomTensor(type); - Tensor indices = RandomReductionIndices(data.dims()); + std::vector data_dims = RandomDims(); + Tensor indices = RandomReductionIndices(data_dims.size()); bool keep_dims = Choose({false, true}); - ExpectTfAndXlaOutputsAreClose( - OpTestBuilder("Prod").Input(data).Input(indices).Attr("T", type).Attr( - "keep_dims", keep_dims)); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Prod") + .RandomInput(type, data_dims) + .Input(indices) + .Attr("T", type) + .Attr("keep_dims", keep_dims)); }); } @@ -1547,7 +1993,7 @@ TEST_F(OpTest, Range) { }; std::uniform_int_distribution distribution(-50, 50); DataType tidx = Choose({DT_INT32, DT_INT64, DT_FLOAT, DT_DOUBLE}); - ExpectTfAndXlaOutputsAreClose( + return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("Range") .Input(ToScalar(tidx, distribution(generator()))) .Input(ToScalar(tidx, distribution(generator()))) @@ -1559,8 +2005,8 @@ TEST_F(OpTest, Range) { TEST_F(OpTest, Rank) { Repeatedly([this]() { DataType type = Choose({DT_INT32, DT_FLOAT}); - ExpectTfAndXlaOutputsAreClose( - OpTestBuilder("Rank").Input(RandomTensor(type)).Attr("T", type)); + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("Rank").RandomInput(type).Attr("T", type)); }); } @@ -1568,46 +2014,51 @@ TEST_F(OpTest, RealDiv) { Repeatedly([this]() { DataType type = DT_FLOAT; auto dims = BroadcastableDims(); - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("RealDiv") - .Input(RandomTensor(type, dims.first)) - .Input(RandomTensor(type, dims.second)) - .Attr("T", type)); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("RealDiv") + .RandomInput(type, dims.first) + .RandomInput(type, dims.second) + .Attr("T", type)); + }); +} + +TEST_F(OpTest, Reciprocal) { + Repeatedly([this]() { + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("Reciprocal").RandomInput(DT_FLOAT).Attr("T", DT_FLOAT)); }); } TEST_F(OpTest, Relu) { Repeatedly([this]() { - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Relu") - .Input(RandomTensor(DT_FLOAT)) - .Attr("T", DT_FLOAT)); + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("Relu").RandomInput(DT_FLOAT).Attr("T", DT_FLOAT)); }); } TEST_F(OpTest, Relu6) { Repeatedly([this]() { - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Relu6") - .Input(RandomTensor(DT_FLOAT)) - .Attr("T", DT_FLOAT)); + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("Relu6").RandomInput(DT_FLOAT).Attr("T", DT_FLOAT)); }); } TEST_F(OpTest, Relu6Grad) { Repeatedly([this]() { auto dims = RandomDims(1); - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Relu6Grad") - .Input(RandomTensor(DT_FLOAT, dims)) - .Input(RandomTensor(DT_FLOAT, dims)) - .Attr("T", DT_FLOAT)); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Relu6Grad") + .RandomInput(DT_FLOAT, dims) + .RandomInput(DT_FLOAT, dims) + .Attr("T", DT_FLOAT)); }); } TEST_F(OpTest, ReluGrad) { Repeatedly([this]() { auto dims = RandomDims(1); - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("ReluGrad") - .Input(RandomTensor(DT_FLOAT, dims)) - .Input(RandomTensor(DT_FLOAT, dims)) - .Attr("T", DT_FLOAT)); + return 
ExpectTfAndXlaOutputsAreClose(OpTestBuilder("ReluGrad") + .RandomInput(DT_FLOAT, dims) + .RandomInput(DT_FLOAT, dims) + .Attr("T", DT_FLOAT)); }); } @@ -1629,39 +2080,68 @@ TEST_F(OpTest, Reshape) { } } } - Tensor data = RandomTensor(type, dims_before); - ExpectTfAndXlaOutputsAreClose( + return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("Reshape") - .Input(data) + .RandomInput(type, dims_before) .Input(test::AsTensor( std::vector(dims_after.begin(), dims_after.end()))) .Attr("T", type)); }); } +TEST_F(OpTest, Reverse) { + Repeatedly([this]() { + std::vector dims = RandomDims(1); + DataType type = Choose({DT_INT32, DT_FLOAT}); + int64 rank = dims.size(); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Reverse") + .RandomInput(type, dims) + .RandomInput(DT_BOOL, {rank}) + .Attr("T", DT_FLOAT)); + }); +} + +TEST_F(OpTest, ReverseV2) { + Repeatedly([this]() { + DataType type = Choose({DT_INT32, DT_FLOAT}); + std::vector data_dims = RandomDims(); + Tensor indices = RandomReductionIndices(data_dims.size()); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("ReverseV2") + .RandomInput(type, data_dims) + .Input(indices) + .Attr("T", DT_FLOAT)); + }); +} + +TEST_F(OpTest, Round) { + Repeatedly([this]() { + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("Round").RandomInput(DT_FLOAT).Attr("T", DT_FLOAT)); + }); +} + TEST_F(OpTest, Rsqrt) { Repeatedly([this]() { - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Rsqrt") - .Input(RandomTensor(DT_FLOAT)) - .Attr("T", DT_FLOAT)); + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("Rsqrt").RandomInput(DT_FLOAT).Attr("T", DT_FLOAT)); }); } TEST_F(OpTest, RsqrtGrad) { Repeatedly([this]() { auto dims = RandomDims(); - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("RsqrtGrad") - .Input(RandomTensor(DT_FLOAT, dims)) - .Input(RandomTensor(DT_FLOAT, dims)) - .Attr("T", DT_FLOAT)); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("RsqrtGrad") + .RandomInput(DT_FLOAT, dims) + .RandomInput(DT_FLOAT, dims) + .Attr("T", DT_FLOAT)); }); } TEST_F(OpTest, Shape) { Repeatedly([this]() { DataType type = Choose(kAllXlaTypes); - ExpectTfAndXlaOutputsAreClose( - OpTestBuilder("Shape").Input(RandomTensor(type)).Attr("T", type)); + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("Shape").RandomInput(type).Attr("T", type)); }); } @@ -1673,76 +2153,235 @@ TEST_F(OpTest, ShapeN) { builder.Attr("T", type); builder.Attr("N", n); for (int i = 0; i < n; ++i) { - builder.Input(RandomTensor(type)); + builder.RandomInput(type); } - ExpectTfAndXlaOutputsAreClose(builder); + return ExpectTfAndXlaOutputsAreClose(builder); }); } TEST_F(OpTest, Sigmoid) { Repeatedly([this]() { - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Sigmoid") - .Input(RandomTensor(DT_FLOAT)) - .Attr("T", DT_FLOAT)); + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("Sigmoid").RandomInput(DT_FLOAT).Attr("T", DT_FLOAT)); }); } TEST_F(OpTest, SigmoidGrad) { Repeatedly([this]() { auto dims = RandomDims(); - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("SigmoidGrad") - .Input(RandomTensor(DT_FLOAT, dims)) - .Input(RandomTensor(DT_FLOAT, dims)) - .Attr("T", DT_FLOAT)); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("SigmoidGrad") + .RandomInput(DT_FLOAT, dims) + .RandomInput(DT_FLOAT, dims) + .Attr("T", DT_FLOAT)); }); } TEST_F(OpTest, Sign) { Repeatedly([this]() { DataType type = Choose({DT_INT32, DT_FLOAT}); - ExpectTfAndXlaOutputsAreClose( - OpTestBuilder("Sign").Input(RandomTensor(type)).Attr("T", type)); + return ExpectTfAndXlaOutputsAreClose( + 
OpTestBuilder("Sign").RandomInput(type).Attr("T", type)); }); } TEST_F(OpTest, Size) { Repeatedly([this]() { DataType type = Choose({DT_INT32, DT_FLOAT}); - ExpectTfAndXlaOutputsAreClose( - OpTestBuilder("Size").Input(RandomTensor(type)).Attr("T", type)); + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("Size").RandomInput(type).Attr("T", type)); }); } TEST_F(OpTest, Slice) { Repeatedly([this]() { DataType type = Choose(kAllXlaTypes); - Tensor data = RandomTensor(type); + std::vector data_dims = RandomDims(); - std::vector begin(data.dims()), size(data.dims()); - for (int i = 0; i < data.dims(); ++i) { - begin[i] = std::uniform_int_distribution( - 0, data.dim_size(i))(generator()); + std::vector begin(data_dims.size()), size(data_dims.size()); + for (int i = 0; i < data_dims.size(); ++i) { + begin[i] = + std::uniform_int_distribution(0, data_dims[i])(generator()); size[i] = std::uniform_int_distribution( - -1, data.dim_size(i) - begin[i])(generator()); + -1, data_dims[i] - begin[i])(generator()); } - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Slice") - .Input(data) - .Input(test::AsTensor(begin)) - .Input(test::AsTensor(size)) - .Attr("T", type) - .Attr("Index", DT_INT32)); + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("Slice") + .RandomInput(type, data_dims) + .Input(test::AsTensor(begin)) + .Input(test::AsTensor(size)) + .Attr("T", type) + .Attr("Index", DT_INT32)); }); } TEST_F(OpTest, Softmax) { Repeatedly([this]() { - ExpectTfAndXlaOutputsAreClose( + return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("Softmax") - .Input(RandomTensor(DT_FLOAT, RandomDims(2, 2))) + .RandomInput(DT_FLOAT, RandomDims(2, 2)) .Attr("T", DT_FLOAT)); }); } +TEST_F(OpTest, SoftmaxCrossEntropyWithLogits) { + Repeatedly([this]() { + std::vector dims = RandomDims(2, 2, 1); + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("SoftmaxCrossEntropyWithLogits") + .RandomInput(DT_FLOAT, dims) + .RandomInput(DT_FLOAT, dims) + .Attr("T", DT_FLOAT)); + }); +} + +TEST_F(OpTest, Softplus) { + Repeatedly([this]() { + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("Softplus").RandomInput(DT_FLOAT).Attr("T", DT_FLOAT)); + }); +} + +TEST_F(OpTest, SoftplusGrad) { + Repeatedly([this]() { + std::vector dims = RandomDims(); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("SoftplusGrad") + .RandomInput(DT_FLOAT, dims) + .RandomInput(DT_FLOAT, dims) + .Attr("T", DT_FLOAT)); + }); +} + +TEST_F(OpTest, SpaceToBatch) { + Repeatedly([this]() { + std::vector block_dims = RandomDims(4, 4, 0, 5); + const int num_block_dims = 2; + int64 block_size = RandomDim(0, 4); + + std::vector input_dims(1 + num_block_dims + 1); + input_dims[0] = RandomDim(); + for (int i = 0; i < num_block_dims; ++i) { + input_dims[1 + i] = block_dims[i] * block_size; + } + input_dims[1 + num_block_dims] = RandomDim(); + + std::vector padding_vals; + std::uniform_int_distribution distribution(0, 7); + for (int i = 0; i < num_block_dims; ++i) { + int64 pad_before; + int64 pad_after; + do { + pad_before = distribution(generator()); + pad_after = distribution(generator()); + } while (pad_before + pad_after > input_dims[1 + i]); + input_dims[1 + i] -= pad_before + pad_after; + padding_vals.push_back(pad_before); + padding_vals.push_back(pad_after); + } + Tensor paddings; + CHECK(paddings.CopyFrom(AsIntTensor(DT_INT32, padding_vals), + TensorShape({num_block_dims, 2}))); + + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("SpaceToBatch") + .RandomInput(DT_FLOAT, input_dims) + .Input(paddings) + .Attr("T", DT_FLOAT) + 
.Attr("block_size", block_size)); + }); +} + +TEST_F(OpTest, SpaceToBatchND) { + Repeatedly([this]() { + std::vector block_dims = RandomDims(1, 3, 0, 5); + int num_block_dims = block_dims.size(); + std::vector remaining_dims = RandomDims(0, 3); + std::vector block_multipliers = + RandomDims(block_dims.size(), block_dims.size(), 0, 4); + + std::vector input_dims(1 + num_block_dims + remaining_dims.size()); + input_dims[0] = RandomDim(); + for (int i = 0; i < num_block_dims; ++i) { + input_dims[1 + i] = block_dims[i] * block_multipliers[i]; + } + std::copy(remaining_dims.begin(), remaining_dims.end(), + input_dims.begin() + 1 + num_block_dims); + + std::vector padding_vals; + std::uniform_int_distribution distribution(0, 7); + for (int i = 0; i < num_block_dims; ++i) { + int64 pad_before; + int64 pad_after; + do { + pad_before = distribution(generator()); + pad_after = distribution(generator()); + } while (pad_before + pad_after > input_dims[1 + i]); + input_dims[1 + i] -= pad_before + pad_after; + padding_vals.push_back(pad_before); + padding_vals.push_back(pad_after); + } + Tensor paddings; + CHECK(paddings.CopyFrom(AsIntTensor(DT_INT32, padding_vals), + TensorShape({num_block_dims, 2}))); + + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("SpaceToBatchND") + .RandomInput(DT_FLOAT, input_dims) + .Input(test::AsTensor( + std::vector(block_dims.begin(), block_dims.end()))) + .Input(paddings) + .Attr("T", DT_FLOAT)); + }); +} + +TEST_F(OpTest, SparseMatMul) { + Repeatedly([this]() { + int64 x = RandomDim(); + int64 y = RandomDim(); + int64 z = RandomDim(); + + std::vector a_dims = {x, y}; + std::vector b_dims = {y, z}; + + std::bernoulli_distribution random_bool; + bool transpose_a = random_bool(generator()); + bool transpose_b = random_bool(generator()); + if (transpose_a) { + std::swap(a_dims[0], a_dims[1]); + } + if (transpose_b) { + std::swap(b_dims[0], b_dims[1]); + } + + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("SparseMatMul") + .RandomInput(DT_FLOAT, a_dims) + .RandomInput(DT_FLOAT, b_dims) + .Attr("Ta", DT_FLOAT) + .Attr("Tb", DT_FLOAT) + .Attr("transpose_a", transpose_a) + .Attr("transpose_b", transpose_b)); + }); +} + +TEST_F(OpTest, SparseSoftmaxCrossEntropyWithLogits) { + Repeatedly([this]() { + std::vector dims = RandomDims(2, 2, 1); + int64 batch_size = dims[0]; + int64 num_classes = dims[1]; + + std::vector indices(batch_size); + for (int64 i = 0; i < batch_size; ++i) { + indices[i] = + std::uniform_int_distribution(0, num_classes - 1)(generator()); + } + + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("SparseSoftmaxCrossEntropyWithLogits") + .RandomInput(DT_FLOAT, dims) + .Input(test::AsTensor(indices)) + .Attr("T", DT_FLOAT) + .Attr("Tlabels", DT_INT32)); + }); +} + TEST_F(OpTest, Split) { Repeatedly([this]() { DataType type = Choose(kAllXlaTypes); @@ -1754,110 +2393,54 @@ TEST_F(OpTest, Split) { // Ensure 'dim' is evenly divisible by 'n'. 
dims[dim] /= n; dims[dim] *= n; - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Split") - .Input(test::AsScalar(dim)) - .Input(RandomTensor(type, dims)) - .Attr("T", type) - .Attr("num_split", n)); - }); -} - -TEST_F(OpTest, Softplus) { - Repeatedly([this]() { - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Softplus") - .Input(RandomTensor(DT_FLOAT)) - .Attr("T", DT_FLOAT)); - }); -} - -TEST_F(OpTest, SoftplusGrad) { - Repeatedly([this]() { - std::vector dims = RandomDims(); - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("SoftplusGrad") - .Input(RandomTensor(DT_FLOAT, dims)) - .Input(RandomTensor(DT_FLOAT, dims)) - .Attr("T", DT_FLOAT)); - }); -} - -TEST_F(OpTest, SparseMatMul) { - Repeatedly([this]() { - int64 x = RandomDim(); - int64 y = RandomDim(); - int64 z = RandomDim(); - - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("SparseMatMul") - .Input(RandomTensor(DT_FLOAT, {x, y})) - .Input(RandomTensor(DT_FLOAT, {y, z})) - .Attr("Ta", DT_FLOAT) - .Attr("Tb", DT_FLOAT)); - - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("SparseMatMul") - .Input(RandomTensor(DT_FLOAT, {y, x})) - .Input(RandomTensor(DT_FLOAT, {y, z})) - .Attr("Ta", DT_FLOAT) - .Attr("Tb", DT_FLOAT) - .Attr("transpose_a", true)); - - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("SparseMatMul") - .Input(RandomTensor(DT_FLOAT, {x, y})) - .Input(RandomTensor(DT_FLOAT, {z, y})) - .Attr("Ta", DT_FLOAT) - .Attr("Tb", DT_FLOAT) - .Attr("transpose_b", true)); - - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("SparseMatMul") - .Input(RandomTensor(DT_FLOAT, {y, x})) - .Input(RandomTensor(DT_FLOAT, {z, y})) - .Attr("Ta", DT_FLOAT) - .Attr("Tb", DT_FLOAT) - .Attr("transpose_a", true) - .Attr("transpose_b", true)); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Split") + .Input(test::AsScalar(dim)) + .RandomInput(type, dims) + .Attr("T", type) + .Attr("num_split", n)); }); } TEST_F(OpTest, Sqrt) { Repeatedly([this]() { - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Sqrt") - .Input(RandomTensor(DT_FLOAT)) - .Attr("T", DT_FLOAT)); + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("Sqrt").RandomInput(DT_FLOAT).Attr("T", DT_FLOAT)); }); } TEST_F(OpTest, SquaredDifference) { Repeatedly([this]() { auto dims = BroadcastableDims(); - ExpectTfAndXlaOutputsAreClose( - OpTestBuilder("SquaredDifference") - .Input(RandomTensor(DT_FLOAT, dims.first)) - .Input(RandomTensor(DT_FLOAT, dims.second)) - .Attr("T", DT_FLOAT)); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("SquaredDifference") + .RandomInput(DT_FLOAT, dims.first) + .RandomInput(DT_FLOAT, dims.second) + .Attr("T", DT_FLOAT)); }); } TEST_F(OpTest, Square) { Repeatedly([this]() { DataType type = Choose({DT_INT32, DT_FLOAT}); - ExpectTfAndXlaOutputsAreClose( - OpTestBuilder("Square").Input(RandomTensor(type)).Attr("T", type)); + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("Square").RandomInput(type).Attr("T", type)); }); } TEST_F(OpTest, Squeeze) { Repeatedly([this]() { DataType type = Choose(kAllXlaTypes); - Tensor t = RandomTensor(type, RandomDims(0, kDefaultMaxRank, 0, 5)); + std::vector t_dims = RandomDims(0, kDefaultMaxRank, 0, 5); std::bernoulli_distribution random_bool; std::vector squeeze_dims; - for (int i = 0; i < t.dims(); ++i) { - if (t.dim_size(i) == 1 && random_bool(generator())) { + for (int i = 0; i < t_dims.size(); ++i) { + if (t_dims[i] == 1 && random_bool(generator())) { squeeze_dims.push_back(i); } } - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Squeeze") - .Input(t) - .Attr("squeeze_dims", squeeze_dims) - .Attr("T", type)); + return 
ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Squeeze") + .RandomInput(type, t_dims) + .Attr("squeeze_dims", squeeze_dims) + .Attr("T", type)); }); } @@ -1865,58 +2448,59 @@ TEST_F(OpTest, Sub) { Repeatedly([this]() { DataType type = Choose({DT_INT32, DT_FLOAT}); auto dims = BroadcastableDims(); - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Sub") - .Input(RandomTensor(type, dims.first)) - .Input(RandomTensor(type, dims.second)) - .Attr("T", type)); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Sub") + .RandomInput(type, dims.first) + .RandomInput(type, dims.second) + .Attr("T", type)); }); } TEST_F(OpTest, Sum) { Repeatedly([this]() { DataType type = Choose({DT_INT32, DT_FLOAT}); - Tensor data = RandomTensor(type); - Tensor indices = RandomReductionIndices(data.dims()); + std::vector data_dims = RandomDims(); + Tensor indices = RandomReductionIndices(data_dims.size()); bool keep_dims = Choose({false, true}); - ExpectTfAndXlaOutputsAreClose( - OpTestBuilder("Sum").Input(data).Input(indices).Attr("T", type).Attr( - "keep_dims", keep_dims)); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Sum") + .RandomInput(type, data_dims) + .Input(indices) + .Attr("T", type) + .Attr("keep_dims", keep_dims)); }); } TEST_F(OpTest, StridedSlice) { Repeatedly([this]() { DataType type = Choose(kAllXlaTypes); - Tensor data = RandomTensor(type); - - std::vector begin(data.dims()), end(data.dims()); - std::vector strides(data.dims()); - for (int i = 0; i < data.dims(); ++i) { + std::vector data_dims = RandomDims(); + std::vector begin(data_dims.size()), end(data_dims.size()); + std::vector strides(data_dims.size()); + for (int i = 0; i < data_dims.size(); ++i) { begin[i] = std::uniform_int_distribution( - -2 * data.dim_size(i), 2 * data.dim_size(i))(generator()); + -2 * data_dims[i], 2 * data_dims[i])(generator()); end[i] = std::uniform_int_distribution( - -2 * data.dim_size(i), 2 * data.dim_size(i))(generator()); + -2 * data_dims[i], 2 * data_dims[i])(generator()); // TODO(b/31360685): support strides other than 1 or -1 strides[i] = std::bernoulli_distribution()(generator()) ? 1 : -1; } - int64 max_bitmask = (1LL << data.dims()) - 1; + int64 max_bitmask = (1LL << data_dims.size()) - 1; std::uniform_int_distribution bitmask_distribution(0, max_bitmask); int64 begin_mask = bitmask_distribution(generator()); int64 end_mask = bitmask_distribution(generator()); // Create a ellipsis bitmask with at most one 1 bit set. int64 ellipsis_mask = 0; - if (data.dims() > 0 && std::bernoulli_distribution()(generator())) { - int ellipsis_pos = - std::uniform_int_distribution(0, data.dims() - 1)(generator()); + if (!data_dims.empty() && std::bernoulli_distribution()(generator())) { + int ellipsis_pos = std::uniform_int_distribution( + 0, data_dims.size() - 1)(generator()); ellipsis_mask = 1LL << ellipsis_pos; } int64 new_axis_mask = bitmask_distribution(generator()); int64 shrink_axis_mask = bitmask_distribution(generator()); - ExpectTfAndXlaOutputsAreClose( + return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("StridedSlice") - .Input(data) + .RandomInput(type, data_dims) .Input(test::AsTensor(begin)) .Input(test::AsTensor(end)) .Input(test::AsTensor(strides)) @@ -1966,13 +2550,13 @@ TEST_F(OpTest, StridedSliceGrad) { // TODO(phawkins): use shape inference for the forward op to compute the // gradient shape for the backward op. At present, there is a low // probability of the golden op succeeding. 
- ExpectTfAndXlaOutputsAreClose( + return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("StridedSliceGrad") .Input(test::AsTensor(dims)) .Input(test::AsTensor(begin)) .Input(test::AsTensor(end)) .Input(test::AsTensor(strides)) - .Input(RandomTensor(type, RandomDims(1))) + .RandomInput(type, RandomDims(1)) .Attr("T", type) .Attr("Index", DT_INT64) .Attr("begin_mask", begin_mask) @@ -1985,48 +2569,48 @@ TEST_F(OpTest, StridedSliceGrad) { TEST_F(OpTest, Tanh) { Repeatedly([this]() { - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Tanh") - .Input(RandomTensor(DT_FLOAT)) - .Attr("T", DT_FLOAT)); + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("Tanh").RandomInput(DT_FLOAT).Attr("T", DT_FLOAT)); }); } TEST_F(OpTest, TanhGrad) { Repeatedly([this]() { auto dims = RandomDims(); - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("TanhGrad") - .Input(RandomTensor(DT_FLOAT, dims)) - .Input(RandomTensor(DT_FLOAT, dims)) - .Attr("T", DT_FLOAT)); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("TanhGrad") + .RandomInput(DT_FLOAT, dims) + .RandomInput(DT_FLOAT, dims) + .Attr("T", DT_FLOAT)); }); } TEST_F(OpTest, Tile) { Repeatedly([this]() { DataType type = Choose(kAllXlaTypes); - Tensor t = RandomTensor(type, RandomDims(1)); - std::vector multiples(t.dims()); - for (int i = 0; i < t.dims(); ++i) { + std::vector t_dims = RandomDims(1); + std::vector multiples(t_dims.size()); + for (int i = 0; i < t_dims.size(); ++i) { multiples[i] = std::uniform_int_distribution(1, 3)(generator()); } - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Tile") - .Input(t) - .Input(test::AsTensor(multiples)) - .Attr("T", type)); + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("Tile") + .RandomInput(type, t_dims) + .Input(test::AsTensor(multiples)) + .Attr("T", type)); }); } TEST_F(OpTest, Transpose) { Repeatedly([this]() { DataType type = Choose(kAllXlaTypes); - Tensor data = RandomTensor(type); - std::vector perm(data.dims()); + std::vector data_dims = RandomDims(); + std::vector perm(data_dims.size()); std::iota(perm.begin(), perm.end(), 0); std::shuffle(perm.begin(), perm.end(), generator()); - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Transpose") - .Input(data) - .Input(test::AsTensor(perm)) - .Attr("T", type)); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Transpose") + .RandomInput(type, data_dims) + .Input(test::AsTensor(perm)) + .Attr("T", type)); }); } @@ -2034,10 +2618,10 @@ TEST_F(OpTest, TruncateDiv) { Repeatedly([this]() { DataType type = DT_INT32; auto dims = BroadcastableDims(); - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("TruncateDiv") - .Input(RandomTensor(type, dims.first)) - .Input(RandomTensor(type, dims.second)) - .Attr("T", type)); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("TruncateDiv") + .RandomInput(type, dims.first) + .RandomInput(type, dims.second) + .Attr("T", type)); }); } @@ -2045,18 +2629,18 @@ TEST_F(OpTest, TruncateMod) { Repeatedly([this]() { DataType type = Choose({DT_INT32, DT_FLOAT}); auto dims = BroadcastableDims(); - ExpectTfAndXlaOutputsAreClose(OpTestBuilder("TruncateMod") - .Input(RandomTensor(type, dims.first)) - .Input(RandomTensor(type, dims.second)) - .Attr("T", type)); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("TruncateMod") + .RandomInput(type, dims.first) + .RandomInput(type, dims.second) + .Attr("T", type)); }); } TEST_F(OpTest, ZerosLike) { Repeatedly([this]() { DataType type = Choose({DT_INT32, DT_FLOAT}); - ExpectTfAndXlaOutputsAreClose( - OpTestBuilder("ZerosLike").Input(RandomTensor(type)).Attr("T", type)); + return 
ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("ZerosLike").RandomInput(type).Attr("T", type)); }); } @@ -2075,6 +2659,9 @@ int main(int argc, char** argv) { tensorflow::Flag("tf_xla_test_repetitions", &tensorflow::tf_xla_test_repetitions, "Number of repetitions for each test."), + tensorflow::Flag("tf_xla_max_tensor_size", + &tensorflow::tf_xla_max_tensor_size, + "Maximum number of elements for random input tensors."), tensorflow::Flag("tf_xla_test_device", tensorflow::tf_xla_test_device_ptr, "Tensorflow device type to use for test"), tensorflow::Flag("tf_xla_test_use_jit", &tensorflow::tf_xla_test_use_jit, diff --git a/tensorflow/compiler/tests/reverse_ops_test.py b/tensorflow/compiler/tests/reverse_ops_test.py new file mode 100644 index 00000000000..18fabca28c9 --- /dev/null +++ b/tensorflow/compiler/tests/reverse_ops_test.py @@ -0,0 +1,65 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Functional tests for XLA Reverse Ops.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import itertools +import numpy as np + +from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.ops import array_ops +from tensorflow.python.platform import googletest + + +class ReverseOpsTest(XLATestCase): + + def testReverseOneDim(self): + shape = (7, 5, 9, 11) + for revdim in range(len(shape)): + self._AssertReverseEqual([revdim], shape) + + def testReverseMoreThanOneDim(self): + shape = (7, 5, 9, 11) + for revdims in itertools.chain.from_iterable( + itertools.combinations(range(len(shape)), k) + for k in range(2, len(shape)+1)): + self._AssertReverseEqual(revdims, shape) + + def _AssertReverseEqual(self, revdims, shape): + np.random.seed(120) + pval = np.random.randint(0, 100, size=shape).astype(float) + with self.test_session(): + with self.test_scope(): + p = array_ops.placeholder(dtypes.int32, shape=shape) + axis = constant_op.constant( + np.array(revdims, dtype=np.int32), + shape=(len(revdims),), dtype=dtypes.int32) + rval = array_ops.reverse(p, axis).eval({p: pval}) + + slices = [ + slice(-1, None, -1) if d in revdims else slice(None) + for d in range(len(shape))] + self.assertEqual( + pval[slices].flatten().tolist(), + rval.flatten().tolist()) + + +if __name__ == '__main__': + googletest.main() diff --git a/tensorflow/compiler/tests/rmsprop_test.py b/tensorflow/compiler/tests/rmsprop_test.py new file mode 100644 index 00000000000..ecdce4f052b --- /dev/null +++ b/tensorflow/compiler/tests/rmsprop_test.py @@ -0,0 +1,61 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for RMSProp optimizer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.python.framework import constant_op +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.ops import variables +from tensorflow.python.platform import test +from tensorflow.python.training import rmsprop + + +class RmspropTest(XLATestCase): + + def testBasic(self): + for dtype in self.float_types: + with self.test_session(), self.test_scope(): + var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype) + var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype) + grads0 = constant_op.constant([0.1, 0.1], dtype=dtype) + grads1 = constant_op.constant([0.01, 0.01], dtype=dtype) + rms_opt = rmsprop.RMSPropOptimizer(3.0) + rms_update = rms_opt.apply_gradients( + zip([grads0, grads1], [var0, var1])) + variables.global_variables_initializer().run() + + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([3.0, 4.0], var1.eval()) + + # Run 3 steps of RMSProp + for _ in range(3): + rms_update.run() + + # Validate updated params + self.assertAllCloseAccordingToType( + np.array([2.91705132e-04, 1.00029182e+00]), var0.eval()) + self.assertAllCloseAccordingToType( + np.array([2.89990854, 3.89990854]), var1.eval()) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/compiler/tests/slice_ops_test.py b/tensorflow/compiler/tests/slice_ops_test.py new file mode 100644 index 00000000000..4ddf2ee0dcb --- /dev/null +++ b/tensorflow/compiler/tests/slice_ops_test.py @@ -0,0 +1,145 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ==============================================================================
+"""Tests for slicing."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import googletest
+
+
+class SliceTest(XLATestCase):
+
+  def test1D(self):
+    for dtype in self.numeric_types:
+      with self.test_session():
+        i = array_ops.placeholder(dtype, shape=[10])
+        with self.test_scope():
+          o = array_ops.slice(i, [2], [4])
+        params = {
+            i: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+        }
+        result = o.eval(feed_dict=params)
+
+      self.assertAllEqual([2, 3, 4, 5], result)
+
+  def test3D(self):
+    for dtype in self.numeric_types:
+      with self.test_session():
+        i = array_ops.placeholder(dtype, shape=[3, 3, 10])
+        with self.test_scope():
+          o = array_ops.slice(i, [1, 2, 2], [1, 1, 4])
+        params = {
+            i: [[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+                 [9, 8, 7, 6, 5, 4, 3, 2, 1, 0],
+                 [5, 3, 1, 7, 9, 2, 4, 6, 8, 0]],
+                [[5, 5, 5, 5, 5, 5, 5, 5, 5, 5],
+                 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+                 [8, 7, 6, 5, 4, 3, 2, 1, 8, 7]],
+                [[7, 5, 7, 5, 7, 5, 7, 5, 7, 5],
+                 [1, 2, 1, 2, 1, 2, 1, 2, 1, 2],
+                 [9, 8, 7, 9, 8, 7, 9, 8, 7, 9]]]
+        }
+        result = o.eval(feed_dict=params)
+
+      self.assertAllEqual([[[6, 5, 4, 3]]], result)
+
+
+class StridedSliceTest(XLATestCase):
+
+  def test1D(self):
+    for dtype in self.numeric_types:
+      with self.test_session():
+        i = array_ops.placeholder(dtype, shape=[10])
+        with self.test_scope():
+          o = array_ops.strided_slice(i, [2], [6], [2])
+        params = {
+            i: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+        }
+        result = o.eval(feed_dict=params)
+
+      self.assertAllEqual([2, 4], result)
+
+  def test1DNegativeStride(self):
+    for dtype in self.numeric_types:
+      with self.test_session():
+        i = array_ops.placeholder(dtype, shape=[10])
+        with self.test_scope():
+          o = array_ops.strided_slice(i, [6], [2], [-2])
+        params = {
+            i: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+        }
+        result = o.eval(feed_dict=params)
+
+      self.assertAllEqual([6, 4], result)
+
+  def test3D(self):
+    for dtype in self.numeric_types:
+      with self.test_session():
+        i = array_ops.placeholder(dtype, shape=[3, 3, 10])
+        with self.test_scope():
+          o = array_ops.strided_slice(i, [0, 2, 2], [2, 3, 6], [1, 1, 2])
+        params = {
+            i: [[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+                 [9, 8, 7, 6, 5, 4, 3, 2, 1, 0],
+                 [5, 3, 1, 7, 9, 2, 4, 6, 8, 0]],
+                [[5, 5, 5, 5, 5, 5, 5, 5, 5, 5],
+                 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+                 [8, 7, 6, 5, 4, 3, 2, 1, 8, 7]],
+                [[7, 5, 7, 5, 7, 5, 7, 5, 7, 5],
+                 [1, 2, 1, 2, 1, 2, 1, 2, 1, 2],
+                 [9, 8, 7, 9, 8, 7, 9, 8, 7, 9]]]
+        }
+        result = o.eval(feed_dict=params)
+
+      self.assertAllEqual([[[1, 9]], [[6, 4]]], result)
+
+  def test3DNegativeStride(self):
+    for dtype in self.numeric_types:
+      with self.test_session():
+        i = array_ops.placeholder(dtype, shape=[3, 4, 10])
+        with self.test_scope():
+          o = array_ops.strided_slice(i, [2, 2, 6], [0, 0, 2], [-1, -1, -2])
+        params = {
+            i: [[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+                 [9, 8, 7, 6, 5, 4, 3, 2, 1, 0],
+                 [5, 3, 1, 7, 9, 2, 4, 6, 8, 0],
+                 [4, 5, 2, 4, 3, 7, 6, 8, 9, 4]],
+                [[5, 5, 5, 5, 5, 5, 5, 5, 5, 5],
+                 [4, 3, 4, 5, 7, 6, 5, 3, 4, 5],
+                 [8, 7, 6, 5, 4, 3, 2, 1, 8, 7],
+                 [7, 1, 7, 1, 8, 1, 8, 1, 3, 1]],
+                [[7, 5, 7, 5, 7, 5, 7, 5, 7, 5],
+                 [1, 2, 1, 2, 1, 2, 1, 2, 1, 2],
+                 [9, 8, 7, 9, 8, 7, 9, 8, 7, 9],
+                 [9, 9, 5, 5, 6, 6, 3, 3, 6, 6]]]
+        }
+        result = o.eval(feed_dict=params)
+
self.assertAllEqual([[[9, 8], + [1, 1]], + [[2, 4], + [5, 7]]], result) + +if __name__ == "__main__": + googletest.main() diff --git a/tensorflow/compiler/tests/spacetobatch_op_test.py b/tensorflow/compiler/tests/spacetobatch_op_test.py new file mode 100644 index 00000000000..9c3b86c84b2 --- /dev/null +++ b/tensorflow/compiler/tests/spacetobatch_op_test.py @@ -0,0 +1,266 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Functional tests for SpaceToBatch and BatchToSpace ops.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import gen_array_ops +from tensorflow.python.platform import test + + +def space_to_batch_direct(input_array, block_shape, paddings): + """Direct Python implementation of space-to-batch conversion. + + This is used for tests only. + + Args: + input_array: N-D array + block_shape: 1-D array of shape [num_block_dims]. + paddings: 2-D array of shape [num_block_dims, 2]. + + Returns: + Converted tensor. 
+ """ + input_array = np.array(input_array) + block_shape = np.array(block_shape) + num_block_dims = len(block_shape) + paddings = np.array(paddings).reshape((len(block_shape), 2)) + + padded = np.pad(input_array, + pad_width=([[0, 0]] + list(paddings) + [[0, 0]] * + (input_array.ndim - 1 - num_block_dims)), + mode="constant") + reshaped_padded_shape = [input_array.shape[0]] + output_shape = [input_array.shape[0] * np.prod(block_shape)] + for block_dim, block_shape_value in enumerate(block_shape): + reduced_size = padded.shape[block_dim + 1] // block_shape_value + reshaped_padded_shape.append(reduced_size) + output_shape.append(reduced_size) + reshaped_padded_shape.append(block_shape_value) + reshaped_padded_shape.extend(input_array.shape[num_block_dims + 1:]) + output_shape.extend(input_array.shape[num_block_dims + 1:]) + + reshaped_padded = padded.reshape(reshaped_padded_shape) + permuted_reshaped_padded = np.transpose(reshaped_padded, ( + list(np.arange(num_block_dims) * 2 + 2) + [0] + + list(np.arange(num_block_dims) * 2 + 1) + list( + np.arange(input_array.ndim - num_block_dims - 1) + 1 + num_block_dims + * 2))) + return permuted_reshaped_padded.reshape(output_shape) + + +class SpaceToBatchTest(XLATestCase): + """Tests input-output pairs for the SpaceToBatch and BatchToSpace ops.""" + + def _testPad(self, inputs, paddings, block_size, outputs): + with self.test_session() as sess, self.test_scope(): + for dtype in self.float_types: + # outputs = space_to_batch(inputs) + placeholder = array_ops.placeholder(dtype) + x_tf = gen_array_ops._space_to_batch( + placeholder, paddings, block_size=block_size) + self.assertAllEqual(sess.run(x_tf, {placeholder: inputs}), outputs) + # inputs = batch_to_space(outputs) + x_tf = gen_array_ops._batch_to_space( + placeholder, paddings, block_size=block_size) + self.assertAllEqual(sess.run(x_tf, {placeholder: outputs}), inputs) + + def _testOne(self, inputs, block_size, outputs): + paddings = np.zeros((2, 2), dtype=np.int32) + self._testPad(inputs, paddings, block_size, outputs) + + # [1, 2, 2, 1] <-> [4, 1, 1, 1] + def testSmallInput2x2(self): + x_np = [[[[1], [2]], [[3], [4]]]] + block_size = 2 + x_out = [[[[1]]], [[[2]]], [[[3]]], [[[4]]]] + self._testOne(x_np, block_size, x_out) + + # [1, 2, 2, 1] <-> [1, 3, 3, 1] (padding) <-> [9, 1, 1, 1] + def testSmallInput2x2Pad1x0(self): + x_np = [[[[1], [2]], [[3], [4]]]] + paddings = np.array([[1, 0], [1, 0]], dtype=np.int32) + block_size = 3 + x_out = [[[[0]]], [[[0]]], [[[0]]], [[[0]]], [[[1]]], [[[2]]], [[[0]]], + [[[3]]], [[[4]]]] + self._testPad(x_np, paddings, block_size, x_out) + + # Test with depth larger than 1. + # [1, 2, 2, 3] <-> [4, 1, 1, 3] + def testDepthInput2x2(self): + x_np = [[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]] + block_size = 2 + x_out = [[[[1, 2, 3]]], [[[4, 5, 6]]], [[[7, 8, 9]]], [[[10, 11, 12]]]] + self._testOne(x_np, block_size, x_out) + + # Test for larger input dimensions. + # [1, 4, 4, 1] <-> [4, 2, 2, 1] + def testLargerInput2x2(self): + x_np = [[[[1], [2], [3], [4]], [[5], [6], [7], [8]], + [[9], [10], [11], [12]], [[13], [14], [15], [16]]]] + block_size = 2 + x_out = [[[[1], [3]], [[9], [11]]], [[[2], [4]], [[10], [12]]], + [[[5], [7]], [[13], [15]]], [[[6], [8]], [[14], [16]]]] + self._testOne(x_np, block_size, x_out) + + # Test with batch larger than 1. 
+ # [2, 2, 4, 1] <-> [8, 1, 2, 1] + def testBatchInput2x2(self): + x_np = [[[[1], [2], [3], [4]], [[5], [6], [7], [8]]], + [[[9], [10], [11], [12]], [[13], [14], [15], [16]]]] + block_size = 2 + x_out = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]], + [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]] + self._testOne(x_np, block_size, x_out) + + # Tests for larger input spatial dimensions AND batch larger than 1, to ensure + # that elements are correctly laid out spatially and properly interleaved + # along the batch dimension. + # [2, 4, 4, 1] <-> [8, 2, 2, 1] + def testLargerInputBatch2x2(self): + x_np = [[[[1], [2], [3], [4]], [[5], [6], [7], [8]], + [[9], [10], [11], [12]], [[13], [14], [15], [16]]], + [[[17], [18], [19], [20]], [[21], [22], [23], [24]], + [[25], [26], [27], [28]], [[29], [30], [31], [32]]]] + x_out = [[[[1], [3]], [[9], [11]]], [[[17], [19]], [[25], [27]]], + [[[2], [4]], [[10], [12]]], [[[18], [20]], [[26], [28]]], + [[[5], [7]], [[13], [15]]], [[[21], [23]], [[29], [31]]], + [[[6], [8]], [[14], [16]]], [[[22], [24]], [[30], [32]]]] + block_size = 2 + self._testOne(x_np, block_size, x_out) + + +class SpaceToBatchNDTest(XLATestCase): + """Tests input-output pairs for the SpaceToBatchND and BatchToSpaceND ops.""" + + def _testPad(self, inputs, block_shape, paddings, outputs): + block_shape = np.array(block_shape) + paddings = np.array(paddings).reshape((len(block_shape), 2)) + with self.test_session() as sess, self.test_scope(): + for dtype in self.float_types: + placeholder = array_ops.placeholder(dtype) + # outputs = space_to_batch(inputs) + x_tf = array_ops.space_to_batch_nd(placeholder, block_shape, paddings) + self.assertAllEqual(sess.run(x_tf, {placeholder: inputs}), outputs) + # inputs = batch_to_space(outputs) + placeholder = array_ops.placeholder(dtype) + x_tf = array_ops.batch_to_space_nd(placeholder, block_shape, paddings) + self.assertAllEqual(sess.run(x_tf, {placeholder: outputs}), inputs) + + def _testDirect(self, input_shape, block_shape, paddings): + inputs = np.arange(np.prod(input_shape), dtype=np.float32) + inputs = inputs.reshape(input_shape) + self._testPad(inputs, block_shape, paddings, + space_to_batch_direct(inputs, block_shape, paddings)) + + def testZeroBlockDimsZeroRemainingDims(self): + self._testPad( + inputs=[1, 2], + block_shape=[], + paddings=[], + outputs=[1, 2],) + + def testZeroBlockDimsOneRemainingDim(self): + self._testPad( + inputs=[[1, 2], [3, 4]], + block_shape=[], + paddings=[], + outputs=[[1, 2], [3, 4]]) + + # Same thing, but with a no-op block dim. + self._testPad( + inputs=[[1, 2], [3, 4]], + block_shape=[1], + paddings=[[0, 0]], + outputs=[[1, 2], [3, 4]]) + + def testZeroBlockDimsTwoRemainingDims(self): + self._testPad( + inputs=[[[1, 2], [3, 4]], [[5, 6], [7, 8]]], + block_shape=[], + paddings=[], + outputs=[[[1, 2], [3, 4]], [[5, 6], [7, 8]]]) + + # Same thing, but with a no-op block dim. + self._testPad( + inputs=[[[1, 2], [3, 4]], [[5, 6], [7, 8]]], + block_shape=[1], + paddings=[[0, 0]], + outputs=[[[1, 2], [3, 4]], [[5, 6], [7, 8]]]) + + # Same thing, but with two no-op block dims. 
+ self._testPad( + inputs=[[[1, 2], [3, 4]], [[5, 6], [7, 8]]], + block_shape=[1, 1], + paddings=[[0, 0], [0, 0]], + outputs=[[[1, 2], [3, 4]], [[5, 6], [7, 8]]]) + + def testOneBlockDimZeroRemainingDims(self): + self._testPad( + inputs=[[1, 2, 3], [4, 5, 6]], + block_shape=[2], + paddings=[1, 0], + outputs=[[0, 2], [0, 5], [1, 3], [4, 6]]) + + def testOneBlockDimOneRemainingDim(self): + self._testPad( + inputs=[[[1, 11], [2, 21], [3, 31]], [[4, 41], [5, 51], [6, 61]]], + block_shape=[2], + paddings=[1, 0], + outputs=[[[0, 0], [2, 21]], [[0, 0], [5, 51]], [[1, 11], [3, 31]], + [[4, 41], [6, 61]]]) + + def testDirect(self): + # Test with zero-size remaining dimension. + self._testDirect( + input_shape=[3, 1, 2, 0], block_shape=[3], paddings=[[0, 2]]) + + # Test with zero-size blocked dimension. + self._testDirect( + input_shape=[3, 0, 2, 5], block_shape=[3], paddings=[[0, 0]]) + + # Test with padding up from zero size. + self._testDirect( + input_shape=[3, 0, 2, 5], block_shape=[3], paddings=[[1, 2]]) + + self._testDirect( + input_shape=[3, 3, 4, 5, 2], + block_shape=[3, 4, 2], + paddings=[[1, 2], [0, 0], [3, 0]]) + + self._testDirect( + input_shape=[3, 3, 4, 5, 2], + block_shape=[3, 4, 2, 2], + paddings=[[1, 2], [0, 0], [3, 0], [0, 0]]) + + self._testDirect( + input_shape=[3, 2, 2, 3, 4, 5, 2, 5], + block_shape=[1, 1, 3, 4, 2, 2], + paddings=[[0, 0], [0, 0], [1, 2], [0, 0], [3, 0], [0, 0]]) + + self._testDirect( + input_shape=[3, 2, 2, 3, 4, 5, 2, 5], + block_shape=[1, 1, 3, 4, 2, 2, 1], + paddings=[[0, 0], [0, 0], [1, 2], [0, 0], [3, 0], [0, 0], [0, 0]]) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/compiler/tests/tensor_array_ops_test.py b/tensorflow/compiler/tests/tensor_array_ops_test.py new file mode 100644 index 00000000000..27a29773053 --- /dev/null +++ b/tensorflow/compiler/tests/tensor_array_ops_test.py @@ -0,0 +1,1018 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Functional tests for XLA TensorArray Ops.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.compiler.tests import xla_test +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import gen_data_flow_ops +from tensorflow.python.ops import gradients_impl +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.ops import tensor_array_grad # pylint: disable=unused-import +from tensorflow.python.ops import tensor_array_ops +from tensorflow.python.ops import variables +from tensorflow.python.platform import test + + +def _make_converter(dtype): + def _converter(x): + return np.asarray(x).astype(dtype.as_numpy_dtype) + return _converter + + +class TensorArrayTest(xla_test.XLATestCase): + + def testTensorArrayWriteRead(self): + with self.test_session() as session, self.test_scope(): + ta = tensor_array_ops.TensorArray( + dtype=dtypes.float32, + tensor_array_name="foo", + size=3) + + w0 = ta.write(0, [[4.0, 5.0]]) + w1 = w0.write(1, [[1.0, 3.0]]) + w2 = w1.write(2, [[7.0, -8.5]]) + + r0 = w2.read(0) + r1 = w2.read(1) + r2 = w2.read(2) + + d0, d1, d2 = session.run([r0, r1, r2]) + self.assertAllEqual([[4.0, 5.0]], d0) + self.assertAllEqual([[1.0, 3.0]], d1) + self.assertAllEqual([[7.0, -8.5]], d2) + + def _testTensorArrayWritePack(self, tf_dtype): + with self.test_session(), self.test_scope(): + ta = tensor_array_ops.TensorArray( + dtype=tf_dtype, tensor_array_name="foo", size=3) + + convert = _make_converter(tf_dtype) + + w0 = ta.write(0, convert([[4.0, 5.0]])) + w1 = w0.write(1, convert([[6.0, 7.0]])) + w2 = w1.write(2, convert([[8.0, 9.0]])) + + c0 = w2.stack() + + self.assertAllEqual( + convert([[[4.0, 5.0]], [[6.0, 7.0]], [[8.0, 9.0]]]), c0.eval()) + + def testTensorArrayWritePack(self): + for dtype in self.numeric_tf_types: + self._testTensorArrayWritePack(dtype) + + def testEmptyTensorArrayPack(self): + with self.test_session(), self.test_scope(): + ta = tensor_array_ops.TensorArray( + dtype=dtypes.float32, tensor_array_name="foo", size=3) + + empty_element = np.zeros((0, 1), dtype=np.float32) + w0 = ta.write(0, empty_element) + w1 = w0.write(1, empty_element) + w2 = w1.write(2, empty_element) + + c0 = w2.stack() + + self.assertAllEqual([3, 0, 1], c0.eval().shape) + + def _testTensorArrayWriteConcat(self, tf_dtype): + with self.test_session(), self.test_scope(): + ta = tensor_array_ops.TensorArray( + dtype=tf_dtype, tensor_array_name="foo", size=3) + + convert = _make_converter(tf_dtype) + + w0 = ta.write(0, convert([[4.0, 5.0], [104.0, 105.0]])) + w1 = w0.write(1, convert([[6.0, 7.0], [106.0, 107.0]])) + w2 = w1.write(2, convert([[8.0, 9.0], [204.0, 205.0]])) + + c0 = w2.concat() + + self.assertAllEqual( + convert([[4.0, 5.0], [104.0, 105.0], [6.0, 7.0], + [106.0, 107.0], [8.0, 9.0], [204.0, 205.0]]), c0.eval()) + + def testTensorArrayWriteConcat(self): + for dtype in self.numeric_tf_types: + self._testTensorArrayWriteConcat(dtype) + + def _testTensorArrayUnpackRead(self, tf_dtype): + with self.test_session() as session, self.test_scope(): + ta = tensor_array_ops.TensorArray( + dtype=tf_dtype, tensor_array_name="foo", size=3) + + convert = 
_make_converter(tf_dtype) + + # Unpack a vector into scalars + w0 = ta.unstack(convert([1.0, 2.0, 3.0])) + r0 = w0.read(0) + r1 = w0.read(1) + r2 = w0.read(2) + + d0, d1, d2 = session.run([r0, r1, r2]) + self.assertAllEqual(convert(1.0), d0) + self.assertAllEqual(convert(2.0), d1) + self.assertAllEqual(convert(3.0), d2) + + ta = tensor_array_ops.TensorArray( + dtype=tf_dtype, tensor_array_name="foo", size=3) + + # Unpack a matrix into vectors + w1 = ta.unstack(convert([[1.0, 1.1], [2.0, 2.1], [3.0, 3.1]])) + r0 = w1.read(0) + r1 = w1.read(1) + r2 = w1.read(2) + + d0, d1, d2 = session.run([r0, r1, r2]) + self.assertAllEqual(convert([1.0, 1.1]), d0) + self.assertAllEqual(convert([2.0, 2.1]), d1) + self.assertAllEqual(convert([3.0, 3.1]), d2) + + # Reset ta because we're going to change the shape, else shape + # inference will throw an error. + ta = tensor_array_ops.TensorArray( + dtype=tf_dtype, tensor_array_name="foo", size=3) + + # Try unpacking an empty matrix, which should not cause an error. + w2 = ta.unstack(convert([[], [], []])) + r0 = w2.read(0) + r1 = w2.read(1) + r2 = w2.read(2) + + d0, d1, d2 = session.run([r0, r1, r2]) + self.assertAllEqual(convert([]), d0) + self.assertAllEqual(convert([]), d1) + self.assertAllEqual(convert([]), d2) + + def _testTensorArrayUnpackReadMaybeLegacy(self): + for dtype in self.numeric_tf_types: + self._testTensorArrayUnpackRead(dtype) + + def testTensorArrayUnpackRead(self): + self._testTensorArrayUnpackReadMaybeLegacy() + + def _testTensorArraySplitRead(self, tf_dtype): + with self.test_session() as session, self.test_scope(): + ta = tensor_array_ops.TensorArray( + dtype=tf_dtype, tensor_array_name="foo", size=3) + + convert = _make_converter(tf_dtype) + + # Split an empty vector + lengths = constant_op.constant([0, 0, 0]) + w0 = ta.split(convert([]), lengths=lengths) + r0 = w0.read(0) + r1 = w0.read(1) + r2 = w0.read(2) + + d0, d1, d2 = session.run([r0, r1, r2]) + self.assertAllEqual(convert([]), d0) + self.assertAllEqual(convert([]), d1) + self.assertAllEqual(convert([]), d2) + + # Split a vector + ta = tensor_array_ops.TensorArray( + dtype=tf_dtype, tensor_array_name="foo", size=3) + lengths = constant_op.constant([1, 1, 1]) + w0 = ta.split(convert([1.0, 2.0, 3.0]), lengths=lengths) + r0 = w0.read(0) + r1 = w0.read(1) + r2 = w0.read(2) + + d0, d1, d2 = session.run([r0, r1, r2]) + self.assertAllEqual(convert([1.0]), d0) + self.assertAllEqual(convert([2.0]), d1) + self.assertAllEqual(convert([3.0]), d2) + + # Split a matrix + ta = tensor_array_ops.TensorArray( + dtype=tf_dtype, tensor_array_name="foo", size=3) + lengths = constant_op.constant([1, 1, 1]) + w0 = ta.split( + convert([[1.0, 101.0], [2.0, 201.0], [3.0, 301.0]]), lengths=lengths) + r0 = w0.read(0) + r1 = w0.read(1) + r2 = w0.read(2) + + d0, d1, d2 = session.run([r0, r1, r2]) + self.assertAllEqual(convert([[1.0, 101.0]]), d0) + self.assertAllEqual(convert([[2.0, 201.0]]), d1) + self.assertAllEqual(convert([[3.0, 301.0]]), d2) + + def testTensorArraySplitRead(self): + for dtype in self.numeric_tf_types: + self._testTensorArraySplitRead(dtype) + + def testTensorGradArrayWriteRead(self): + with self.test_session() as session, self.test_scope(): + ta = tensor_array_ops.TensorArray( + dtype=dtypes.float32, + tensor_array_name="foo", + size=3) + + w0 = ta.write(0, [[4.0]]) + w1 = w0.write(1, [[1.0]]) + w2 = w1.write(2, [[-3.0]]) + + g_ta = w2.grad("grad") + + g_w0 = g_ta.write(0, [[5.0]]) + g_w1 = g_w0.write(1, [[2.0]]) + g_w2 = g_w1.write(2, [[-2.0]]) + + r0 = w2.read(0) + r1 = w2.read(1) + r2 
= w2.read(2) + + g_r0 = g_w2.read(0) + g_r1 = g_w2.read(1) + g_r2 = g_w2.read(2) + + d0, d1, d2, g_d0, g_d1, g_d2 = session.run([r0, r1, r2, g_r0, g_r1, g_r2]) + self.assertAllEqual([[4.0]], d0) + self.assertAllEqual([[1.0]], d1) + self.assertAllEqual([[-3.0]], d2) + self.assertAllEqual([[5.0]], g_d0) + self.assertAllEqual([[2.0]], g_d1) + self.assertAllEqual([[-2.0]], g_d2) + + def testTensorGradArrayDynamicWriteRead(self): + with self.test_session() as session, self.test_scope(): + ta = tensor_array_ops.TensorArray( + dtype=dtypes.float32, + tensor_array_name="foo", + size=3) + + w0 = ta.write(0, [[4.0]]) + w1 = w0.write(1, [[1.0]]) + w2 = w1.write(2, [[-3.0]]) + + g_ta = w2.grad("grad") # Get gradient array here so we know the shape + + s = w2.size() + g_s = g_ta.size() + + g_w0 = g_ta.write(0, [[5.0]]) + g_w1 = g_w0.write(1, [[2.0]]) + g_w2 = g_w1.write(2, [[-2.0]]) + + r0 = w2.read(0) + r1 = w2.read(1) + r2 = w2.read(2) + + g_r0 = g_w2.read(0) + g_r1 = g_w2.read(1) + g_r2 = g_w2.read(2) + + d0, d1, d2, g_d0, g_d1, g_d2, vs, g_vs = session.run( + [r0, r1, r2, g_r0, g_r1, g_r2, s, g_s]) + self.assertAllEqual([[4.0]], d0) + self.assertAllEqual([[1.0]], d1) + self.assertAllEqual([[-3.0]], d2) + self.assertAllEqual([[5.0]], g_d0) + self.assertAllEqual([[2.0]], g_d1) + self.assertAllEqual([[-2.0]], g_d2) + self.assertAllEqual(3, vs) + self.assertAllEqual(3, g_vs) + + def testTensorGradAccessTwiceReceiveSameObject(self): + with self.test_session() as session, self.test_scope(): + ta = tensor_array_ops.TensorArray( + dtype=dtypes.float32, tensor_array_name="foo", size=3, + element_shape=[1, 2]) + g_ta_0 = ta.grad("grad") + g_ta_1 = ta.grad("grad") + + with ops.control_dependencies([g_ta_0.write(0, [[4.0, 5.0]]).flow]): + # Write with one gradient handle, read with another copy of it + r1_0 = g_ta_1.read(0) + + t_g_ta_0, t_g_ta_1, d_r1_0 = session.run( + [g_ta_0.handle.op, g_ta_1.handle.op, r1_0]) + self.assertAllEqual(t_g_ta_0, t_g_ta_1) + self.assertAllEqual([[4.0, 5.0]], d_r1_0) + + def testTensorArrayWriteWrongIndexOrDataTypeFails(self): + with self.test_session(), self.test_scope(): + ta = tensor_array_ops.TensorArray( + dtype=dtypes.float32, tensor_array_name="foo", size=3) + + # Test writing the wrong datatype + with self.assertRaisesOpError( + "TensorArray dtype is float but op has dtype int32"): + ta.write(-1, np.int32(7)).flow.eval() + + def testTensorArrayReadWrongIndexOrDataTypeFails(self): + with self.test_session(), self.test_scope(): + ta = tensor_array_ops.TensorArray( + dtype=dtypes.float32, tensor_array_name="foo", size=3) + + w0 = ta.write(0, [[4.0, 5.0]]) + + # Test reading wrong datatype + r0_bad = gen_data_flow_ops._tensor_array_read_v3( + handle=w0.handle, index=0, dtype=dtypes.float64, flow_in=w0.flow) + with self.assertRaisesOpError( + "TensorArray dtype is float but Op requested dtype double."): + r0_bad.eval() + + # Test reading from a different index than the one we wrote to + w0.read(1) + + def testTensorArraySplitIncompatibleShapesFails(self): + with self.test_session(), self.test_scope(): + ta = tensor_array_ops.TensorArray( + dtype=dtypes.float32, + tensor_array_name="foo", + size=3, + infer_shape=False) + + with self.assertRaisesOpError( + r"value is not 1D"): + lengths = array_ops.placeholder(dtypes.int64) + ta.split([1.0, 2.0, 3.0], lengths).flow.eval(feed_dict={lengths: 1}) + + with self.assertRaisesOpError( + r"lengths must be equal: 1 vs. 
2"): + ta.split([1.0, 2.0, 3.0], [1, 2, 3]).flow.eval() + + with self.assertRaisesOpError( + r"value must have rank >= 1"): + ta.split(1.0, [1]).flow.eval() + + ta = tensor_array_ops.TensorArray( + dtype=dtypes.float32, + tensor_array_name="foo", + size=2, + infer_shape=False) + + with self.assertRaisesOpError( + r"TensorArray's size is not equal to the size of lengths " + r"\(1 vs. 2\)"): + ta.split([1.0], [1]).flow.eval() + + def _testTensorArrayWriteGradientAddMultipleAdds(self, dtype): + with self.test_session(), self.test_scope(): + ta = tensor_array_ops.TensorArray( + dtype=dtype, tensor_array_name="foo", size=3, infer_shape=False) + + c = lambda x: np.asarray(x, dtype=dtype.as_numpy_dtype) + + w0 = ta.write(2, c(3.0)) + w1 = w0.write(2, c(4.0)) + + ta_grad = w1.grad("grad") + + w0_grad = ta_grad.write(2, c(3.0)) + w1_grad = w0_grad.write(2, c(4.0)) + w2_grad = w1_grad.write(2, c(5.0)) + + # Assert that aggregation works correctly + self.assertAllEqual(c(12.00), w2_grad.read(2).eval()) + + # Using differing shapes causes an exception + wb0_grad = ta_grad.write(1, c(1.0)) + wb1_grad = wb0_grad.write(1, c([1.0])) + + with self.assertRaisesOpError( + r"Mismatched TensorArray sizes"): + wb1_grad.flow.eval() + + def testTensorArrayWriteGradientAddMultipleAdds(self): + for dtype in self.numeric_tf_types: + self._testTensorArrayWriteGradientAddMultipleAdds(dtype) + + def testMultiTensorArray(self): + with self.test_session(), self.test_scope(): + h1 = tensor_array_ops.TensorArray( + size=1, dtype=dtypes.float32, tensor_array_name="foo") + w1 = h1.write(0, 4.0) + r1 = w1.read(0) + + h2 = tensor_array_ops.TensorArray( + size=1, dtype=dtypes.float32, tensor_array_name="bar") + + w2 = h2.write(0, 5.0) + r2 = w2.read(0) + r = r1 + r2 + self.assertAllClose(9.0, r.eval()) + + def _testTensorArrayGradientWriteReadType(self, dtype): + with self.test_session() as session, self.test_scope(): + ta = tensor_array_ops.TensorArray( + dtype=dtypes.as_dtype(dtype), + tensor_array_name="foo", + size=3, + infer_shape=False) + + c = lambda x: np.array(x, dtype=dtype) + + value_0 = constant_op.constant(c([[4.0, 5.0]])) + value_1 = constant_op.constant(c([[3.0, 3.5]])) + + w0 = ta.write(0, value_0) + w1 = w0.write(1, value_1) + r0 = w1.read(0) + r1 = w1.read(1) + r0_2 = w1.read(0) + + # Test individual components' gradients + grad_just_r0 = gradients_impl.gradients( + ys=[r0], xs=[value_0], grad_ys=[c([[2.0, 3.0]])]) + grad_just_r0_vals = session.run(grad_just_r0) + self.assertAllEqual(c([[2.0, 3.0]]), grad_just_r0_vals[0]) + + grad_r0_r0_2 = gradients_impl.gradients( + ys=[r0, r0_2], + xs=[value_0], + grad_ys=[c([[2.0, 3.0]]), c([[1.0, -1.0]])]) + grad_r0_r0_2_vals = session.run(grad_r0_r0_2) + self.assertAllEqual(c([[3.0, 2.0]]), grad_r0_r0_2_vals[0]) + + grad_just_r1 = gradients_impl.gradients( + ys=[r1], xs=[value_1], grad_ys=[c([[-2.0, -4.0]])]) + grad_just_r1_vals = session.run(grad_just_r1) + self.assertAllEqual(c([[-2.0, -4.0]]), grad_just_r1_vals[0]) + + # Test combined gradients + grad = gradients_impl.gradients( + ys=[r0, r0_2, r1], + xs=[value_0, value_1], + grad_ys=[c([[2.0, 3.0]]), c([[1.0, -1.0]]), c([[-2.0, -10.0]])]) + grad_vals = session.run(grad) + self.assertEqual(len(grad_vals), 2) + self.assertAllEqual(c([[3.0, 2.0]]), grad_vals[0]) + self.assertAllEqual(c([[-2.0, -10.0]]), grad_vals[1]) + + def testTensorArrayGradientWriteRead(self): + for dtype in self.numeric_types: + self._testTensorArrayGradientWriteReadType(dtype) + + def _testTensorArrayGradientWritePackConcatAndRead(self): + with 
self.test_session() as sess, self.test_scope(): + ta = tensor_array_ops.TensorArray( + dtype=dtypes.float32, + tensor_array_name="foo", + size=2, + clear_after_read=False) + + value_0 = constant_op.constant([-1.0, 1.0]) + value_1 = constant_op.constant([-10.0, 10.0]) + + w0 = ta.write(0, value_0) + w1 = w0.write(1, value_1) + p0 = w1.stack() + r0 = w1.read(0) + s0 = w1.concat() + + # Test gradient accumulation between read(0), pack(), and concat() + with ops.control_dependencies([p0, r0, s0]): + grad_r = gradients_impl.gradients( + ys=[p0, r0, s0], + xs=[value_0, value_1], + grad_ys=[ + [[2.0, 3.0], [4.0, 5.0]], # stack gradient + [-0.5, 1.5], # read(0) gradient + [20.0, 30.0, 40.0, 50.0], # concat gradient + ]) + grad_vals = sess.run(grad_r) # 2 + 2 entries + + self.assertAllClose([2.0 - 0.5 + 20.0, 3.0 + 1.5 + 30.0], grad_vals[0]) + self.assertAllEqual([4.0 + 40.0, 5.0 + 50.0], grad_vals[1]) + + def testTensorArrayGradientWritePackConcatAndRead(self): + self._testTensorArrayGradientWritePackConcatAndRead() + + def testTensorArrayReadTwice(self): + with self.test_session(), self.test_scope(): + value = constant_op.constant([[1.0, -1.0], [10.0, -10.0]]) + + ta_readtwice = tensor_array_ops.TensorArray( + dtype=dtypes.float32, + tensor_array_name="foo", + size=2, + clear_after_read=False) + w_readtwice = ta_readtwice.unstack(value) + r0_readtwice = w_readtwice.read(0) + with ops.control_dependencies([r0_readtwice]): + r1_readtwice = w_readtwice.read(0) + + self.assertAllEqual([1.0, -1.0], r1_readtwice.eval()) + + def _testTensorArrayGradientUnpackRead(self): + with self.test_session() as session, self.test_scope(): + ta = tensor_array_ops.TensorArray( + dtype=dtypes.float32, + tensor_array_name="foo", + size=2, + clear_after_read=False) + + value = constant_op.constant([[1.0, -1.0], [10.0, -10.0]]) + + w = ta.unstack(value) + r0 = w.read(0) + r0_1 = w.read(0) + r1 = w.read(1) + + # Test combined gradients + aggregation of read(0) + grad = gradients_impl.gradients( + ys=[r0, r0_1, r1], + xs=[value], + grad_ys=[[2.0, 3.0], [-1.5, 1.5], [4.0, 5.0]]) + grad_vals = session.run(grad) + + self.assertEqual(len(grad_vals), 1) + self.assertAllEqual([[2.0 - 1.5, 3.0 + 1.5], [4.0, 5.0]], grad_vals[0]) + + def testTensorArrayGradientUnpackRead(self): + self._testTensorArrayGradientUnpackRead() + + def testTensorArrayGradientSplitConcat(self): + with self.test_session() as session, self.test_scope(): + ta = tensor_array_ops.TensorArray( + dtype=dtypes.float32, tensor_array_name="foo", size=2) + + value = constant_op.constant( + [[1.0, -1.0], [10.0, -10.0], [100.0, -100.0], [1000.0, -1000.0]]) + + w = ta.split(value, [2, 2]) + r = w.concat() + + # Test combined gradients + grad = gradients_impl.gradients( + ys=[r], + xs=[value], + grad_ys=[[[2.0, -2.0], [20.0, -20.0], [200.0, -200.0], + [2000.0, -2000.0]]]) + grad_vals = session.run(grad) + + self.assertEqual(len(grad_vals), 1) + self.assertAllEqual([[2.0, -2.0], [20.0, -20.0], [200.0, -200.0], + [2000.0, -2000.0]], + grad_vals[0]) + + # TODO(phawkins): implement TensorArrayClose + # def testCloseTensorArray(self): + # with self.test_session() as session, self.test_scope(): + # ta = tensor_array_ops.TensorArray( + # dtype=dtypes.float32, tensor_array_name="foo", size=3) + # c1 = ta.close() + # session.run(c1) + + def testSizeTensorArray(self): + with self.test_session(), self.test_scope(): + ta = tensor_array_ops.TensorArray( + dtype=dtypes.float32, tensor_array_name="foo", size=3) + s = ta.size() + self.assertAllEqual(3, s.eval()) + + # TODO(phawkins): 
implement TensorArrayClose + # def testWriteCloseTensorArray(self): + # with self.test_session(), self.test_scope(): + # ta = tensor_array_ops.TensorArray( + # dtype=dtypes.float32, + # tensor_array_name="foo", + # size=3, + # infer_shape=False) + # w0 = ta.write(0, [[4.0, 5.0]]) + # w1 = w0.write(1, [3.0]) + # w1.close().run() # Expected to run without problems + + # TODO(phawkins): implement while loops. + # def _testWhileLoopWritePackGradients(self, dynamic_size, dtype): + # np_dtype = dtype.as_numpy_dtype + # with self.test_session() as session, self.test_scope(): + # v0 = array_ops.identity(np.arange(3 * 5, dtype=np_dtype).reshape(3, 5)) + # var = variables.Variable(np.arange(100, 105, dtype=np_dtype)) + # state0 = array_ops.identity(np.array([1] * 5, dtype=np_dtype)) + # ta = tensor_array_ops.TensorArray( + # dtype=dtype, + # tensor_array_name="foo", + # size=0 if dynamic_size else 3, + # dynamic_size=dynamic_size) + # time_0 = array_ops.identity(0) + + # def body(time, ta_t, state): + # sliced = array_ops.slice( + # v0, begin=array_ops.stack([time, 0]), size=[1, -1]) + # sliced = array_ops.squeeze(sliced) + # out = sliced + var + state + # state += sliced + # ta_t = ta_t.write(time, out) + # return (time + 1, ta_t, state) + + # (unused_0, h_final, unused_2) = control_flow_ops.while_loop( + # cond=lambda time, unused_1, unused_2: time < 3, + # body=body, + # loop_vars=(time_0, ta, state0), + # shape_invariants=(time_0.get_shape(), tensor_shape.unknown_shape(), + # tensor_shape.unknown_shape()), + # parallel_iterations=3) + # vout = h_final.stack() + + # grad_val = -np.arange(3 * 5, dtype=np_dtype).reshape(3, 5) + # v0_grad = gradients_impl.gradients([vout], [v0], [grad_val])[0] + # state0_grad = gradients_impl.gradients([vout], [state0], [grad_val])[0] + # var_grad = gradients_impl.gradients([vout], [var], [grad_val])[0] + + # variables.global_variables_initializer().run() + # state0_t, var_t, v0_t, vout_t, v0_grad_t, var_grad_t, state0_grad_t = ( + # session.run([state0, var, v0, vout, v0_grad, var_grad, state0_grad]) + # ) + # just_v0_grad_t, = session.run([v0_grad]) + + # # state = [ state0 | state0 + v0[0] | state0 + v0[0] + v0[1] ] + # # vout = [ v0[0] + var + state[0] | + # # v0[1] + var + state[1] | + # # v0[2] + var + state[2] ] + # # = [ v0[0] + var + state0 | + # # v0[1] + var + state0 + v0[0] | + # # v0[2] + var + state0 + v0[0] + v0[1] ] + # # + # # d(vout[0])/d(v0) = [1 | 0 | 0 ] + # # d(vout[1])/d(v0) = [1 | 1 | 0 ] + # # d(vout[2])/d(v0) = [1 | 1 | 1 ] + # # d(vout)/d(var) = [1 | 1 | 1] + # # d(vout)/d(state0) = [ 1 | 1 | 1 ] + + # state_per_time = np.array( + # [state0_t, state0_t + v0_t[0, :], + # state0_t + v0_t[0, :] + v0_t[1, :]]) + + # # Compare forward prop + # self.assertAllClose(v0_t + var_t + state_per_time, vout_t) + + # # Compare backward prop + # expected_v0_grad_t = np.array([ + # grad_val[0, :] + grad_val[1, :] + grad_val[2, :], + # grad_val[1, :] + grad_val[2, :], grad_val[2, :] + # ]) + + # self.assertAllEqual(expected_v0_grad_t, v0_grad_t) + # self.assertAllEqual(expected_v0_grad_t, just_v0_grad_t) + # self.assertAllClose(grad_val.sum(axis=0), var_grad_t) + # self.assertAllClose(grad_val.sum(axis=0), state0_grad_t) + + # def testWhileLoopWritePackGradients(self): + # self._testWhileLoopWritePackGradients( + # dynamic_size=False, dtype=dtypes.float32) + # # TODO(ebrevdo): re-enable when While supports non-float32 gradients. 
+ # # self._testWhileLoopWritePackGradients( + # # dynamic_size=False, dtype=tf.int64) + + # def testWhileLoopDynamicWritePackGradients(self): + # self._testWhileLoopWritePackGradients( + # dynamic_size=True, dtype=dtypes.float32) + + # def testGradSerialTwoLoops(self): + # with self.test_session(), self.test_scope(): + # num_steps = 100 + # acc = tensor_array_ops.TensorArray( + # dtype=dtypes.float32, + # size=num_steps, + # clear_after_read=False, + # element_shape=tensor_shape.scalar()) + # i = constant_op.constant(0, name="i") + # x = constant_op.constant(2.0, name="x") + + # c = lambda i, acc: i < 5 + + # def b(i, acc): + # x1 = control_flow_ops.cond( + # math_ops.equal(i, 0), lambda: x, + # lambda: math_ops.multiply(acc.read(i - 1), 2.0)) + # return i + 1, acc.write(i, x1) + + # i1, acc1 = control_flow_ops.while_loop(c, b, [i, acc]) + + # z = constant_op.constant(0.0) + + # def fn(i, acc): + # return i + 1, acc.write(i, z) + + # _, acc2 = control_flow_ops.while_loop(lambda i, acc: i < num_steps, fn, + # [i1, acc1]) + + # r = acc2.stack() + # grad = gradients_impl.gradients(r, [x])[0] + # self.assertAllClose(31.0, grad.eval()) + + def testSumOfTwoReadVariablesWithoutRepeatGrad(self): + with self.test_session() as session, self.test_scope(): + a = array_ops.identity( + np.arange( + 3 * 5, dtype=np.float32).reshape(3, 5) + 1) + b = array_ops.identity( + np.arange( + 3 * 5, dtype=np.float32).reshape(3, 5) + 1 + 3 * 5) + ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=2) + ta = ta.write(0, a, name="write_a") + ta = ta.write(1, b, name="write_b") + c = ( + ta.read( + 0, name="read_a_0") + # a + b + ta.read( + 1, name="read_b_0")) + g0 = -(np.arange(3 * 5, dtype=np.float32).reshape(3, 5) + 1) + grad_a = gradients_impl.gradients([c], [a], [g0])[0] # d(a+b)/da = 1 + grad_b = gradients_impl.gradients([c], [b], [g0])[0] # d(a+b)/db = 1 + + # Test gradients calculated individually + grad_a_t, = session.run([grad_a]) + self.assertAllEqual(grad_a_t, g0) + + grad_b_t, = session.run([grad_b]) + self.assertAllEqual(grad_b_t, g0) + + # Test gradients calculated jointly + joint_grad_a_t, joint_grad_b_t = session.run([grad_a, grad_b]) + self.assertAllEqual(joint_grad_a_t, g0) + self.assertAllEqual(joint_grad_b_t, g0) + + def testWriteShape(self): + with self.test_session(), self.test_scope(): + ta = tensor_array_ops.TensorArray( + dtype=dtypes.float32, tensor_array_name="foo", size=3) + c0 = constant_op.constant([4.0, 5.0]) + w0 = ta.write(0, c0) + r0 = w0.read(0) + self.assertAllEqual(c0.get_shape(), r0.get_shape()) + + ta = tensor_array_ops.TensorArray( + dtype=dtypes.float32, tensor_array_name="foo", size=3) + c1 = constant_op.constant([6.0, 7.0]) + w1 = w0.write(1, c1) + r0 = w1.read(0) + r1 = w1.read(1) + self.assertAllEqual(c0.get_shape(), r0.get_shape()) + self.assertAllEqual(c1.get_shape(), r1.get_shape()) + + ta = tensor_array_ops.TensorArray( + dtype=dtypes.float32, tensor_array_name="foo", size=3) + c2 = constant_op.constant([4.0, 5.0, 6.0]) + with self.assertRaises(ValueError): + w0.write(0, c2) + + def testPartlyUnknownShape(self): + with self.test_session(), self.test_scope(): + ta = tensor_array_ops.TensorArray( + dtype=dtypes.float32, tensor_array_name="foo", size=6) + + c0 = array_ops.placeholder(dtypes.float32, [None, None, None, 3]) + w0 = ta.write(0, c0) + r0 = w0.read(0) + self.assertAllEqual([None, None, None, 3], r0.get_shape().as_list()) + + c1 = array_ops.placeholder(dtypes.float32, [None, None, None, 3]) + w1 = w0.write(1, c1) + r1 = w1.read(0) + 
self.assertAllEqual([None, None, None, 3], r1.get_shape().as_list()) + + # Writing less specific shape (doesn't change type.) + c2 = array_ops.placeholder(dtypes.float32, [None, None, None, None]) + w2 = w1.write(2, c2) + r2 = w2.read(0) + self.assertAllEqual([None, None, None, 3], r2.get_shape().as_list()) + + # Writing more specific shape in one dimension and less specific in + # another. + c3 = array_ops.placeholder(dtypes.float32, [None, None, 2, None]) + w3 = w2.write(3, c3) + r3 = w3.read(0) + self.assertAllEqual([None, None, 2, 3], r3.get_shape().as_list()) + + # Writing partly defined shape using TensorArray.scatter. + c4 = array_ops.placeholder(dtypes.float32, [2, None, 4, 2, 3]) + w4 = w3.scatter([4, 5], c4) + r4 = w4.read(0) + self.assertAllEqual([None, 4, 2, 3], r4.get_shape().as_list()) + + # Writing fully defined shape using TensorArray.split. + c5 = array_ops.placeholder(dtypes.float32, [10, 4, 2, 3]) + w5 = w4.split(c5, constant_op.constant([5, 5])) + r5 = w5.read(0) + self.assertAllEqual([5, 4, 2, 3], r5.get_shape().as_list()) + + def _testUnpackShape(self): + with self.test_session(), self.test_scope(): + ta = tensor_array_ops.TensorArray( + dtype=dtypes.float32, + tensor_array_name="foo", + size=0, + infer_shape=True) + value = constant_op.constant( + [[1.0, -1.0], [10.0, -10.0], [100.0, -100.0]]) + w0 = ta.unstack(value) + r0 = w0.read(0) + self.assertAllEqual((2,), r0.get_shape()) + + c1 = constant_op.constant([4.0, 5.0]) + w1 = w0.write(3, c1) + r1 = w1.read(0) + self.assertAllEqual(c1.get_shape(), r1.get_shape()) + + c2 = constant_op.constant([4.0, 5.0, 6.0]) + with self.assertRaises(ValueError): + w1.write(4, c2) + + def testUnpackShape(self): + self._testUnpackShape() + + def testSplitShape(self): + with self.test_session(), self.test_scope(): + ta = tensor_array_ops.TensorArray( + dtype=dtypes.float32, + tensor_array_name="foo", + size=0, + infer_shape=True) + value = constant_op.constant([[1.0, -1.0], [2.0, -2.0], [3.0, -3.0]]) + w0 = ta.split(value, [1, 1, 1]) + r0 = w0.read(0) + self.assertAllEqual((1, 2), r0.get_shape()) + + ta1 = tensor_array_ops.TensorArray( + dtype=dtypes.float32, + tensor_array_name="foo1", + size=0, + infer_shape=True) + w0 = ta1.split(value, [1, 2]) + r0 = w0.read(0) + self.assertAllEqual(r0.get_shape(), tensor_shape.unknown_shape()) + + def testWriteUnknownShape(self): + with self.test_session(), self.test_scope(): + ta = tensor_array_ops.TensorArray( + dtype=dtypes.float32, + tensor_array_name="foo", + size=3, + infer_shape=True) + c0 = array_ops.placeholder(dtypes.float32) + w0 = ta.write(0, c0) + r0 = w0.read(0) + self.assertAllEqual(r0.get_shape(), tensor_shape.unknown_shape()) + + def _testGradientWhenNotAllComponentsRead(self): + with self.test_session() as session, self.test_scope(): + ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=2) + x = constant_op.constant([2.0, 3.0]) + w = ta.unstack(x) + r0 = w.read(0) + # calculate (dr0/dx0, dr0/dx1). since r0 = x0, gradients are (1, 0). + grad_r0 = gradients_impl.gradients(ys=[r0], xs=[x], grad_ys=[1.0]) + grad_r0_vals = session.run(grad_r0)[0] + self.assertAllEqual(grad_r0_vals, [1.0, 0.0]) + + def testGradientWhenNotAllComponentsRead(self): + self._testGradientWhenNotAllComponentsRead() + + def _testTensorArrayEvalEmpty(self): + with self.test_session(), self.test_scope(): + ta = tensor_array_ops.TensorArray( + dtype=dtypes.float32, size=0, infer_shape=False) + with self.assertRaisesOpError( + "TensorArray has size zero, but element shape is not fully " + "defined. 
Currently only static shapes are supported when packing "
+          "zero-size TensorArrays."):
+        ta.stack().eval()
+
+  def testTensorArrayEvalEmpty(self):
+    self._testTensorArrayEvalEmpty()
+
+  def _testTensorArrayEvalEmptyWithDefault(self):
+    with self.test_session(), self.test_scope():
+      ta = tensor_array_ops.TensorArray(
+          dtype=dtypes.float32, size=0, infer_shape=True)
+      self.assertEqual(0, ta.size().eval())
+      ta = ta.unstack(array_ops.zeros([0, 3, 5]))
+      packed = ta.stack()
+      self.assertAllEqual([0, 3, 5], packed.eval().shape)
+      # Concatenating zero tensors along their first dimension gives a
+      # first dimension of zero.
+      self.assertAllEqual([0, 5], ta.concat().eval().shape)
+
+  def testTensorArrayEvalEmptyWithDefault(self):
+    self._testTensorArrayEvalEmptyWithDefault()
+
+  def testTensorArrayScatterReadAndGradients(self):
+    with self.test_session() as session, self.test_scope():
+      ta = tensor_array_ops.TensorArray(
+          dtype=dtypes.float32,
+          tensor_array_name="foo",
+          size=10)
+
+      indices = constant_op.constant([1, 8])
+      value = constant_op.constant([[1.0, -1.0], [10.0, -10.0]])
+
+      w = ta.scatter(indices, value)
+      r0 = w.read(1)
+      r1 = w.read(8)
+
+      # Test combined gradients of the two reads.
+      grad = gradients_impl.gradients(
+          ys=[r0, r1], xs=[value], grad_ys=[[2.0, 3.0], [4.0, 5.0]])
+      read_vals, grad_vals = session.run([[r0, r1], grad])
+
+      self.assertEqual(len(read_vals), 2)
+      self.assertEqual(len(grad_vals), 1)
+      self.assertAllEqual([1.0, -1.0], read_vals[0])
+      self.assertAllEqual([10.0, -10.0], read_vals[1])
+      self.assertAllEqual([[2.0, 3.0], [4.0, 5.0]], grad_vals[0])
+
+  def testTensorArrayWriteGatherAndGradients(self):
+    with self.test_session() as session, self.test_scope():
+      ta = tensor_array_ops.TensorArray(
+          dtype=dtypes.float32,
+          tensor_array_name="foo",
+          size=10)
+
+      values = constant_op.constant([[1.0 * x, -1.0 * x] for x in range(10)])
+      indices = constant_op.constant([1, 8])
+
+      w = ta.unstack(values)
+      g = w.gather(indices)
+
+      # Test the gradient of the gather.
+      grad = gradients_impl.gradients(
+          ys=[g], xs=[values], grad_ys=[[[2.0, 3.0], [4.0, 5.0]]])
+      g_vals, grad_vals = session.run([[g], grad])
+
+      # Gradients for the 8 unread components are zero.
+      expected_grad = np.zeros((10, 2))
+      expected_grad[1] = [2.0, 3.0]
+      expected_grad[8] = [4.0, 5.0]
+
+      self.assertEqual(len(g_vals), 1)
+      self.assertEqual(len(grad_vals), 1)
+      self.assertAllEqual([[1.0, -1.0], [8.0, -8.0]], g_vals[0])
+      self.assertAllEqual(expected_grad, grad_vals[0])
+
+  def testTensorArrayIdentity(self):
+    with self.test_session() as session, self.test_scope():
+      ta0 = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=2,
+                                         infer_shape=False)
+      ta1 = tensor_array_ops.TensorArray(dtype=dtypes.int32, size=4,
+                                         infer_shape=True)
+
+      ta0 = ta0.write(0, 0.)
+      ta1 = ta1.write(0, 1)
+
+      v0 = resource_variable_ops.ResourceVariable(0)
+      v1 = resource_variable_ops.ResourceVariable(0)
+
+      with ops.control_dependencies([v0.assign_add(1)]):
+        ta0 = ta0.identity()
+
+      with ops.control_dependencies([v1.assign_add(1)]):
+        ta1 = ta1.identity()
+
+      read0 = ta0.read(0)
+      read1 = ta1.read(0)
+
+      size0 = ta0.size()
+      size1 = ta1.size()
+
+      # Tests correct properties on new TensorArrays.
+      self.assertEqual(dtypes.float32, ta0.dtype)
+      self.assertEqual(dtypes.int32, ta1.dtype)
+      self.assertEqual(tensor_shape.unknown_shape(), read0.get_shape())
+      self.assertEqual(tensor_shape.scalar(), read1.get_shape())
+
+      variables.global_variables_initializer().run()
+
+      read0_v, read1_v, size0_v, size1_v = session.run(
+          (read0, read1, size0, size1))
+
+      # Tests that the control dependencies were added and executed.
+      self.assertEqual(1, v0.eval())
+      self.assertEqual(1, v1.eval())
+
+      # Tests that reads and sizes come from the correct TensorArray.
+      self.assertEqual(read0_v, 0)
+      self.assertEqual(read1_v, 1)
+      self.assertEqual(size0_v, 2)
+      self.assertEqual(size1_v, 4)
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/compiler/tests/ternary_ops_test.py b/tensorflow/compiler/tests/ternary_ops_test.py
index 22024f45116..ba5f829936f 100644
--- a/tensorflow/compiler/tests/ternary_ops_test.py
+++ b/tensorflow/compiler/tests/ternary_ops_test.py
@@ -75,6 +75,20 @@ class TernaryOpsTest(XLATestCase):
         np.array(7, dtype=np.float32),
         expected=np.array(7, dtype=np.float32))
 
+    self._testTernary(
+        array_ops.where,
+        np.array(1, dtype=np.bool),
+        np.array([1, 2, 3, 4], dtype=np.float32),
+        np.array([5, 6, 7, 8], dtype=np.float32),
+        expected=np.array([1, 2, 3, 4], dtype=np.float32))
+
+    self._testTernary(
+        array_ops.where,
+        np.array(0, dtype=np.bool),
+        np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float32),
+        np.array([[7, 8], [9, 10], [11, 12]], dtype=np.float32),
+        expected=np.array([[7, 8], [9, 10], [11, 12]], dtype=np.float32))
+
     self._testTernary(
         array_ops.where,
         np.array([0, 1, 1, 0], dtype=np.bool),
diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py
index f0b80d1ffdb..51d8786ce3d 100644
--- a/tensorflow/compiler/tests/unary_ops_test.py
+++ b/tensorflow/compiler/tests/unary_ops_test.py
@@ -159,6 +159,13 @@ class UnaryOpsTest(XLATestCase):
         np.array([[1e-14, 1e-15, 0.6]], dtype=dtype),
         expected=np.log1p(np.array([[1e-14, 1e-15, 0.6]], dtype=dtype)))
 
+    self._assertOpOutputMatchesExpected(
+        math_ops.round,
+        np.array([[-1.7, 1.2, 4.0, 0.0], [-3.5, -2.5, -1.5, -0.5],
+                  [0.5, 1.5, 2.5, 3.5]], dtype=dtype),
+        expected=np.array([[-2, 1, 4, 0], [-4, -2, -2, 0], [0, 2, 2, 4]],
+                          dtype=dtype))
+
     self._assertOpOutputMatchesExpected(
         math_ops.rsqrt,
         np.array([[4, 16]], dtype=dtype),
@@ -175,6 +182,11 @@ class UnaryOpsTest(XLATestCase):
                       [0.7310586, 0.880797, 0.95257413, 0.98201376]],
                      dtype=dtype))
 
+    self._assertOpOutputMatchesExpected(
+        math_ops.sigmoid,
+        np.array([-300, -150, 0, 150, 300], dtype=dtype),
+        expected=np.array([0, 0, 0.5, 1, 1], dtype=dtype))
+
     self._assertOpOutputMatchesExpected(
         math_ops.sqrt,
         np.array([[4, 9]], dtype=dtype),
@@ -202,6 +214,11 @@ class UnaryOpsTest(XLATestCase):
                   [-3.4401896, -2.4401896, -1.4401897, -0.44018969]],
                  dtype=dtype))
 
+    self._assertOpOutputMatchesExpected(
+        nn_ops.elu,
+        np.array([[-1, 0, 1]], dtype=dtype),
+        expected=np.array([[-0.63212056, 0, 1]], dtype=dtype))
+
     self._assertOpOutputMatchesExpected(
         nn_ops.relu,
         np.array([[-1, 1]], dtype=dtype),
@@ -250,6 +267,11 @@ class UnaryOpsTest(XLATestCase):
         np.array([[4, 3], [2, 1]], dtype=dtype),
         expected=np.array([[0, 0], [0, 0]], dtype=dtype))
 
+    self._assertOpOutputMatchesExpected(
+        array_ops.ones_like,
+        np.array([[4, 3], [2, 1]], dtype=dtype),
+        expected=np.array([[1, 1], [1, 1]], dtype=dtype))
+
   def testLogicalOps(self):
     self._assertOpOutputMatchesExpected(
         math_ops.logical_not,
diff --git a/tensorflow/compiler/tests/variable_ops_test.py
b/tensorflow/compiler/tests/variable_ops_test.py new file mode 100644 index 00000000000..70dacd9de4b --- /dev/null +++ b/tensorflow/compiler/tests/variable_ops_test.py @@ -0,0 +1,183 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for reading and writing variables.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import init_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.ops import state_ops +from tensorflow.python.ops import variable_scope +from tensorflow.python.ops import variables +from tensorflow.python.platform import googletest +from tensorflow.python.training.gradient_descent import GradientDescentOptimizer + + +class VariableOpsTest(XLATestCase): + """Test cases for resource variable operators.""" + + def testOneWriteOneOutput(self): + # Regression test for a bug where computations with one non-constant + # output and one variable update were mishandled. 
+ for dtype in self.numeric_types: + init = np.array([[1, 2], [3, 4]], dtype=dtype) + with self.test_session() as sess, self.test_scope(): + v = resource_variable_ops.ResourceVariable(init) + sess.run(variables.variables_initializer([v])) + p = array_ops.placeholder(dtype) + x = v.assign_add(p) + with ops.control_dependencies([x]): + y = v.read_value() + self.assertAllClose(np.array([[2, 3], [4, 5]], dtype=dtype), + sess.run(y, {p: 1})) + + def testReadWrite(self): + """Tests initialization, reading, and writing a resource variable.""" + with self.test_session() as session: + with self.test_scope(): + with variable_scope.variable_scope("ascope", use_resource=True): + x = variable_scope.get_variable( + "x", + shape=[], + dtype=dtypes.float32, + initializer=init_ops.constant_initializer(2)) + a = x.read_value() + with ops.control_dependencies([a]): + b = state_ops.assign(x, 47) + with ops.control_dependencies([b]): + c = x.read_value() + with ops.control_dependencies([c]): + d = state_ops.assign_add(x, 3) + with ops.control_dependencies([d]): + e = x.read_value() + + session.run(variables.global_variables_initializer()) + v1, v2, v3 = session.run([a, c, e]) + self.assertAllClose(2.0, v1) + self.assertAllClose(47.0, v2) + self.assertAllClose(50.0, v3) + + def testTraining(self): + """Tests a gradient descent step for a simple model.""" + with self.test_session() as session: + with self.test_scope(): + with variable_scope.variable_scope("ascope", use_resource=True): + w = variable_scope.get_variable( + "w", + shape=[4, 2], + dtype=dtypes.float32, + initializer=init_ops.constant_initializer( + np.array([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=np.float32))) + b = variable_scope.get_variable( + "b", + shape=[2], + dtype=dtypes.float32, + initializer=init_ops.constant_initializer( + np.array([2, 3], dtype=np.float32))) + + x = array_ops.placeholder(dtypes.float32, shape=[1, 4]) + y = math_ops.matmul(x, w) + b + loss = math_ops.reduce_sum(y) + optimizer = GradientDescentOptimizer(0.1) + train = optimizer.minimize(loss) + + session.run(variables.global_variables_initializer()) + session.run(train, {x: np.array([[7, 3, 5, 9]], dtype=np.float32)}) + vw, vb = session.run([w, b]) + self.assertAllClose( + np.array( + [[0.3, 1.3], [2.7, 3.7], [4.5, 5.5], [6.1, 7.1]], + dtype=np.float32), + vw, + rtol=1e-4) + self.assertAllClose(np.array([1.9, 2.9], dtype=np.float32), vb, rtol=1e-4) + + +class StridedSliceAssignChecker(object): + """Compares the results of a slice assignment using Tensorflow and numpy.""" + + def __init__(self, test, x, dtype): + self.dtype = dtype + self.test = test + self.x_np = np.array(x).astype(dtype) + + def __setitem__(self, index, value): + value = np.array(value).astype(self.dtype) + + with self.test.test_session() as sess, self.test.test_scope(): + x = constant_op.constant(self.x_np, dtype=self.dtype) + var = resource_variable_ops.ResourceVariable(x) + sess.run(variables.variables_initializer([var])) + val = sess.run(var[index].assign(value)) + # val_copy is used to check that tf.assign works equivalently to the + # assign method above. 
+ val_copy = sess.run(state_ops.assign(var[index], value)) + valnp = np.copy(self.x_np) + valnp[index] = np.array(value) + self.test.assertAllEqual(val, valnp) + self.test.assertAllEqual(val_copy, valnp) + + +class SliceAssignTest(XLATestCase): + + def testSliceAssign(self): + for dtype in self.numeric_types: + checker = StridedSliceAssignChecker(self, [[1, 2, 3], [4, 5, 6]], + dtype=dtype) + # No-op assignment + checker[:] = [[10, 20, 30], [40, 50, 60]] + # Checks trivial (1,1) shape tensor + checker[1:2, 1:2] = [[66]] + # shrink shape changes + checker[1:2, 1] = [66] + checker[1, 1:2] = [66] + checker[1, 1] = 66 + # newaxis shape changes + checker[:, None, :] = [[[10, 20, 30]], [[40, 50, 50]]] + # shrink and newaxis + checker[None, None, 0, 0:1] = [[[99]]] + # Non unit strides + checker[::1, 1::-1] = [[3, 33], [4, 44]] + # degenerate interval + checker[8:10, 0] = [] + checker[8:10, 8:10] = [[]] + + # Assign vector to scalar (rank-0) using newaxis + checker2 = StridedSliceAssignChecker(self, 222, dtype=dtype) + checker2[()] = 6 # no indices + checker2[...] = 6 # ellipsis + checker2[None] = [6] # new axis + + def testUninitialized(self): + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "uninitialized variable"): + with self.test_session() as sess, self.test_scope(): + v = resource_variable_ops.ResourceVariable([1, 2]) + sess.run(v[:].assign([1, 2])) + + +if __name__ == "__main__": + googletest.main() diff --git a/tensorflow/compiler/tests/xla_device_test.py b/tensorflow/compiler/tests/xla_device_test.py index 1388a892ba5..f5c228f8305 100644 --- a/tensorflow/compiler/tests/xla_device_test.py +++ b/tensorflow/compiler/tests/xla_device_test.py @@ -18,15 +18,10 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import numpy as np - from tensorflow.python.client import session as session_lib -from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops -from tensorflow.python.ops import control_flow_ops -from tensorflow.python.ops import math_ops from tensorflow.python.platform import test @@ -48,34 +43,6 @@ class XlaDeviceTest(test.TestCase): result = sess.run(w, {x: [1.5, 0.5]}) self.assertAllClose(result, [12., 2.], rtol=1e-3) - def testLoops(self): - """Tests that loops work on XLA devices.""" - - with session_lib.Session() as session: - x = array_ops.placeholder(dtypes.float32) - with ops.device("device:XLA_CPU:0"): - c = lambda i, _: math_ops.less(i, 5) - b = lambda i, x: (i + 1, x * 2.0 + 1.0) - _, y = control_flow_ops.while_loop(c, b, (constant_op.constant(0), x)) - - result = session.run(y, {x: np.float32(2)}) - self.assertAllClose(result, np.float32(95), rtol=1e-3) - - def testCond(self): - """Tests that tf.cond works on XLA devices.""" - - with session_lib.Session() as session: - x = array_ops.placeholder(dtypes.float32) - y = array_ops.placeholder(dtypes.float32) - c = array_ops.placeholder(dtypes.bool) - with ops.device("device:XLA_CPU:0"): - z = x + 1.0 - w = control_flow_ops.cond(c, lambda: z, lambda: y) - t = math_ops.add(z, w) - - result = session.run(t, {x: np.float32(2), y: np.float32(4), c: True}) - self.assertAllClose(result, np.float32(6), rtol=1e-3) - if __name__ == "__main__": test.main() diff --git a/tensorflow/compiler/tests/xla_test.py b/tensorflow/compiler/tests/xla_test.py index b72e7c9713d..79549644ea0 100644 --- a/tensorflow/compiler/tests/xla_test.py +++ 
b/tensorflow/compiler/tests/xla_test.py @@ -19,14 +19,18 @@ from __future__ import division from __future__ import print_function import contextlib +import random import re +import numpy as np + from tensorflow.contrib.compiler import jit from tensorflow.core.framework import types_pb2 from tensorflow.core.protobuf import config_pb2 from tensorflow.python.client import session from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops +from tensorflow.python.framework import random_seed from tensorflow.python.ops import array_ops from tensorflow.python.ops import variables from tensorflow.python.platform import flags @@ -50,16 +54,20 @@ class XLATestCase(test.TestCase): self.device = FLAGS.test_device self.has_custom_call = (self.device == 'XLA_CPU') self.all_tf_types = [ - dtypes.DType(types_pb2.DataType.Value(name)) + dtypes.as_dtype(types_pb2.DataType.Value(name)) for name in FLAGS.types.split(',') ] + self.int_tf_types = [ + dtype for dtype in self.all_tf_types if dtype.is_integer + ] + self.float_tf_types = [ + dtype for dtype in self.all_tf_types if dtype.is_floating + ] + self.numeric_tf_types = self.int_tf_types + self.float_tf_types + self.all_types = [dtype.as_numpy_dtype for dtype in self.all_tf_types] - self.int_types = [ - dtype.as_numpy_dtype for dtype in self.all_tf_types if dtype.is_integer - ] - self.float_types = [ - dtype.as_numpy_dtype for dtype in self.all_tf_types if dtype.is_floating - ] + self.int_types = [dtype.as_numpy_dtype for dtype in self.int_tf_types] + self.float_types = [dtype.as_numpy_dtype for dtype in self.float_tf_types] self.numeric_types = self.int_types + self.float_types # Parse the manifest file, if any, into a regex identifying tests to @@ -81,6 +89,9 @@ class XLATestCase(test.TestCase): return logging.info('Start test case: %s', name) + random.seed(random_seed.DEFAULT_GRAPH_SEED) + np.random.seed(random_seed.DEFAULT_GRAPH_SEED) + def tearDown(self): logging.info('End test case: %s', self._testMethodName) @@ -112,7 +123,11 @@ class XLATestCase(test.TestCase): yield -def Benchmark(tf_bench, builder_fn, use_xla_jit, device): +def Benchmark(tf_bench, + builder_fn, + use_xla_jit, + device, + separate_compiled_gradients=False): """Build a graph and run benchmarks against it, with or without XLA. Args: @@ -122,6 +137,14 @@ def Benchmark(tf_bench, builder_fn, use_xla_jit, device): is a list of tensors to fetch as output. use_xla_jit: If true compile with the XLA JIT, otherwise use regular TF. device: The tensorflow device to run on, e.g. "cpu", "gpu". + separate_compiled_gradients: If true put each gradient subgraph into a + separate compilation scope. This gives fine-grained control over which + portions of the graph will be compiled as a single unit. Compiling + gradients separately may yield better performance for some graphs. + The scope is named based on the scope of the forward computation as well + as the name of the gradients. As a result, the gradients will be compiled + in a scope that is separate from both the forward computation, and from + other gradients. 
""" with ops.Graph().as_default(): @@ -130,7 +153,9 @@ def Benchmark(tf_bench, builder_fn, use_xla_jit, device): with ops.device(device): fetches = [] jit_scope = jit.experimental_jit_scope - with jit_scope(compile_ops=use_xla_jit): + with jit_scope( + compile_ops=use_xla_jit, + separate_compiled_gradients=separate_compiled_gradients): name, fetches = builder_fn() # We only want to benchmark the operations themselves, and not the data diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index 08a03b8d357..93c484ca7a0 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -23,12 +23,12 @@ package( cc_library( name = "xla_compiler", srcs = [ - "op_registrations.cc", "xla_compilation_device.cc", "xla_compiler.cc", "xla_context.cc", "xla_helpers.cc", "xla_op_kernel.cc", + "xla_op_registry.cc", ], hdrs = [ "xla_compilation_device.h", @@ -36,18 +36,21 @@ cc_library( "xla_context.h", "xla_helpers.h", "xla_op_kernel.h", + "xla_op_registry.h", ], + visibility = [":friends"], deps = [ ":common", ":dump_graph", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla/client", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:client_library", + "//tensorflow/compiler/xla/client:computation", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/service:cpu_plugin", "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", @@ -89,6 +92,7 @@ cc_library( cc_test( name = "xla_compiler_test", + size = "small", srcs = ["xla_compiler_test.cc"], deps = [ ":xla_compiler", @@ -110,6 +114,7 @@ cc_test( cc_test( name = "str_util_test", + size = "small", srcs = [ "str_util_test.cc", ], @@ -123,6 +128,7 @@ cc_test( cc_test( name = "literal_util_test", + size = "small", srcs = [ "literal_util_test.cc", ], diff --git a/tensorflow/compiler/tf2xla/const_analysis.cc b/tensorflow/compiler/tf2xla/const_analysis.cc index e072ef7be7e..36a6c90af4f 100644 --- a/tensorflow/compiler/tf2xla/const_analysis.cc +++ b/tensorflow/compiler/tf2xla/const_analysis.cc @@ -35,6 +35,9 @@ Status BackwardsConstAnalysis(const Graph& g, {"Any", "reduction_indices"}, {"ArgMax", "dimension"}, {"AvgPoolGrad", "orig_input_shape"}, + {"BatchToSpace", "crops"}, + {"BatchToSpaceND", "block_shape"}, + {"BatchToSpaceND", "crops"}, {"BroadcastGradientArgs", "s0"}, {"BroadcastGradientArgs", "s1"}, {"Concat", "concat_dim"}, @@ -43,6 +46,8 @@ Status BackwardsConstAnalysis(const Graph& g, {"ConcatOffset", "shape"}, {"Conv2DBackpropFilter", "filter_sizes"}, {"Conv2DBackpropInput", "input_sizes"}, + {"Conv3DBackpropFilterV2", "filter_sizes"}, + {"Conv3DBackpropInputV2", "input_sizes"}, {"DynamicStitch", "indices"}, {"ExpandDims", "dim"}, {"Fill", "dims"}, @@ -53,6 +58,7 @@ Status BackwardsConstAnalysis(const Graph& g, {"Max", "reduction_indices"}, {"Mean", "reduction_indices"}, {"Min", "reduction_indices"}, + {"OneHot", "depth"}, {"Pad", "paddings"}, {"Prod", "reduction_indices"}, {"RandomStandardNormal", "shape"}, @@ -62,8 +68,16 @@ Status BackwardsConstAnalysis(const Graph& g, {"Range", "limit"}, {"Range", "delta"}, {"Reshape", "shape"}, + {"ResourceStridedSliceAssign", "begin"}, + {"ResourceStridedSliceAssign", "end"}, + {"ResourceStridedSliceAssign", "strides"}, + {"Reverse", "dims"}, + {"ReverseV2", "axis"}, {"Slice", 
"begin"}, {"Slice", "size"}, + {"SpaceToBatch", "paddings"}, + {"SpaceToBatchND", "block_shape"}, + {"SpaceToBatchND", "paddings"}, {"Split", "split_dim"}, {"SplitV", "split_dim"}, {"SplitV", "size_splits"}, @@ -75,6 +89,8 @@ Status BackwardsConstAnalysis(const Graph& g, {"StridedSliceGrad", "end"}, {"StridedSliceGrad", "strides"}, {"Sum", "reduction_indices"}, + {"TensorArrayV3", "size"}, + {"TensorArraySplitV3", "lengths"}, {"Tile", "multiples"}, {"Transpose", "perm"}}; @@ -97,7 +113,7 @@ Status BackwardsConstAnalysis(const Graph& g, if (must_be_const.find(node) != must_be_const.end()) { if (node->type_string() == "_Arg") { int index; - status = GetNodeAttr(node->def(), "index", &index); + status = GetNodeAttr(node->attrs(), "index", &index); if (!status.ok()) return; compile_time_const_args->at(index) = true; return; @@ -113,8 +129,8 @@ Status BackwardsConstAnalysis(const Graph& g, if (range.first == range.second) return; NameRangeMap input_name_ranges; - status = NameRangesForNode(node->def(), node->op_def(), &input_name_ranges, - nullptr); + status = + NameRangesForNode(*node, node->op_def(), &input_name_ranges, nullptr); if (!status.ok()) return; for (auto it = range.first; it != range.second; ++it) { diff --git a/tensorflow/compiler/tf2xla/dump_graph.cc b/tensorflow/compiler/tf2xla/dump_graph.cc index 5aa6f806ac6..af5753c2600 100644 --- a/tensorflow/compiler/tf2xla/dump_graph.cc +++ b/tensorflow/compiler/tf2xla/dump_graph.cc @@ -33,8 +33,16 @@ struct NameCounts { std::unordered_map counts; }; -string MakeUniquePath(const string& name) { +string MakeUniquePath(string name) { static NameCounts& instance = *new NameCounts; + + // Remove illegal characters from `name`. + for (int i = 0; i < name.size(); ++i) { + if (name[i] == '/') { + name[i] = '_'; + } + } + int count; { mutex_lock lock(instance.counts_mutex); diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD index d913f898e94..a434c746809 100644 --- a/tensorflow/compiler/tf2xla/kernels/BUILD +++ b/tensorflow/compiler/tf2xla/kernels/BUILD @@ -14,18 +14,21 @@ tf_kernel_library( name = "xla_ops", srcs = [ "aggregate_ops.cc", + "arg_op.cc", "batch_matmul_op.cc", + "batchtospace_op.cc", "bcast_ops.cc", "bias_ops.cc", "binary_ops.cc", "cast_op.cc", "concat_op.cc", + "const_op.cc", "conv_ops.cc", "cwise_ops.cc", - "declaration_op.cc", "depthwise_conv_ops.cc", "diag_op.cc", "dynamic_stitch_op.cc", + "elu_op.cc", "fill_op.cc", "function_ops.cc", "identity_op.cc", @@ -33,6 +36,7 @@ tf_kernel_library( "lrn_ops.cc", "matmul_op.cc", "no_op.cc", + "one_hot_op.cc", "pack_op.cc", "pad_op.cc", "pooling_ops.cc", @@ -42,17 +46,22 @@ tf_kernel_library( "relu_op.cc", "reshape_op.cc", "retval_op.cc", + "reverse_op.cc", "select_op.cc", "sequence_ops.cc", "shape_op.cc", "slice_op.cc", "softmax_op.cc", + "spacetobatch_op.cc", "split_op.cc", "strided_slice_op.cc", + "tensor_array_ops.cc", "tile_ops.cc", + "training_ops.cc", "transpose_op.cc", "unary_ops.cc", "unpack_op.cc", + "variable_ops.cc", ], hdrs = [ "cwise_ops.h", diff --git a/tensorflow/compiler/tf2xla/kernels/aggregate_ops.cc b/tensorflow/compiler/tf2xla/kernels/aggregate_ops.cc index 8f284c30174..5c9f66df101 100644 --- a/tensorflow/compiler/tf2xla/kernels/aggregate_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/aggregate_ops.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" namespace tensorflow { namespace { @@ -41,7 +41,7 @@ class AddNOp : public XlaOpKernel { TF_DISALLOW_COPY_AND_ASSIGN(AddNOp); }; -REGISTER_XLA_OP("AddN", AddNOp); +REGISTER_XLA_OP(Name("AddN"), AddNOp); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/arg_op.cc b/tensorflow/compiler/tf2xla/kernels/arg_op.cc new file mode 100644 index 00000000000..620fc844378 --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/arg_op.cc @@ -0,0 +1,78 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/type_util.h" +#include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/core/framework/kernel_def_builder.h" + +namespace tensorflow { +namespace { + +// This OpKernel implements the _Arg Op for XLA JIT devices. It +// associates its output with one of the arguments to a +// subcomputation. +class ArgOp : public XlaOpKernel { + public: + explicit ArgOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("index", &index_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + // If 'frame' is non-null, this is a function call inside an outer JIT + // compilation. Use the usual implementation of _Arg. + auto frame = ctx->call_frame(); + if (frame != nullptr) { + Tensor val; + OP_REQUIRES_OK(ctx, frame->GetArg(index_, &val)); + OP_REQUIRES(ctx, val.dtype() == dtype_, + errors::InvalidArgument( + "Type mismatch: actual ", DataTypeString(val.dtype()), + " vs. expect ", DataTypeString(dtype_))); + // Forwards the argument from the frame. + ctx->op_kernel_context()->set_output(0, val); + return; + } + + XlaContext& xc = XlaContext::Get(ctx); + const XlaContext::Argument& arg = xc.args()[index_]; + if (arg.is_variable) { + // TODO(phawkins): this code assumes that variables do not alias. 
+ XlaVariable* var; + OP_REQUIRES_OK(ctx, xc.CreateVariable(index_, arg.name, arg.value.type, + arg.value.handle, &var)); + var->tensor_array_size = arg.tensor_array_size; + ctx->SetVariableOutput(0, var); + } else if (arg.value.is_constant) { + ctx->SetConstantOutput(0, arg.value.constant_value); + } else { + ctx->SetOutput(0, arg.value.handle); + } + } + + private: + int index_; + DataType dtype_; + + TF_DISALLOW_COPY_AND_ASSIGN(ArgOp); +}; + +REGISTER_XLA_OP(Name("_Arg").AllowResourceTypes(), ArgOp); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc b/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc index 637360d149e..16b778bca43 100644 --- a/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc @@ -18,8 +18,8 @@ limitations under the License. // dimension. // TODO(dominikg,phawkins): Use a real batched matmul instead of unrolling. -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" namespace tensorflow { namespace { @@ -94,12 +94,14 @@ class BatchMatMulOp : public XlaOpKernel { // Slice off individual matrices and reshape to 2D tensors. auto x_slice = builder->Slice( x_flat, {i, 0, 0}, - {i + 1, x_shape.dim_size(ndims - 2), x_shape.dim_size(ndims - 1)}); + {i + 1, x_shape.dim_size(ndims - 2), x_shape.dim_size(ndims - 1)}, + {1, 1, 1}); x_slice = builder->Reshape( x_slice, {x_shape.dim_size(ndims - 2), x_shape.dim_size(ndims - 1)}); auto y_slice = builder->Slice( y_flat, {i, 0, 0}, - {i + 1, y_shape.dim_size(ndims - 2), y_shape.dim_size(ndims - 1)}); + {i + 1, y_shape.dim_size(ndims - 2), y_shape.dim_size(ndims - 1)}, + {1, 1, 1}); y_slice = builder->Reshape( y_slice, {y_shape.dim_size(ndims - 2), y_shape.dim_size(ndims - 1)}); @@ -135,7 +137,7 @@ class BatchMatMulOp : public XlaOpKernel { bool adj_y_; }; -REGISTER_XLA_OP("BatchMatMul", BatchMatMulOp); +REGISTER_XLA_OP(Name("BatchMatMul"), BatchMatMulOp); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc b/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc new file mode 100644 index 00000000000..8642cbf2a92 --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc @@ -0,0 +1,187 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+
+namespace tensorflow {
+namespace {
+
+void BatchToSpace(XlaOpKernelContext* ctx,
+                  const xla::ComputationDataHandle& input, DataType input_dtype,
+                  const TensorShape& input_tensor_shape,
+                  gtl::ArraySlice<int64> block_shape,
+                  const xla::Literal& crops) {
+  const int input_rank = input_tensor_shape.dims();
+  const gtl::InlinedVector<int64, 4> input_shape =
+      input_tensor_shape.dim_sizes();
+  const int block_rank = block_shape.size();
+
+  OP_REQUIRES(
+      ctx, input_rank >= 1 + block_rank,
+      errors::InvalidArgument("input rank should be >= ", 1 + block_rank,
+                              " instead of ", input_rank));
+  gtl::ArraySlice<int64> remainder_shape(input_shape);
+  remainder_shape.remove_prefix(1 + block_rank);
+
+  OP_REQUIRES(
+      ctx,
+      xla::ShapeUtil::Rank(crops.shape()) == 2 &&
+          block_rank == xla::ShapeUtil::GetDimension(crops.shape(), 0) &&
+          2 == xla::ShapeUtil::GetDimension(crops.shape(), 1),
+      errors::InvalidArgument("crops should have shape [", block_rank,
+                              ", 2] instead of ",
+                              xla::ShapeUtil::HumanString(crops.shape())));
+
+  xla::ComputationBuilder* b = ctx->builder();
+  const int64 batch_size = input_shape[0];
+
+  // Compute the product of the block_shape values.
+  int64 block_num_elems = 1;
+  for (int i = 0; i < block_rank; ++i) {
+    block_num_elems *= block_shape[i];
+  }
+  OP_REQUIRES(ctx, block_num_elems > 0,
+              errors::InvalidArgument(
+                  "The product of the block dimensions must be positive"));
+
+  // 1. Reshape `input` to `reshaped` of shape:
+  //      [block_shape[0], ..., block_shape[M-1],
+  //       batch / prod(block_shape),
+  //       input_shape[1], ..., input_shape[N-1]]
+
+  OP_REQUIRES(
+      ctx, batch_size % block_num_elems == 0,
+      errors::InvalidArgument("Input batch dimension (", batch_size,
+                              ") is not divisible by product of block sizes (",
+                              block_num_elems, ")"));
+  std::vector<int64> reshaped_shape(input_rank + block_rank);
+  std::copy(block_shape.begin(), block_shape.end(), reshaped_shape.begin());
+  reshaped_shape[block_rank] = batch_size / block_num_elems;
+  std::copy(input_shape.begin() + 1, input_shape.end(),
+            reshaped_shape.begin() + block_rank + 1);
+  xla::ComputationDataHandle reshaped = b->Reshape(input, reshaped_shape);
+
+  // 2. Permute dimensions of `reshaped` to produce `permuted` of shape
+  //      [batch / prod(block_shape),
+  //
+  //       input_shape[1], block_shape[0],
+  //       ...,
+  //       input_shape[M], block_shape[M-1],
+  //
+  //       input_shape[M+1], ..., input_shape[N-1]]
+  std::vector<int64> permutation(reshaped_shape.size());
+  permutation[0] = block_rank;
+  for (int i = 0; i < block_rank; ++i) {
+    permutation[1 + 2 * i] = block_rank + 1 + i;
+    permutation[1 + 2 * i + 1] = i;
+  }
+  std::iota(permutation.begin() + 1 + block_rank * 2, permutation.end(),
+            1 + block_rank * 2);
+  xla::ComputationDataHandle permuted = b->Transpose(reshaped, permutation);
+
+  // 3. Reshape `permuted` to produce `reshaped_permuted` of shape
+  //      [batch / prod(block_shape),
+  //
+  //       input_shape[1] * block_shape[0],
+  //       ...,
+  //       input_shape[M] * block_shape[M-1],
+  //
+  //       input_shape[M+1],
+  //       ...,
+  //       input_shape[N-1]]
+  std::vector<int64> reshaped_permuted_shape(input_rank);
+  reshaped_permuted_shape[0] = batch_size / block_num_elems;
+  for (int i = 0; i < block_rank; ++i) {
+    reshaped_permuted_shape[1 + i] = block_shape[i] * input_shape[1 + i];
+  }
+  std::copy(remainder_shape.begin(), remainder_shape.end(),
+            reshaped_permuted_shape.begin() + 1 + block_rank);
+
+  xla::ComputationDataHandle reshaped_permuted =
+      b->Reshape(permuted, reshaped_permuted_shape);
+
+  // 4. Crop the start and end of dimensions `[1, ..., M]` of
+  //    `reshaped_permuted` according to `crops` to produce the output of
+  //    shape:
+  //      [batch / prod(block_shape),
+  //
+  //       input_shape[1] * block_shape[0] - crops[0,0] - crops[0,1],
+  //       ...,
+  //       input_shape[M] * block_shape[M-1] - crops[M-1,0] - crops[M-1,1],
+  //
+  //       input_shape[M+1], ..., input_shape[N-1]]
+  std::vector<int64> start_indices(input_rank, 0);
+  std::vector<int64> end_indices = reshaped_permuted_shape;
+  std::vector<int64> strides(input_rank, 1);
+  for (int i = 0; i < block_rank; ++i) {
+    int64 crop_start = xla::LiteralUtil::Get<int64>(crops, {i, 0});
+    int64 crop_end = xla::LiteralUtil::Get<int64>(crops, {i, 1});
+    OP_REQUIRES(ctx, crop_start >= 0 && crop_end >= 0,
+                errors::InvalidArgument("Crops must be non-negative"));
+    start_indices[1 + i] = crop_start;
+    end_indices[1 + i] -= crop_end;
+    OP_REQUIRES(
+        ctx, start_indices[1 + i] <= end_indices[1 + i],
+        errors::InvalidArgument(
+            "Cropped size must be non-negative: start: ", crop_start,
+            " end: ", crop_end, " size ", reshaped_permuted_shape[1 + i]));
+  }
+  xla::ComputationDataHandle output =
+      b->Slice(reshaped_permuted, start_indices, end_indices, strides);
+  ctx->SetOutput(0, output);
+}
+
+class BatchToSpaceNDOp : public XlaOpKernel {
+ public:
+  explicit BatchToSpaceNDOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    std::vector<int64> block_shape;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1, &block_shape));
+
+    xla::Literal crops;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsInt64Literal(2, &crops));
+
+    BatchToSpace(ctx, ctx->Input(0), input_type(0), ctx->InputShape(0),
+                 block_shape, crops);
+  }
+};
+REGISTER_XLA_OP(Name("BatchToSpaceND"), BatchToSpaceNDOp);
+
+class BatchToSpaceOp : public XlaOpKernel {
+ public:
+  explicit BatchToSpaceOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("block_size", &block_size_));
+    OP_REQUIRES(
+        ctx, block_size_ > 1,
+        errors::InvalidArgument("Block size should be > 1: ", block_size_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::Literal crops;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsInt64Literal(1, &crops));
+
+    BatchToSpace(ctx, ctx->Input(0), input_type(0), ctx->InputShape(0),
+                 {block_size_, block_size_}, crops);
+  }
+
+ private:
+  int block_size_;
+};
+REGISTER_XLA_OP(Name("BatchToSpace"), BatchToSpaceOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc b/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc
index f35835df087..b0fee5e4bca 100644
--- a/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc
@@ -16,9 +16,9 @@ limitations under the License.
 // XLA-specific Ops for broadcasting used in gradient
 // code.
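The four numbered steps in `BatchToSpace` in `batchtospace_op.cc` above map one-to-one onto NumPy reshape/transpose/slice operations. The following sketch mirrors them for intuition (illustrative only, not the XLA implementation; names are ad hoc):

```python
import numpy as np

def batch_to_space_nd(x, block_shape, crops):
    """Mirrors steps 1-4 of BatchToSpace in batchtospace_op.cc."""
    m = len(block_shape)
    prod_block = int(np.prod(block_shape))
    # 1. Split the batch dimension into the block dimensions.
    reshaped = x.reshape(
        list(block_shape) + [x.shape[0] // prod_block] + list(x.shape[1:]))
    # 2. Interleave each block dimension after its spatial dimension.
    perm = [m]
    for i in range(m):
        perm += [m + 1 + i, i]
    perm += list(range(2 * m + 1, reshaped.ndim))
    permuted = reshaped.transpose(perm)
    # 3. Merge each (spatial, block) pair into a single dimension.
    merged = permuted.reshape(
        [x.shape[0] // prod_block]
        + [x.shape[1 + i] * block_shape[i] for i in range(m)]
        + list(x.shape[1 + m:]))
    # 4. Crop the start and end of each upscaled spatial dimension.
    index = [slice(None)] + [
        slice(crops[i][0], merged.shape[1 + i] - crops[i][1]) for i in range(m)
    ]
    return merged[tuple(index)]
```

For example, `batch_to_space_nd(np.arange(4.0).reshape(4, 1, 1, 1), [2, 2], [[0, 0], [0, 0]])` reassembles the four batch entries into a single 2x2 spatial grid of shape `[1, 2, 2, 1]`.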
-#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" @@ -81,7 +81,7 @@ class BCastGradArgsOp : public XlaOpKernel { TF_DISALLOW_COPY_AND_ASSIGN(BCastGradArgsOp); }; -REGISTER_XLA_OP("BroadcastGradientArgs", BCastGradArgsOp); +REGISTER_XLA_OP(Name("BroadcastGradientArgs"), BCastGradArgsOp); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/bias_ops.cc b/tensorflow/compiler/tf2xla/kernels/bias_ops.cc index 217e82304e3..c667b4e3e32 100644 --- a/tensorflow/compiler/tf2xla/kernels/bias_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/bias_ops.cc @@ -15,9 +15,9 @@ limitations under the License. #include -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/util/tensor_format.h" @@ -69,8 +69,8 @@ class BiasOp : public XlaOpKernel { TensorFormat data_format_; }; -REGISTER_XLA_OP("BiasAdd", BiasOp); -REGISTER_XLA_OP("BiasAddV1", BiasOp); +REGISTER_XLA_OP(Name("BiasAdd"), BiasOp); +REGISTER_XLA_OP(Name("BiasAddV1"), BiasOp); class BiasAddGradOp : public XlaOpKernel { public: @@ -113,7 +113,7 @@ class BiasAddGradOp : public XlaOpKernel { TensorFormat data_format_; }; -REGISTER_XLA_OP("BiasAddGrad", BiasAddGradOp); +REGISTER_XLA_OP(Name("BiasAddGrad"), BiasAddGradOp); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc index 6f117ebe616..ded20a9a3ce 100644 --- a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc @@ -16,8 +16,8 @@ limitations under the License. // Native XLA implementations of simple unary Ops #include "tensorflow/compiler/tf2xla/kernels/cwise_ops.h" -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/core/framework/kernel_def_builder.h" @@ -28,10 +28,10 @@ namespace { // A subclass of a XlaBinaryOp must build the computation that // describes the (tensor,tensor)->tensor function to apply to each element of // the input. 
-#define XLA_MAKE_BINARY(Name, HLO) \ - class Name##Op : public XlaBinaryOp { \ +#define XLA_MAKE_BINARY(NAME, HLO) \ + class NAME##Op : public XlaBinaryOp { \ public: \ - explicit Name##Op(OpKernelConstruction* ctx) : XlaBinaryOp(ctx) {} \ + explicit NAME##Op(OpKernelConstruction* ctx) : XlaBinaryOp(ctx) {} \ xla::ComputationDataHandle Computation( \ XlaOpKernelContext* ctx, const xla::ComputationDataHandle& lhs, \ const gtl::ArraySlice& lhs_shape, \ @@ -43,7 +43,7 @@ namespace { return HLO; \ } \ }; \ - REGISTER_XLA_OP(#Name, Name##Op) + REGISTER_XLA_OP(Name(#NAME), NAME##Op) XLA_MAKE_BINARY(Add, b->Add(lhs, rhs, extend_dimensions)); XLA_MAKE_BINARY(Sub, b->Sub(lhs, rhs, extend_dimensions)); @@ -127,32 +127,21 @@ XLA_MAKE_BINARY(GreaterEqual, b->Ge(lhs, rhs, extend_dimensions)); XLA_MAKE_BINARY(Less, b->Lt(lhs, rhs, extend_dimensions)); XLA_MAKE_BINARY(LessEqual, b->Le(lhs, rhs, extend_dimensions)); +// Non-linear ops +XLA_MAKE_BINARY(SigmoidGrad, + b->Mul(b->Mul(rhs, lhs), + b->Sub(XlaHelpers::One(b, input_type(0)), lhs))); + +XLA_MAKE_BINARY(SoftplusGrad, + b->Div(lhs, b->Add(b->Exp(b->Neg(rhs)), + XlaHelpers::One(b, input_type(1))))); + +XLA_MAKE_BINARY(TanhGrad, b->Mul(rhs, b->Sub(XlaHelpers::One(b, input_type(0)), + b->Mul(lhs, lhs)))); + +XLA_MAKE_BINARY(Pow, b->Pow(lhs, rhs, extend_dimensions)); + #undef XLA_MAKE_BINARY -#define XLA_MAKE_BINARY_MAP(Name, HLO) \ - class Name##Op : public XlaBinaryMapOp { \ - public: \ - explicit Name##Op(OpKernelConstruction* ctx) : XlaBinaryMapOp(ctx) {} \ - void BuildMapLambda(xla::ComputationBuilder* b, \ - const xla::ComputationDataHandle& lhs, \ - const xla::ComputationDataHandle& rhs) override { \ - HLO; \ - } \ - }; \ - REGISTER_XLA_OP(#Name, Name##Op) - -XLA_MAKE_BINARY_MAP(Pow, b->Pow(lhs, rhs)); -XLA_MAKE_BINARY_MAP(SigmoidGrad, - b->Mul(b->Mul(rhs, lhs), - b->Sub(XlaHelpers::One(b, input_type(0)), lhs))); -XLA_MAKE_BINARY_MAP(SoftplusGrad, - b->Div(lhs, b->Add(b->Exp(b->Neg(rhs)), - XlaHelpers::One(b, input_type(1))))); -XLA_MAKE_BINARY_MAP(TanhGrad, - b->Mul(rhs, b->Sub(XlaHelpers::One(b, input_type(0)), - b->Mul(lhs, lhs)))); - -#undef XLA_MAKE_BINARY_MAP - } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/cast_op.cc b/tensorflow/compiler/tf2xla/kernels/cast_op.cc index b0188b4f8d8..124e33d7935 100644 --- a/tensorflow/compiler/tf2xla/kernels/cast_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/cast_op.cc @@ -14,9 +14,9 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/tf2xla/type_util.h" -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/core/framework/kernel_def_builder.h" namespace tensorflow { @@ -65,7 +65,7 @@ class CastOp : public XlaOpKernel { TF_DISALLOW_COPY_AND_ASSIGN(CastOp); }; -REGISTER_XLA_OP("Cast", CastOp); +REGISTER_XLA_OP(Name("Cast"), CastOp); } // anonymous namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/concat_op.cc b/tensorflow/compiler/tf2xla/kernels/concat_op.cc index d086e55cb79..e2eacb3839d 100644 --- a/tensorflow/compiler/tf2xla/kernels/concat_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/concat_op.cc @@ -19,9 +19,9 @@ limitations under the License. 
#include #include "tensorflow/compiler/tf2xla/type_util.h" -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" @@ -117,8 +117,8 @@ class ConcatV2Op : public ConcatBaseOp { : ConcatBaseOp(c, /* axis_index */ c->num_inputs() - 1) {} }; -REGISTER_XLA_OP("Concat", ConcatOp); -REGISTER_XLA_OP("ConcatV2", ConcatV2Op); +REGISTER_XLA_OP(Name("Concat"), ConcatOp); +REGISTER_XLA_OP(Name("ConcatV2").TypeConstraint("Tidx", DT_INT32), ConcatV2Op); class ConcatOffsetOp : public XlaOpKernel { public: @@ -204,7 +204,7 @@ class ConcatOffsetOp : public XlaOpKernel { } }; -REGISTER_XLA_OP("ConcatOffset", ConcatOffsetOp); +REGISTER_XLA_OP(Name("ConcatOffset"), ConcatOffsetOp); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/const_op.cc b/tensorflow/compiler/tf2xla/kernels/const_op.cc new file mode 100644 index 00000000000..ad676e7a2bb --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/const_op.cc @@ -0,0 +1,111 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/type_util.h" +#include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/core/framework/kernel_def_builder.h" + +namespace tensorflow { +namespace { + +class ConstOp : public XlaOpKernel { + public: + explicit ConstOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { + const TensorProto* proto = nullptr; + OP_REQUIRES_OK(ctx, ctx->GetAttr("value", &proto)); + proto_ = *proto; + OP_REQUIRES( + ctx, ctx->output_type(0) == proto_.dtype(), + errors::InvalidArgument("Type mismatch between value (", + DataTypeString(proto_.dtype()), ") and dtype (", + DataTypeString(ctx->output_type(0)), ")")); + OP_REQUIRES_OK(ctx, TensorShape::IsValidShape(proto_.tensor_shape())); + } + + void Compile(XlaOpKernelContext* ctx) override { + TensorShape shape(proto_.tensor_shape()); + + xla::ComputationBuilder* b = ctx->builder(); + + // To avoid blowups for large constants filled with the same value, + // recognize that case and emit a scalar broadcast instead. 
+    if (shape.num_elements() > 1) {
+      switch (proto_.dtype()) {
+        case DT_BOOL:
+          if (proto_.bool_val_size() == 1) {
+            ctx->SetOutput(0,
+                           b->Broadcast(b->ConstantR0<bool>(proto_.bool_val(0)),
+                                        shape.dim_sizes()));
+            return;
+          }
+          break;
+        case DT_FLOAT:
+          if (proto_.float_val_size() == 1) {
+            ctx->SetOutput(
+                0, b->Broadcast(b->ConstantR0<float>(proto_.float_val(0)),
+                                shape.dim_sizes()));
+            return;
+          }
+          break;
+        case DT_DOUBLE:
+          if (proto_.double_val_size() == 1) {
+            ctx->SetOutput(
+                0, b->Broadcast(b->ConstantR0<double>(proto_.double_val(0)),
+                                shape.dim_sizes()));
+            return;
+          }
+          break;
+        case DT_INT32:
+          if (proto_.int_val_size() == 1) {
+            ctx->SetOutput(0,
+                           b->Broadcast(b->ConstantR0<int32>(proto_.int_val(0)),
+                                        shape.dim_sizes()));
+            return;
+          }
+          break;
+        case DT_INT64:
+          if (proto_.int64_val_size() == 1) {
+            ctx->SetOutput(
+                0, b->Broadcast(b->ConstantR0<int64>(proto_.int64_val(0)),
+                                shape.dim_sizes()));
+            return;
+          }
+          break;
+        default:
+          break;
+      }
+    }
+
+    // General case
+    Tensor tensor(proto_.dtype());
+    OP_REQUIRES(ctx, tensor.FromProto(cpu_allocator(), proto_),
+                errors::InvalidArgument("Cannot parse tensor from proto: ",
+                                        proto_.DebugString()));
+    ctx->SetConstantOutput(0, tensor);
+  }
+
+ private:
+  TensorProto proto_;
+  TF_DISALLOW_COPY_AND_ASSIGN(ConstOp);
+};
+
+// XLA_* devices also register a "real" Const operator so we suppress the
+// dummy operator using CompilationOnly().
+REGISTER_XLA_OP(Name("Const").CompilationOnly(), ConstOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
index 9bebfcfe47d..67a0b803c5b 100644
--- a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
@@ -15,9 +15,9 @@ limitations under the License.
 // XLA-specific Ops for 2D convolution.
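The splat shortcut in `ConstOp::Compile` above works because `TensorProto` stores a tensor filled with a single repeated value as just one element of its value field, so the kernel can recognize that case cheaply and emit a scalar constant plus a `Broadcast` instead of a huge literal. A quick illustration (a sketch assuming the internal helper `tensor_util.make_tensor_proto`):

```python
import numpy as np
from tensorflow.python.framework import tensor_util

# A scalar broadcast to a large shape is stored as one repeated value.
proto = tensor_util.make_tensor_proto(0.5, shape=[1000, 1000])
print(len(proto.float_val))  # 1 -- a "splat", not 10**6 floats

# ConstOp then emits ConstantR0(0.5) + Broadcast; the NumPy analogue:
splat = np.broadcast_to(np.float32(0.5), (1000, 1000))
```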
-#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" @@ -35,96 +35,67 @@ namespace tensorflow { namespace { -class Conv2DOp : public XlaOpKernel { +class ConvOp : public XlaOpKernel { public: - explicit Conv2DOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { + explicit ConvOp(OpKernelConstruction* ctx, int num_spatial_dims) + : XlaOpKernel(ctx), num_spatial_dims_(num_spatial_dims) { OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &strides_)); - string data_format; - OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format)); - OP_REQUIRES(ctx, FormatFromString(data_format, &data_format_), - errors::InvalidArgument("Invalid data format")); - OP_REQUIRES(ctx, strides_.size() == 4, - errors::InvalidArgument("Sliding window strides field must " - "specify 4 dimensions")); - const int64 stride_n = GetTensorDim(strides_, data_format_, 'N'); - const int64 stride_c = GetTensorDim(strides_, data_format_, 'C'); - OP_REQUIRES( - ctx, stride_n == 1 && stride_c == 1, - errors::InvalidArgument("Current implementation does not yet support " - "strides in the batch and depth dimensions.")); OP_REQUIRES_OK(ctx, ctx->GetAttr("padding", &padding_)); } + int num_dims() const { return num_spatial_dims_ + 2; } + void Compile(XlaOpKernelContext* ctx) override { + OP_REQUIRES(ctx, strides_.size() == num_dims(), + errors::InvalidArgument("Sliding window strides field must " + "specify ", + num_dims(), " dimensions")); + int batch_dim = GetTensorBatchDimIndex(num_dims(), data_format_); + int feature_dim = GetTensorFeatureDimIndex(num_dims(), data_format_); + OP_REQUIRES( + ctx, strides_[batch_dim] == 1 && strides_[feature_dim] == 1, + errors::Unimplemented("Current implementation does not yet support " + "strides in the batch and depth dimensions.")); + const TensorShape input_shape = ctx->InputShape(0); // Input filter is of the following dimensions: - // [ filter_rows, filter_cols, in_depth, out_depth] + // [ filter_rows, filter_cols, ..., in_depth, out_depth] const TensorShape filter_shape = ctx->InputShape(1); // For 2D convolution, there should be 4 dimensions. - OP_REQUIRES(ctx, input_shape.dims() == 4, - errors::InvalidArgument("input must be 4-dimensional", - input_shape.DebugString())); - OP_REQUIRES(ctx, filter_shape.dims() == 4, - errors::InvalidArgument("filter must be 4-dimensional: ", - filter_shape.DebugString())); + OP_REQUIRES( + ctx, input_shape.dims() == num_dims(), + errors::InvalidArgument("input must be ", num_dims(), "-dimensional", + input_shape.DebugString())); + OP_REQUIRES( + ctx, filter_shape.dims() == num_dims(), + errors::InvalidArgument("filter must be ", num_dims(), + "-dimensional: ", filter_shape.DebugString())); + + // The last two dimension of the filter are the input and output shapes. + const int64 in_depth = filter_shape.dim_size(num_spatial_dims_); // The 'C' dimension for input is in_depth. It must be the same as // the filter's in_depth. - const int64 in_depth = GetTensorDim(input_shape, data_format_, 'C'); - OP_REQUIRES( - ctx, in_depth == filter_shape.dim_size(2), - errors::InvalidArgument("input and filter must have the same depth: ", - in_depth, " vs ", filter_shape.dim_size(2))); - - // The last dimension for filter is out_depth. 
- const int64 out_depth = filter_shape.dim_size(3); - - // The 'H' dimension for input is rows/height. - // The first dimension for filter is rows/height. - const int64 input_rows = GetTensorDim(input_shape, data_format_, 'H'); - const int64 filter_rows = filter_shape.dim_size(0); - - // The 'W' dimension for input is columns/width. - // The second dimension for filter is columns/width. - const int64 input_cols = GetTensorDim(input_shape, data_format_, 'W'); - const int64 filter_cols = filter_shape.dim_size(1); - - // For now we take the stride from the H and W dimensions only (we - // do not support striding on the batch or depth dimension). - const int stride_rows = GetTensorDim(strides_, data_format_, 'H'); - const int stride_cols = GetTensorDim(strides_, data_format_, 'W'); - - int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0; - OP_REQUIRES_OK(ctx, - GetWindowedOutputSize(input_rows, filter_rows, stride_rows, - padding_, &out_rows, &pad_rows)); - OP_REQUIRES_OK(ctx, - GetWindowedOutputSize(input_cols, filter_cols, stride_cols, - padding_, &out_cols, &pad_cols)); - - VLOG(2) << "Conv2D: in_depth = " << in_depth - << ", input_cols = " << input_cols - << ", filter_cols = " << filter_cols - << ", input_rows = " << input_rows - << ", filter_rows = " << filter_rows - << ", stride_rows = " << stride_rows - << ", stride_cols = " << stride_cols - << ", out_depth = " << out_depth; + OP_REQUIRES(ctx, in_depth == input_shape.dim_size(feature_dim), + errors::InvalidArgument( + "input and filter must have the same depth: ", in_depth, + " vs ", input_shape.dim_size(feature_dim))); xla::ConvolutionDimensionNumbers dims; - dims.set_batch_dimension(GetTensorDimIndex<2>(data_format_, 'N')); - dims.set_feature_dimension(GetTensorDimIndex<2>(data_format_, 'C')); - dims.add_spatial_dimensions(GetTensorDimIndex<2>(data_format_, 'H')); - dims.add_spatial_dimensions(GetTensorDimIndex<2>(data_format_, 'W')); + std::vector window_strides; - // TF filter shape is [ H, W, inC, outC ] - dims.add_kernel_spatial_dimensions(0); - dims.add_kernel_spatial_dimensions(1); - dims.set_kernel_input_feature_dimension(2); - dims.set_kernel_output_feature_dimension(3); + dims.set_batch_dimension(GetTensorBatchDimIndex(num_dims(), data_format_)); + dims.set_feature_dimension(feature_dim); + for (int i = 0; i < num_spatial_dims_; ++i) { + int input_dim = GetTensorSpatialDimIndex(num_dims(), data_format_, i); + dims.add_spatial_dimensions(input_dim); + dims.add_kernel_spatial_dimensions(i); + window_strides.push_back(strides_.at(input_dim)); + } + dims.set_kernel_input_feature_dimension(num_spatial_dims_); + dims.set_kernel_output_feature_dimension(num_spatial_dims_ + 1); - std::vector window_strides = {stride_rows, stride_cols}; xla::Padding xla_padding = (padding_ == VALID) ? xla::Padding::kValid : xla::Padding::kSame; @@ -133,38 +104,58 @@ class Conv2DOp : public XlaOpKernel { ctx->SetOutput(0, conv); } - private: + protected: + const int num_spatial_dims_; std::vector strides_; Padding padding_; - TensorFormat data_format_; + TensorFormat data_format_ = FORMAT_NHWC; - TF_DISALLOW_COPY_AND_ASSIGN(Conv2DOp); + private: + TF_DISALLOW_COPY_AND_ASSIGN(ConvOp); }; -REGISTER_XLA_OP("Conv2D", Conv2DOp); - -// Backprop for input. 
-class Conv2DBackpropInputOp : public XlaOpKernel { +class Conv2DOp : public ConvOp { public: - explicit Conv2DBackpropInputOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { + explicit Conv2DOp(OpKernelConstruction* ctx) + : ConvOp(ctx, /*num_spatial_dims=*/2) { string data_format; OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format)); OP_REQUIRES(ctx, FormatFromString(data_format, &data_format_), errors::InvalidArgument("Invalid data format")); + } +}; +REGISTER_XLA_OP(Name("Conv2D"), Conv2DOp); + +class Conv3DOp : public ConvOp { + public: + explicit Conv3DOp(OpKernelConstruction* ctx) + : ConvOp(ctx, /*num_spatial_dims=*/3) {} +}; +REGISTER_XLA_OP(Name("Conv3D"), Conv3DOp); + +// Backprop for input. +class ConvBackpropInputOp : public XlaOpKernel { + public: + explicit ConvBackpropInputOp(OpKernelConstruction* ctx, int num_spatial_dims) + : XlaOpKernel(ctx), num_spatial_dims_(num_spatial_dims) { OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &strides_)); - OP_REQUIRES(ctx, strides_.size() == 4, - errors::InvalidArgument("Sliding window strides field must " - "specify 4 dimensions")); - int stride_n = GetTensorDim(strides_, data_format_, 'N'); - int stride_c = GetTensorDim(strides_, data_format_, 'C'); - OP_REQUIRES( - ctx, (stride_n == 1 && stride_c == 1), - errors::InvalidArgument("Current implementation does not yet support " - "strides in the batch and depth dimensions.")); OP_REQUIRES_OK(ctx, ctx->GetAttr("padding", &padding_)); } + int num_dims() const { return num_spatial_dims_ + 2; } + void Compile(XlaOpKernelContext* ctx) override { + OP_REQUIRES(ctx, strides_.size() == num_dims(), + errors::InvalidArgument("Sliding window strides field must " + "specify ", + num_dims(), " dimensions")); + int batch_dim = GetTensorBatchDimIndex(num_dims(), data_format_); + int feature_dim = GetTensorFeatureDimIndex(num_dims(), data_format_); + OP_REQUIRES( + ctx, strides_[batch_dim] == 1 && strides_[feature_dim] == 1, + errors::Unimplemented("Current implementation does not yet support " + "strides in the batch and depth dimensions.")); + TensorShape input_shape; OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(0, &input_shape)); @@ -172,10 +163,10 @@ class Conv2DBackpropInputOp : public XlaOpKernel { const TensorShape out_backprop_shape = ctx->InputShape(2); // Reuse dimension computation logic from conv_grad_ops.cc. - Conv2DBackpropDimensions dims; + ConvBackpropDimensions dims; OP_REQUIRES_OK( - ctx, Conv2DBackpropComputeDimensions( - "Conv2DBackpropInput", input_shape, filter_shape, + ctx, ConvBackpropComputeDimensions( + type_string(), num_spatial_dims_, input_shape, filter_shape, out_backprop_shape, strides_, padding_, data_format_, &dims)); auto filter = ctx->Input(1); @@ -186,73 +177,101 @@ class Conv2DBackpropInputOp : public XlaOpKernel { // comment at the top of conv_grad_ops.h for details. xla::ConvolutionDimensionNumbers dnums; - dnums.set_batch_dimension(GetTensorDimIndex(data_format_, 'N')); - dnums.add_spatial_dimensions(GetTensorDimIndex(data_format_, 'H')); - dnums.add_spatial_dimensions(GetTensorDimIndex(data_format_, 'W')); - dnums.set_feature_dimension(GetTensorDimIndex(data_format_, 'C')); + dnums.set_batch_dimension(batch_dim); + dnums.set_feature_dimension(feature_dim); - // TF filter shape is [ H, W, inC, outC ] + // TF filter shape is [ H, W, ..., inC, outC ] // Transpose the input and output features for computing the gradient. 
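The `Rev` + `ConvGeneralDilated` combination just below implements the standard identity that the gradient with respect to a convolution's input is the output gradient, padded (and dilated via `lhs_dilation` when strides are involved), correlated with the spatially mirrored filter. A 1-D, stride-1 NumPy sketch of that identity (illustrative only):

```python
import numpy as np

x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])  # input
w = np.array([0.5, -1.0, 2.0])           # filter
m = len(w)
dy = np.ones(len(x) - m + 1)             # upstream grad of y = correlate(x, w)

# Direct gradient: dL/dx accumulates dy[j] * w over each window position j.
dx = np.zeros_like(x)
for j in range(len(dy)):
    dx[j:j + m] += dy[j] * w

# Same result: pad dy by (m - 1) and correlate with the mirrored filter.
dx2 = np.correlate(np.pad(dy, m - 1, mode="constant"), w[::-1], mode="valid")
assert np.allclose(dx, dx2)
```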
-    dnums.add_kernel_spatial_dimensions(0);
-    dnums.add_kernel_spatial_dimensions(1);
-    dnums.set_kernel_input_feature_dimension(3);
-    dnums.set_kernel_output_feature_dimension(2);
+    dnums.set_kernel_input_feature_dimension(num_spatial_dims_ + 1);
+    dnums.set_kernel_output_feature_dimension(num_spatial_dims_);
+
+    std::vector<int64> kernel_spatial_dims(num_spatial_dims_);
+    std::vector<std::pair<int64, int64>> padding(num_spatial_dims_);
+    std::vector<int64> lhs_dilation(num_spatial_dims_);
+    std::vector<int64> ones(num_spatial_dims_, 1);
+    for (int i = 0; i < num_spatial_dims_; ++i) {
+      dnums.add_spatial_dimensions(
+          GetTensorSpatialDimIndex(num_dims(), data_format_, i));
+      dnums.add_kernel_spatial_dimensions(i);
+
+      kernel_spatial_dims[i] = i;
+      padding[i] = {dims.spatial_dims[i].pad_before,
+                    dims.spatial_dims[i].pad_after};
+      lhs_dilation[i] = dims.spatial_dims[i].stride;
+    }
 
     // Mirror the filter in the spatial dimensions.
     xla::ComputationDataHandle mirrored_weights =
-        ctx->builder()->Rev(filter, {dnums.kernel_spatial_dimensions(0),
-                                     dnums.kernel_spatial_dimensions(1)});
+        ctx->builder()->Rev(filter, kernel_spatial_dims);
 
     // activation gradients
     //   = gradients (with padding and dilation) <conv> mirrored_weights
     xla::ComputationDataHandle in_backprop =
         ctx->builder()->ConvGeneralDilated(
-            out_backprop, mirrored_weights, /*window_strides=*/{1, 1},
-            /*padding=*/{{dims.rows.pad_before, dims.rows.pad_after},
-                         {dims.cols.pad_before, dims.cols.pad_after}},
-            /*lhs_dilation=*/{dims.rows.stride, dims.cols.stride},
-            /*rhs_dilation=*/{1, 1}, dnums);
+            out_backprop, mirrored_weights, /*window_strides=*/ones, padding,
+            lhs_dilation, /*rhs_dilation=*/ones, dnums);
 
     ctx->SetOutput(0, in_backprop);
   }
 
- private:
+ protected:
+  const int num_spatial_dims_;
   std::vector<int32> strides_;
   Padding padding_;
-  TensorFormat data_format_;
+  TensorFormat data_format_ = FORMAT_NHWC;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(Conv2DBackpropInputOp);
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(ConvBackpropInputOp);
 };
 
-class Conv2DBackpropFilterOp : public XlaOpKernel {
+class Conv2DBackpropInputOp : public ConvBackpropInputOp {
  public:
-  explicit Conv2DBackpropFilterOp(OpKernelConstruction* ctx)
-      : XlaOpKernel(ctx) {
+  explicit Conv2DBackpropInputOp(OpKernelConstruction* ctx)
+      : ConvBackpropInputOp(ctx, /*num_spatial_dims=*/2) {
     string data_format;
     OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format));
     OP_REQUIRES(ctx, FormatFromString(data_format, &data_format_),
                 errors::InvalidArgument("Invalid data format"));
+  }
+};
+REGISTER_XLA_OP(Name("Conv2DBackpropInput"), Conv2DBackpropInputOp);
+
+class Conv3DBackpropInputOp : public ConvBackpropInputOp {
+ public:
+  explicit Conv3DBackpropInputOp(OpKernelConstruction* ctx)
+      : ConvBackpropInputOp(ctx, /*num_spatial_dims=*/3) {}
+};
+REGISTER_XLA_OP(Name("Conv3DBackpropInputV2"), Conv3DBackpropInputOp);
+
+class ConvBackpropFilterOp : public XlaOpKernel {
+ public:
+  explicit ConvBackpropFilterOp(OpKernelConstruction* ctx, int num_spatial_dims)
+      : XlaOpKernel(ctx), num_spatial_dims_(num_spatial_dims) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &strides_));
-    int stride_n = GetTensorDim(strides_, data_format_, 'N');
-    int stride_c = GetTensorDim(strides_, data_format_, 'C');
-    OP_REQUIRES(
-        ctx, (stride_n == 1 && stride_c == 1),
-        errors::InvalidArgument("Current implementation does not yet support "
-                                "strides in the batch and depth dimensions."));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("padding", &padding_));
   }
 
+  int num_dims() const { return num_spatial_dims_ + 2; }
+
   void Compile(XlaOpKernelContext* ctx) override {
+    const int n_dim = GetTensorBatchDimIndex(num_dims(), data_format_);
+    const int c_dim = GetTensorFeatureDimIndex(num_dims(), data_format_);
+
+    OP_REQUIRES(
+        ctx, (strides_[n_dim] == 1 && strides_[c_dim] == 1),
+        errors::InvalidArgument("Current implementation does not yet support "
+                                "strides in the batch and depth dimensions."));
+
     const TensorShape activations_shape = ctx->InputShape(0);
     TensorShape filter_shape;
     OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(1, &filter_shape));
     const TensorShape out_backprop_shape = ctx->InputShape(2);
 
     // Reuse dimension computation logic from conv_grad_ops.cc.
-    Conv2DBackpropDimensions dims;
-    OP_REQUIRES_OK(
-        ctx, Conv2DBackpropComputeDimensions(
-                 "Conv2DBackpropFilter", activations_shape, filter_shape,
-                 out_backprop_shape, strides_, padding_, data_format_, &dims));
+    ConvBackpropDimensions dims;
+    OP_REQUIRES_OK(ctx, ConvBackpropComputeDimensions(
+                            type_string(), num_spatial_dims_, activations_shape,
+                            filter_shape, out_backprop_shape, strides_,
+                            padding_, data_format_, &dims));
 
     xla::ComputationDataHandle activations = ctx->Input(0);
     xla::ComputationDataHandle gradients = ctx->Input(2);
 
@@ -264,72 +283,71 @@ class Conv2DBackpropFilterOp : public XlaOpKernel {
     xla::ConvolutionDimensionNumbers dnums;
 
     // The activations (inputs) form the LHS of the convolution.
-    // Activations have shape: [batch, in_rows, in_cols, in_depth]
+    // Activations have shape: [batch, in_rows, in_cols, ..., in_depth]
     // For the gradient computation, we flip the roles of the batch and
     // feature dimensions.
     // Each spatial entry has size in_depth * batch
-    const int n_dim = GetTensorDimIndex(data_format_, 'N');
-    const int h_dim = GetTensorDimIndex(data_format_, 'H');
-    const int w_dim = GetTensorDimIndex(data_format_, 'W');
-    const int c_dim = GetTensorDimIndex(data_format_, 'C');
 
     // Swap n_dim and c_dim in the activations.
     dnums.set_batch_dimension(c_dim);
-    dnums.add_spatial_dimensions(h_dim);
-    dnums.add_spatial_dimensions(w_dim);
     dnums.set_feature_dimension(n_dim);
 
     // The gradients become the RHS of the convolution.
-    // The gradients have shape [batch, out_rows, out_cols, out_depth] where
-    // the batch becomes the input feature for the convolution.
-    dnums.add_kernel_spatial_dimensions(h_dim);
-    dnums.add_kernel_spatial_dimensions(w_dim);
+    // The gradients have shape [batch, out_rows, out_cols, ..., out_depth]
+    // where the batch becomes the input feature for the convolution.
     dnums.set_kernel_input_feature_dimension(n_dim);
     dnums.set_kernel_output_feature_dimension(c_dim);
 
-    // We will also need to pad the input with zeros such that after the
-    // convolution, we get the right size for the filter.
-    // The padded_in_rows should be such that when we convolve this with the
-    // expanded_out_rows as a filter, we should get filter_rows back.
-    //
-    const int padded_in_rows =
-        dims.rows.expanded_output_size + dims.rows.filter_size - 1;
-    const int padded_in_cols =
-        dims.cols.expanded_output_size + dims.cols.filter_size - 1;
+    std::vector<std::pair<int64, int64>> padding(num_spatial_dims_);
+    std::vector<int64> rhs_dilation(num_spatial_dims_);
+    std::vector<int64> ones(num_spatial_dims_, 1);
 
-    // However it can be smaller than input_rows: in this
-    // case it means some of the inputs are not used.
-    //
-    // An example is to have input_cols = 3, filter_cols = 2 and stride = 2:
-    //
-    // INPUT =  [ A  B  C ]
-    //
-    // FILTER = [ x y ]
-    //
-    // and the output will only have one column: a = A * x + B * y
-    //
-    // and input "C" is not used at all.
-    //
-    // We apply negative padding in this case.
- const int total_pad_in_rows = padded_in_rows - dims.rows.input_size; - const int total_pad_in_cols = padded_in_cols - dims.cols.input_size; + for (int i = 0; i < num_spatial_dims_; ++i) { + int dim = GetTensorSpatialDimIndex(num_dims(), data_format_, i); + dnums.add_spatial_dimensions(dim); + dnums.add_kernel_spatial_dimensions(dim); - // + For the VALID padding, we don't pad anything on the top/left side - // and pad the bottom/right side with the remaining space. - // + For the SAME padding, we pad top/left side the same as bottom/right - // side. - // - // In addition, if the padded input size is smaller than the input size, - // we need to ignore some training elements of the input. We do this by - // applying negative padding on the right/bottom. - const int top_pad_in_rows = - (total_pad_in_rows > 0 && padding_ == Padding::SAME) - ? total_pad_in_rows / 2 - : 0; - const int left_pad_in_cols = - (total_pad_in_cols > 0 && padding_ == Padding::SAME) - ? total_pad_in_cols / 2 - : 0; + // We will also need to pad the input with zeros such that after the + // convolution, we get the right size for the filter. + // The padded_in_rows should be such that when we convolve this with the + // expanded_out_rows as a filter, we should get filter_rows back. + // + const int padded_in_size = dims.spatial_dims[i].expanded_output_size + + dims.spatial_dims[i].filter_size - 1; + + // However it can be smaller than input_rows: in this + // case it means some of the inputs are not used. + // + // An example is to have input_cols = 3, filter_cols = 2 and stride = 2: + // + // INPUT = [ A B C ] + // + // FILTER = [ x y ] + // + // and the output will only have one column: a = A * x + B * y + // + // and input "C" is not used at all. + // + // We apply negative padding in this case. + const int total_pad_in_size = + padded_in_size - dims.spatial_dims[i].input_size; + + // + For the VALID padding, we don't pad anything on the top/left side + // and pad the bottom/right side with the remaining space. + // + For the SAME padding, we pad top/left side the same as bottom/right + // side. + // + // In addition, if the padded input size is smaller than the input size, + // we need to ignore some training elements of the input. We do this by + // applying negative padding on the right/bottom. + const int before_pad_in_size = + (total_pad_in_size > 0 && padding_ == Padding::SAME) + ? total_pad_in_size / 2 + : 0; + + padding[i] = {before_pad_in_size, total_pad_in_size - before_pad_in_size}; + rhs_dilation[i] = dims.spatial_dims[i].stride; + } // Besides padding the input, we will also expand output_rows to // expanded_out_rows = (output_rows - 1) * stride + 1 @@ -341,33 +359,54 @@ class Conv2DBackpropFilterOp : public XlaOpKernel { // convolution HLO below. 
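Plugging the `INPUT = [ A B C ]`, `FILTER = [ x y ]`, stride-2 example from the comment above into this arithmetic: `expanded_output_size = (1 - 1) * 2 + 1 = 1`, so `padded_in_size = 1 + 2 - 1 = 2` and `total_pad_in_size = 2 - 3 = -1`, i.e. one element of negative padding on the right, which drops the unused `C`. A NumPy check (illustrative only):

```python
import numpy as np

A, B, C = 1.0, 2.0, 3.0
inputs = np.array([A, B, C])
padded_in_size = 1 + 2 - 1         # expanded_output_size + filter_size - 1
total_pad = padded_in_size - 3     # -1: negative padding on the right
trimmed = inputs[:padded_in_size]  # [A, B]; C is dropped
dy = 1.0                           # upstream grad of a = A*x + B*y
dfilter = trimmed * dy             # [da/dx, da/dy] == [A, B]
print(dfilter)                     # [1. 2.]
```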
auto filter_backprop = ctx->builder()->ConvGeneralDilated( activations, gradients, - /*window_strides=*/{1, 1}, - /*padding=*/{{top_pad_in_rows, total_pad_in_rows - top_pad_in_rows}, - {left_pad_in_cols, total_pad_in_cols - left_pad_in_cols}}, - /*lhs_dilation=*/{1, 1}, - /*rhs_dilation=*/{dims.rows.stride, dims.cols.stride}, dnums); + /*window_strides=*/ones, padding, /*lhs_dilation=*/ones, rhs_dilation, + dnums); // The layout of filter_backprop will match the layout of // padded_activations - // and so will have layout: [out_feature, h, w, in_feature] - // Tensorflow filter shape is [ H, W, inC, outC ], so we transpose the + // and so will have layout: [out_feature, h, w, ..., in_feature] + // Tensorflow filter shape is [ H, W, ..., inC, outC ], so we transpose the // output. + std::vector<int64> transpose_dims; + transpose_dims.reserve(num_dims()); + for (int i = 0; i < num_spatial_dims_; ++i) { + transpose_dims.push_back(dnums.spatial_dimensions(i)); + } + transpose_dims.push_back(c_dim); + transpose_dims.push_back(n_dim); xla::ComputationDataHandle filter_backprop_reshaped = - ctx->builder()->Transpose(filter_backprop, - {h_dim, w_dim, c_dim, n_dim}); + ctx->builder()->Transpose(filter_backprop, transpose_dims); ctx->SetOutput(0, filter_backprop_reshaped); } - private: + protected: + int num_spatial_dims_; std::vector<int32> strides_; Padding padding_; - TensorFormat data_format_; + TensorFormat data_format_ = FORMAT_NHWC; - TF_DISALLOW_COPY_AND_ASSIGN(Conv2DBackpropFilterOp); + private: + TF_DISALLOW_COPY_AND_ASSIGN(ConvBackpropFilterOp); }; -REGISTER_XLA_OP("Conv2DBackpropInput", Conv2DBackpropInputOp); -REGISTER_XLA_OP("Conv2DBackpropFilter", Conv2DBackpropFilterOp); +class Conv2DBackpropFilterOp : public ConvBackpropFilterOp { + public: + explicit Conv2DBackpropFilterOp(OpKernelConstruction* ctx) + : ConvBackpropFilterOp(ctx, /*num_spatial_dims=*/2) { + string data_format; + OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format)); + OP_REQUIRES(ctx, FormatFromString(data_format, &data_format_), + errors::InvalidArgument("Invalid data format")); + } +}; +REGISTER_XLA_OP(Name("Conv2DBackpropFilter"), Conv2DBackpropFilterOp); + +class Conv3DBackpropFilterOp : public ConvBackpropFilterOp { + public: + explicit Conv3DBackpropFilterOp(OpKernelConstruction* ctx) + : ConvBackpropFilterOp(ctx, /*num_spatial_dims=*/3) {} +}; +REGISTER_XLA_OP(Name("Conv3DBackpropFilterV2"), Conv3DBackpropFilterOp); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/cwise_ops.cc b/tensorflow/compiler/tf2xla/kernels/cwise_ops.cc index 3cd0b39c871..de93a88f064 100644 --- a/tensorflow/compiler/tf2xla/kernels/cwise_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/cwise_ops.cc @@ -18,9 +18,9 @@ limitations under the License.
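Returning to the filter-backprop transpose just above: for 2-D NHWC the generalized permutation reduces to the previously hard-coded one (indices assumed from the NHWC layout):

```cpp
// Assumed 2-D NHWC values: spatial dims {1, 2}, c_dim = 3, n_dim = 0, so
//   transpose_dims = {1, 2, 3, 0};
// Dimension i of the result is dimension transpose_dims[i] of
// filter_backprop, restoring the TF filter layout [H, W, inC, outC].
```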
#include "tensorflow/compiler/tf2xla/kernels/cwise_ops.h" #include "tensorflow/compiler/tf2xla/type_util.h" -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/core/framework/op_kernel.h" diff --git a/tensorflow/compiler/tf2xla/kernels/cwise_ops.h b/tensorflow/compiler/tf2xla/kernels/cwise_ops.h index f0687c1d4b5..ba38693325c 100644 --- a/tensorflow/compiler/tf2xla/kernels/cwise_ops.h +++ b/tensorflow/compiler/tf2xla/kernels/cwise_ops.h @@ -32,9 +32,7 @@ namespace tensorflow { // description of the operation; and Computation adds the // implementation of the operation to a xla::ComputationBuilder. For most // arithmetic Ops XLA handles the broadcasting automatically given the input -// tensors. Ops like ReluGrad that need to map a scalar function over the inputs -// can use the XlaBinaryMapOp subclass below which handles manual -// broadcasting of the inputs. +// tensors. class XlaBinaryOp : public XlaOpKernel { public: explicit XlaBinaryOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { @@ -83,6 +81,8 @@ class XlaBinaryOp : public XlaOpKernel { // virtual methods to override: description is a textual description // of the mapped function; and BuildMapLambda adds the // implementation of the lambda to a xla::ComputationBuilder. +// Operations may have better performance if implemented as graphs of +// element-wise tensor operations. class XlaBinaryMapOp : public XlaBinaryOp { public: explicit XlaBinaryMapOp(OpKernelConstruction* ctx) : XlaBinaryOp(ctx) {} diff --git a/tensorflow/compiler/tf2xla/kernels/declaration_op.cc b/tensorflow/compiler/tf2xla/kernels/declaration_op.cc deleted file mode 100644 index d96ff341789..00000000000 --- a/tensorflow/compiler/tf2xla/kernels/declaration_op.cc +++ /dev/null @@ -1,127 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/tf2xla/type_util.h" -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" -#include "tensorflow/compiler/tf2xla/xla_compiler.h" -#include "tensorflow/compiler/tf2xla/xla_helpers.h" -#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" -#include "tensorflow/core/framework/kernel_def_builder.h" - -namespace tensorflow { -namespace { - -// This OpKernel implements the Constant Op for XLA JIT -// devices. It extracts the constant Tensor from the Proto at kernel -// construction time, and then every time the Constant Op is executed -// an expression containing the constant is compiled. 
-class ConstantDeclarationOp : public XlaOpKernel { - public: - explicit ConstantDeclarationOp(OpKernelConstruction* ctx) - : XlaOpKernel(ctx), tensor_(ctx->output_type(0)) { - const TensorProto* proto = nullptr; - OP_REQUIRES_OK(ctx, ctx->GetAttr("value", &proto)); - // MakeTensorFromProto uses the cpu_allocator, so tensor_ is a - // "real" tensor backed by CPU memory, holding the value of the - // constant. - OP_REQUIRES_OK(ctx, MakeTensorFromProto(*proto, &tensor_)); - OP_REQUIRES( - ctx, ctx->output_type(0) == tensor_.dtype(), - errors::InvalidArgument( - "Type mismatch between value (", DataTypeString(tensor_.dtype()), - ") and dtype (", DataTypeString(ctx->output_type(0)), ")")); - } - - void Compile(XlaOpKernelContext* ctx) override { - ctx->SetConstantOutput(0, tensor_); - } - - private: - // Extract the value of the constant from the Proto during Op kernel - // construction. The constant must be stored in a Tensor allocated - // using the cpu_allocator so that it is backed by real memory. The - // OpKernelConstruction's default allocator is the JITAllocator - // which only allocates enough space for metadata for each Tensor. - static Status MakeTensorFromProto(const TensorProto& tensor_proto, - Tensor* tensor) { - Tensor parsed(tensor_proto.dtype()); - if (!parsed.FromProto(cpu_allocator(), tensor_proto)) { - return errors::InvalidArgument("Cannot parse tensor from proto: ", - tensor_proto.DebugString()); - } - *tensor = parsed; - return Status::OK(); - } - - // This is a "real" tensor backed by CPU memory, containing the - // constant values. - Tensor tensor_; - TF_DISALLOW_COPY_AND_ASSIGN(ConstantDeclarationOp); -}; - -REGISTER_XLA_OP("Const", ConstantDeclarationOp); - -// This OpKernel implements the _Arg Op for XLA JIT devices. It -// associates its output with one of the arguments to a -// subcomputation. -class ArgOp : public XlaOpKernel { - public: - explicit ArgOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { - OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_)); - OP_REQUIRES_OK(ctx, ctx->GetAttr("index", &index_)); - OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(dtype_, &type_)); - } - - void Compile(XlaOpKernelContext* ctx) override { - // If 'frame' is non-null, this is a function call inside an outer JIT - // compilation. Use the usual implementation of _Arg. - auto frame = ctx->call_frame(); - if (frame != nullptr) { - Tensor val; - OP_REQUIRES_OK(ctx, frame->GetArg(index_, &val)); - OP_REQUIRES(ctx, val.dtype() == dtype_, - errors::InvalidArgument( - "Type mismatch: actual ", DataTypeString(val.dtype()), - " vs. expect ", DataTypeString(dtype_))); - // Forwards the argument from the frame. - ctx->op_kernel_context()->set_output(0, val); - return; - } - - XlaContext& tc = XlaContext::Get(ctx); - - OP_REQUIRES(ctx, 0 <= index_ && index_ < tc.args().size(), - errors::InvalidArgument("Invalid argument index ", index_)); - const XlaCompiler::Argument& arg = tc.args()[index_]; - - if (arg.parameter < 0) { - ctx->SetConstantOutput(0, arg.constant_value); - } else { - ctx->SetOutput(0, tc.parameter(arg.parameter)); - } - } - - private: - int index_; - DataType dtype_; - xla::PrimitiveType type_; // Corresponding XLA type. 
- - TF_DISALLOW_COPY_AND_ASSIGN(ArgOp); -}; - -REGISTER_XLA_OP("_Arg", ArgOp); - -} // namespace -} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/depthwise_conv_ops.cc b/tensorflow/compiler/tf2xla/kernels/depthwise_conv_ops.cc index d408ab3338e..852d2a966ed 100644 --- a/tensorflow/compiler/tf2xla/kernels/depthwise_conv_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/depthwise_conv_ops.cc @@ -17,9 +17,9 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/type_util.h" -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" @@ -172,15 +172,14 @@ class DepthwiseConv2dNativeOp : public XlaOpKernel { } else { // These will be used to define the bounds of each slice. // Within the loop, the input_channel index will be modified. - gtl::InlinedVector<int64, 4> filter_begin; - gtl::InlinedVector<int64, 4> filter_limits; - gtl::InlinedVector<int64, 4> input_begin; - gtl::InlinedVector<int64, 4> input_limits; + gtl::InlinedVector<int64, 4> filter_begin(4, 0); + gtl::InlinedVector<int64, 4> filter_limits(4); + gtl::InlinedVector<int64, 4> input_begin(4, 0); + gtl::InlinedVector<int64, 4> input_limits(4); + gtl::InlinedVector<int64, 4> strides(4, 1); for (int i = 0; i < 4; ++i) { - filter_begin.push_back(0); - filter_limits.push_back(filter_shape.dim_size(i)); - input_begin.push_back(0); - input_limits.push_back(input_shape.dim_size(i)); + filter_limits[i] = filter_shape.dim_size(i); + input_limits[i] = input_shape.dim_size(i); } std::vector<int64> strides_for_tla{strides_[1], strides_[2]}; @@ -209,9 +208,9 @@ class DepthwiseConv2dNativeOp : public XlaOpKernel { input_limits[3] = i + 1; xla::ComputationDataHandle filter_slice = - b.Slice(filter, filter_begin, filter_limits); + b.Slice(filter, filter_begin, filter_limits, strides); xla::ComputationDataHandle input_slice = - b.Slice(input, input_begin, input_limits); + b.Slice(input, input_begin, input_limits, strides); convs.push_back(b.ConvWithGeneralDimensions( input_slice, filter_slice, strides_for_tla, xla_padding, dims)); } @@ -229,7 +228,8 @@ class DepthwiseConv2dNativeOp : public XlaOpKernel { TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeOp); }; -REGISTER_XLA_OP("DepthwiseConv2dNative", DepthwiseConv2dNativeOp); +REGISTER_XLA_OP(Name("DepthwiseConv2dNative").TypeConstraint("T", kFloatTypes), + DepthwiseConv2dNativeOp); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/diag_op.cc b/tensorflow/compiler/tf2xla/kernels/diag_op.cc index b89109ff6ab..ec5017f6ab9 100644 --- a/tensorflow/compiler/tf2xla/kernels/diag_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/diag_op.cc @@ -13,9 +13,9 @@ See the License for the specific language governing permissions and limitations under the License.
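A recurring change in the hunks above and below is the migration to the four-argument Slice, which takes explicit strides; unit strides reproduce the old behaviour. For the per-channel depthwise slice above, the calls amount to something like this (shape names assumed for illustration):

```cpp
// Depthwise filter of shape [H, W, C, M], selecting input channel i:
//   filter_begin  = {0, 0, i, 0};
//   filter_limits = {H, W, i + 1, M};
//   strides       = {1, 1, 1, 1};  // unit strides == old 3-arg Slice
//   auto filter_slice = b.Slice(filter, filter_begin, filter_limits, strides);
```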
==============================================================================*/ -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/framework/op_kernel.h" @@ -64,7 +64,7 @@ class DiagOp : public XlaOpKernel { } }; -REGISTER_XLA_OP("Diag", DiagOp); +REGISTER_XLA_OP(Name("Diag"), DiagOp); class DiagPartOp : public XlaOpKernel { public: @@ -125,14 +125,14 @@ class DiagPartOp : public XlaOpKernel { diag = builder->Reshape(diag, {new_size, new_size + 1}); // Slices out the first column and reshapes to the final shape. - diag = builder->Slice(diag, {0, 0}, {new_size, 1}); + diag = builder->Slice(diag, {0, 0}, {new_size, 1}, {1, 1}); diag = builder->Reshape(diag, new_dims); ctx->SetOutput(0, diag); } }; -REGISTER_XLA_OP("DiagPart", DiagPartOp); +REGISTER_XLA_OP(Name("DiagPart"), DiagPartOp); class MatrixDiagOp : public XlaOpKernel { public: @@ -167,7 +167,7 @@ class MatrixDiagOp : public XlaOpKernel { } }; -REGISTER_XLA_OP("MatrixDiag", MatrixDiagOp); +REGISTER_XLA_OP(Name("MatrixDiag"), MatrixDiagOp); class MatrixDiagPartOp : public XlaOpKernel { public: @@ -224,8 +224,9 @@ class MatrixDiagPartOp : public XlaOpKernel { } else if (actual_size > target_size) { std::vector<int64> start(flattened_dims.size(), 0); std::vector<int64> limits(flattened_dims.begin(), flattened_dims.end()); + std::vector<int64> strides(flattened_dims.size(), 1); limits[flattened_dims.size() - 1] = target_size; - diag = builder->Slice(diag, start, limits); + diag = builder->Slice(diag, start, limits, strides); } // Reshape so the target values are in the first position of the last @@ -238,8 +239,9 @@ class MatrixDiagPartOp : public XlaOpKernel { // Slices out the first column and reshapes to the final shape. std::vector<int64> start(dims.size(), 0); std::vector<int64> limits(dims.begin(), dims.end()); + std::vector<int64> strides(dims.size(), 1); limits[last_dim] = 1; - diag = builder->Slice(diag, start, limits); + diag = builder->Slice(diag, start, limits, strides); // Collapses away the last dimension. dims.pop_back(); @@ -249,7 +251,7 @@ class MatrixDiagPartOp : public XlaOpKernel { } }; -REGISTER_XLA_OP("MatrixDiagPart", MatrixDiagPartOp); +REGISTER_XLA_OP(Name("MatrixDiagPart"), MatrixDiagPartOp); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc b/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc index 2936e792619..107c673f4a7 100644 --- a/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc @@ -17,9 +17,9 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/type_util.h" -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" @@ -157,6 +157,8 @@ class DynamicStitchOp : public XlaOpKernel { indices0_shape.dims()); std::vector slice_limit(1 + data0_shape.dims() - indices0_shape.dims()); + std::vector stride(1 + data0_shape.dims() - + indices0_shape.dims(), 1); for (int d = indices0_shape.dims(); d < data0_shape.dims(); d++) { slice_limit[1 + d - indices0_shape.dims()] = data0_shape.dim_size(d); } @@ -169,7 +171,7 @@ class DynamicStitchOp : public XlaOpKernel { // And place it in the concat list in the place indicated by // the index. to_concat[index_num] = - ctx->builder()->Slice(expression, slice_start, slice_limit); + ctx->builder()->Slice(expression, slice_start, slice_limit, stride); } ctx->SetOutput(0, ctx->builder()->ConcatInDim(to_concat, 0)); @@ -194,7 +196,7 @@ class DynamicStitchOp : public XlaOpKernel { } }; -REGISTER_XLA_OP("DynamicStitch", DynamicStitchOp); +REGISTER_XLA_OP(Name("DynamicStitch"), DynamicStitchOp); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/elu_op.cc b/tensorflow/compiler/tf2xla/kernels/elu_op.cc new file mode 100644 index 00000000000..62a5e1bd421 --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/elu_op.cc @@ -0,0 +1,65 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Native XLA implementations of XLA Elu Ops + +#include "tensorflow/compiler/tf2xla/kernels/cwise_ops.h" +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/core/framework/kernel_def_builder.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/no_op.h" + +namespace tensorflow { +namespace { + +class EluOp : public XlaOpKernel { + public: + explicit EluOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} + // Computes the max of the scalar input x and 0. 
+ void Compile(XlaOpKernelContext* ctx) override { + xla::ComputationBuilder* b = ctx->builder(); + const auto zero = XlaHelpers::Zero(b, input_type(0)); + const auto one = XlaHelpers::One(b, input_type(0)); + const auto pred = b->Gt(ctx->Input(0), zero); + const auto expm1 = b->Sub(b->Exp(ctx->Input(0)), one); + ctx->SetOutput(0, b->Select(pred, ctx->Input(0), expm1)); + } +}; + +class EluGradOp : public XlaOpKernel { + public: + explicit EluGradOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} + // Return the lhs (incoming gradient) if the rhs (the Elu output) > 0, + // otherwise return lhs * (1 + rhs). + void Compile(XlaOpKernelContext* ctx) override { + xla::ComputationBuilder* b = ctx->builder(); + const auto zero = XlaHelpers::Zero(b, input_type(0)); + const auto one = XlaHelpers::One(b, input_type(0)); + const auto grad = ctx->Input(0); + const auto activation = ctx->Input(1); + const auto exp_grad = b->Mul(grad, b->Add(activation, one)); + const auto pred = b->Gt(activation, zero); + ctx->SetOutput(0, b->Select(pred, grad, exp_grad)); + } +}; + +REGISTER_XLA_OP(Name("Elu"), EluOp); +REGISTER_XLA_OP(Name("EluGrad"), EluGradOp); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/fill_op.cc b/tensorflow/compiler/tf2xla/kernels/fill_op.cc index 918c80aad8c..1e1d2a1b4b3 100644 --- a/tensorflow/compiler/tf2xla/kernels/fill_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/fill_op.cc @@ -16,9 +16,9 @@ limitations under the License. // XLA-specific Fill Op. #include "tensorflow/compiler/tf2xla/type_util.h" -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/register_types.h" @@ -50,6 +50,7 @@ class FillOp : public XlaOpKernel { // Convert the dims literal into a vector that we can pass to // ComputationBuilder. std::vector<int64> broadcast; + broadcast.reserve(dims_literal.shape().dimensions(0)); for (int i = 0; i < dims_literal.shape().dimensions(0); ++i) { broadcast.push_back(xla::LiteralUtil::Get<int>(dims_literal, {i})); } @@ -68,7 +69,7 @@ class FillOp : public XlaOpKernel { } }; -REGISTER_XLA_OP("Fill", FillOp); +REGISTER_XLA_OP(Name("Fill"), FillOp); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/function_ops.cc b/tensorflow/compiler/tf2xla/kernels/function_ops.cc index 53f2196dc59..8dacb6627bd 100644 --- a/tensorflow/compiler/tf2xla/kernels/function_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/function_ops.cc @@ -14,9 +14,9 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/tf2xla/type_util.h" -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/core/framework/kernel_def_builder.h" namespace tensorflow { @@ -47,8 +47,8 @@ class PassOn : public XlaOpKernel { } }; -REGISTER_XLA_OP("_ListToArray", PassOn); -REGISTER_XLA_OP("_ArrayToList", PassOn); +REGISTER_XLA_OP(Name("_ListToArray"), PassOn); +REGISTER_XLA_OP(Name("_ArrayToList"), PassOn); // TODO(phawkins): this is an almost exact copy of the SymbolicGradientOp // implementation from regular Tensorflow.
Once XLA has been open sourced @@ -68,7 +68,8 @@ class SymbolicGradientOp : public AsyncOpKernel { done); OP_REQUIRES_OK_ASYNC( - ctx, lib->Instantiate(kGradientOp, def().attr(), &handle_), done); + ctx, lib->Instantiate(kGradientOp, AttrSlice(&def().attr()), &handle_), + done); FunctionLibraryRuntime::Options opts; opts.step_id = ctx->step_id(); @@ -104,7 +105,7 @@ class SymbolicGradientOp : public AsyncOpKernel { TF_DISALLOW_COPY_AND_ASSIGN(SymbolicGradientOp); }; -REGISTER_XLA_OP(kGradientOp, SymbolicGradientOp); +REGISTER_XLA_OP(Name(kGradientOp), SymbolicGradientOp); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op.cc b/tensorflow/compiler/tf2xla/kernels/gather_op.cc index b98d3864790..49eadaf9d1f 100644 --- a/tensorflow/compiler/tf2xla/kernels/gather_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/gather_op.cc @@ -14,10 +14,10 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/tf2xla/shape_util.h" -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_context.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/core/framework/kernel_def_builder.h" namespace tensorflow { @@ -93,12 +93,10 @@ class GatherOp : public XlaOpKernel { TF_DISALLOW_COPY_AND_ASSIGN(GatherOp); }; -REGISTER_XLA_OP("Gather", GatherOp); - -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Gather") - .TypeConstraint("Tparams", DT_FLOAT) - .TypeConstraint("Tindices", {DT_INT32, DT_INT64})); +REGISTER_XLA_OP(Name("Gather") + .TypeConstraint("Tparams", DT_FLOAT) + .Device(DEVICE_CPU_XLA_JIT), + GatherOp); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op_kernel_float_int32.cc b/tensorflow/compiler/tf2xla/kernels/gather_op_kernel_float_int32.cc index eff23bd77d2..691a0b972d5 100644 --- a/tensorflow/compiler/tf2xla/kernels/gather_op_kernel_float_int32.cc +++ b/tensorflow/compiler/tf2xla/kernels/gather_op_kernel_float_int32.cc @@ -20,6 +20,7 @@ limitations under the License. #include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/kernels/gather_functor.h" #include "tensorflow/core/platform/dynamic_annotations.h" +#include "tensorflow/core/platform/macros.h" namespace tensorflow { @@ -63,7 +64,6 @@ EIGEN_STRONG_INLINE void gather_float_int32_xla_impl(float* out, void** data) { // Implements gather on CPU. This is called by an XLA custom call, set up by // gather_op.cc. -extern "C" void __attribute__((visibility("default"))) -gather_float_int32_xla_impl(float* out, void** data) { +extern "C" void TF_EXPORT gather_float_int32_xla_impl(float* out, void** data) { tensorflow::gather_float_int32_xla_impl(out, data); } diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op_kernel_float_int64.cc b/tensorflow/compiler/tf2xla/kernels/gather_op_kernel_float_int64.cc index ae31f6f2006..3dff6e2737b 100644 --- a/tensorflow/compiler/tf2xla/kernels/gather_op_kernel_float_int64.cc +++ b/tensorflow/compiler/tf2xla/kernels/gather_op_kernel_float_int64.cc @@ -20,6 +20,7 @@ limitations under the License. 
#include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/kernels/gather_functor.h" #include "tensorflow/core/platform/dynamic_annotations.h" +#include "tensorflow/core/platform/macros.h" namespace tensorflow { @@ -63,7 +64,6 @@ EIGEN_STRONG_INLINE void gather_float_int64_xla_impl(float* out, void** data) { // Implements gather on CPU. This is called by an XLA custom call, set up by // gather_op.cc. -extern "C" void __attribute__((visibility("default"))) -gather_float_int64_xla_impl(float* out, void** data) { +extern "C" void TF_EXPORT gather_float_int64_xla_impl(float* out, void** data) { tensorflow::gather_float_int64_xla_impl(out, data); } diff --git a/tensorflow/compiler/tf2xla/kernels/identity_op.cc b/tensorflow/compiler/tf2xla/kernels/identity_op.cc index 01417a3cdf7..87d3d64a4e9 100644 --- a/tensorflow/compiler/tf2xla/kernels/identity_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/identity_op.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" namespace tensorflow { namespace { @@ -31,9 +31,12 @@ class IdentityOp : public XlaOpKernel { TF_DISALLOW_COPY_AND_ASSIGN(IdentityOp); }; -REGISTER_XLA_OP("Identity", IdentityOp); -REGISTER_XLA_OP("PreventGradient", IdentityOp); -REGISTER_XLA_OP("StopGradient", IdentityOp); +// XLA_* devices also register a "real" Identity operator so we suppress the +// dummy operator using CompilationOnly(). +REGISTER_XLA_OP(Name("Identity").CompilationOnly(), IdentityOp); + +REGISTER_XLA_OP(Name("PreventGradient"), IdentityOp); +REGISTER_XLA_OP(Name("StopGradient"), IdentityOp); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/index_ops.cc b/tensorflow/compiler/tf2xla/kernels/index_ops.cc index 293705e39fc..df002dddd04 100644 --- a/tensorflow/compiler/tf2xla/kernels/index_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/index_ops.cc @@ -16,9 +16,9 @@ limitations under the License. // Native XLA implementations of indexing ops. #include "tensorflow/compiler/tf2xla/type_util.h" -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" @@ -134,9 +134,9 @@ class ArgMaxOp : public XlaOpKernel { TF_DISALLOW_COPY_AND_ASSIGN(ArgMaxOp); }; -REGISTER_XLA_OP("ArgMax", ArgMaxOp); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("ArgMax").TypeConstraint("T", DT_FLOAT)); +REGISTER_XLA_OP( + Name("ArgMax").TypeConstraint("T", DT_FLOAT).Device(DEVICE_CPU_XLA_JIT), + ArgMaxOp); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_1d.cc b/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_1d.cc index 0033a949a37..afbd64ca503 100644 --- a/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_1d.cc +++ b/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_1d.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/platform/dynamic_annotations.h" +#include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" namespace tensorflow { @@ -43,7 +44,6 @@ EIGEN_STRONG_INLINE void argmax_float_1d_xla_impl(void* out, void** data) { // Implements argmax on CPU. This is called by an XLA custom call, set up by // index_ops.cc. -extern "C" void __attribute__((visibility("default"))) -argmax_float_1d_xla_impl(void* out, void** data) { +extern "C" void TF_EXPORT argmax_float_1d_xla_impl(void* out, void** data) { tensorflow::argmax_float_1d_xla_impl(out, data); } diff --git a/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_2d.cc b/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_2d.cc index be8ad2317c9..841ff2f4df7 100644 --- a/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_2d.cc +++ b/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_2d.cc @@ -18,6 +18,7 @@ limitations under the License. #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/platform/dynamic_annotations.h" +#include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" namespace tensorflow { @@ -45,7 +46,6 @@ EIGEN_STRONG_INLINE void argmax_float_2d_xla_impl(void* out, void** data) { // Implements argmax on CPU. This is called by an XLA custom call, set up by // index_ops.cc. -extern "C" void __attribute__((visibility("default"))) -argmax_float_2d_xla_impl(void* out, void** data) { +extern "C" void TF_EXPORT argmax_float_2d_xla_impl(void* out, void** data) { tensorflow::argmax_float_2d_xla_impl(out, data); } diff --git a/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc b/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc index 248984bcfec..d096415087e 100644 --- a/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc @@ -13,9 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/types.h" @@ -47,7 +47,7 @@ class L2LossOp : public XlaOpKernel { } }; -REGISTER_XLA_OP("L2Loss", L2LossOp); +REGISTER_XLA_OP(Name("L2Loss"), L2LossOp); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/lrn_ops.cc b/tensorflow/compiler/tf2xla/kernels/lrn_ops.cc index 93966d3d5a9..759d1a1a2d9 100644 --- a/tensorflow/compiler/tf2xla/kernels/lrn_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/lrn_ops.cc @@ -13,9 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/core/framework/kernel_def_builder.h" namespace tensorflow { @@ -69,7 +69,7 @@ class LRNOp : public XlaOpKernel { float beta_; }; -REGISTER_XLA_OP("LRN", LRNOp); +REGISTER_XLA_OP(Name("LRN"), LRNOp); class LRNGradOp : public XlaOpKernel { public: @@ -167,7 +167,7 @@ class LRNGradOp : public XlaOpKernel { float beta_; }; -REGISTER_XLA_OP("LRNGrad", LRNGradOp); +REGISTER_XLA_OP(Name("LRNGrad"), LRNGradOp); } // anonymous namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/matmul_op.cc b/tensorflow/compiler/tf2xla/kernels/matmul_op.cc index 5af6a79f3e4..5c799a0e4f8 100644 --- a/tensorflow/compiler/tf2xla/kernels/matmul_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/matmul_op.cc @@ -15,9 +15,9 @@ limitations under the License. // XLA-specific MatMul Op. -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/core/framework/op_kernel.h" namespace tensorflow { @@ -73,7 +73,7 @@ class MatMulOp : public XlaOpKernel { bool transpose_b_; }; -REGISTER_XLA_OP("MatMul", MatMulOp); +REGISTER_XLA_OP(Name("MatMul").TypeConstraint("T", kFloatTypes), MatMulOp); class SparseMatMulOp : public MatMulOp { public: @@ -82,7 +82,10 @@ class SparseMatMulOp : public MatMulOp { ~SparseMatMulOp() override = default; }; -REGISTER_XLA_OP("SparseMatMul", SparseMatMulOp); +REGISTER_XLA_OP(Name("SparseMatMul") + .TypeConstraint("Ta", kFloatTypes) + .TypeConstraint("Tb", kFloatTypes), + SparseMatMulOp); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/no_op.cc b/tensorflow/compiler/tf2xla/kernels/no_op.cc index 806bfc604f7..b8f0c0b9fe6 100644 --- a/tensorflow/compiler/tf2xla/kernels/no_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/no_op.cc @@ -14,11 +14,13 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/kernels/no_op.h" -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/core/framework/kernel_def_builder.h" namespace tensorflow { -REGISTER_XLA_OP("NoOp", NoOp); +// XLA_* devices also register a "real" NoOp operator so we suppress the +// dummy operator using CompilationOnly(). +REGISTER_XLA_OP(Name("NoOp").CompilationOnly(), NoOp); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/one_hot_op.cc b/tensorflow/compiler/tf2xla/kernels/one_hot_op.cc new file mode 100644 index 00000000000..2a9cfcb2eb8 --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/one_hot_op.cc @@ -0,0 +1,82 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// XLA implementation of OneHot operator. + +#include "tensorflow/compiler/tf2xla/literal_util.h" +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" + +namespace tensorflow { +namespace { + +class OneHotOp : public XlaOpKernel { + public: + explicit OneHotOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("axis", &axis_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + const TensorShape indices_shape = ctx->InputShape(0); + const TensorShape depth_shape = ctx->InputShape(1); + const TensorShape on_value_shape = ctx->InputShape(2); + const TensorShape off_value_shape = ctx->InputShape(3); + + const int indices_dims = indices_shape.dims(); + const int output_dims = indices_dims + 1; + + // Preliminary validation of sizes. + OP_REQUIRES( + ctx, axis_ == -1 || (axis_ >= 0 && axis_ < output_dims), + errors::InvalidArgument("Expected axis to be -1 or between [0, ", + output_dims, "). But received: ", axis_)); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(depth_shape), + errors::InvalidArgument("depth must be a scalar, but got: ", + depth_shape.DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(on_value_shape), + errors::InvalidArgument("on_value must be a scalar, but got: ", + on_value_shape.DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(off_value_shape), + errors::InvalidArgument("off_value must be a scalar, but got: ", + off_value_shape.DebugString())); + + const int axis = (axis_ == -1) ? indices_dims : axis_; + + // The one-hot dimension. + int64 depth; + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(1, &depth)); + OP_REQUIRES( + ctx, depth >= 0, + errors::InvalidArgument("depth must be non-negative, got: ", depth)); + + xla::ComputationDataHandle one_hot; + OP_REQUIRES_OK( + ctx, XlaHelpers::OneHot(ctx->builder(), depth, axis, input_type(0), + indices_shape, ctx->Input(0), ctx->Input(2), + ctx->Input(3), &one_hot)); + ctx->SetOutput(0, one_hot); + } + + private: + int32 axis_; + + TF_DISALLOW_COPY_AND_ASSIGN(OneHotOp); +}; + +REGISTER_XLA_OP(Name("OneHot"), OneHotOp); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/pack_op.cc b/tensorflow/compiler/tf2xla/kernels/pack_op.cc index 7456d92de03..a4318e29d25 100644 --- a/tensorflow/compiler/tf2xla/kernels/pack_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/pack_op.cc @@ -19,9 +19,9 @@ limitations under the License. 
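For intuition about what the OneHot lowering above produces, here is a naive reference for the default axis == -1 case (a hypothetical helper with assumed example values, not the XLA implementation):

```cpp
#include <vector>

std::vector<std::vector<float>> OneHotLastAxis(const std::vector<int>& indices,
                                               int depth, float on,
                                               float off) {
  std::vector<std::vector<float>> out(indices.size(),
                                      std::vector<float>(depth, off));
  for (size_t i = 0; i < indices.size(); ++i) {
    // Out-of-range indices produce an all-off row, matching TF semantics.
    if (indices[i] >= 0 && indices[i] < depth) out[i][indices[i]] = on;
  }
  return out;
}
// OneHotLastAxis({0, 2}, 3, 1.0f, 0.0f) == {{1, 0, 0}, {0, 0, 1}}
```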
#include <vector> #include "tensorflow/compiler/tf2xla/type_util.h" -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" @@ -87,7 +87,7 @@ class PackOp : public XlaOpKernel { int axis_; }; -REGISTER_XLA_OP("Pack", PackOp); +REGISTER_XLA_OP(Name("Pack"), PackOp); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/pad_op.cc b/tensorflow/compiler/tf2xla/kernels/pad_op.cc index 2846414c5ec..22476f4a0c5 100644 --- a/tensorflow/compiler/tf2xla/kernels/pad_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/pad_op.cc @@ -14,9 +14,9 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/tf2xla/type_util.h" -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/register_types.h" @@ -74,7 +74,7 @@ class PadOp : public XlaOpKernel { } }; -REGISTER_XLA_OP("Pad", PadOp); +REGISTER_XLA_OP(Name("Pad"), PadOp); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc index 7a1ce2db85c..2b6053d19dd 100644 --- a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc @@ -16,9 +16,9 @@ limitations under the License. // XLA specific pooling ops. #include "tensorflow/compiler/tf2xla/type_util.h" -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/util.h" @@ -35,19 +35,21 @@ namespace { // Superclass of pooling ops. class PoolingOp : public XlaOpKernel { public: - explicit PoolingOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { - // Data format doesn't matter since the kernel is specified explicitly. + PoolingOp(OpKernelConstruction* ctx, int num_spatial_dims) + : XlaOpKernel(ctx), num_spatial_dims_(num_spatial_dims) { std::vector<int32> ksize_int; std::vector<int32> stride_int; OP_REQUIRES_OK(ctx, ctx->GetAttr("ksize", &ksize_int)); - OP_REQUIRES(ctx, ksize_int.size() == 4, + OP_REQUIRES(ctx, ksize_int.size() == num_dims(), errors::InvalidArgument("Sliding window ksize field must " - "specify 4 dimensions")); + "specify ", + num_dims(), " dimensions")); OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &stride_int)); - OP_REQUIRES(ctx, stride_int.size() == 4, + OP_REQUIRES(ctx, stride_int.size() == num_dims(), errors::InvalidArgument("Sliding window stride field must " - "specify 4 dimensions")); - for (int i = 0; i < 4; ++i) { + "specify ", + num_dims(), " dimensions")); + for (int i = 0; i < num_dims(); ++i) { ksize_.push_back(ksize_int[i]); stride_.push_back(stride_int[i]); } @@ -56,6 +58,8 @@ class PoolingOp : public XlaOpKernel { padding_ = (padding == VALID) ?
xla::Padding::kValid : xla::Padding::kSame; } + int num_dims() const { return num_spatial_dims_ + 2; } + // Method that builds an initial value to use in reductions. virtual xla::ComputationDataHandle InitValue(xla::ComputationBuilder* b, DataType data_type) = 0; @@ -73,6 +77,11 @@ class PoolingOp : public XlaOpKernel { xla::ComputationDataHandle input = ctx->Input(0); const TensorShape input_shape = ctx->InputShape(0); + OP_REQUIRES(ctx, input_shape.dims() == num_dims(), + errors::InvalidArgument("Input to ", type_string(), + " operator must have ", num_dims(), + " dimensions")); + const DataType type = input_type(0); xla::ComputationDataHandle pooled = ctx->builder()->ReduceWindow( input, InitValue(ctx->builder(), type), *Reduction(ctx, type), ksize_, @@ -81,14 +90,17 @@ class PoolingOp : public XlaOpKernel { } protected: + const int num_spatial_dims_; std::vector<int64> ksize_; std::vector<int64> stride_; xla::Padding padding_; + TensorFormat data_format_ = FORMAT_NHWC; }; class MaxPoolOp : public PoolingOp { public: - explicit MaxPoolOp(OpKernelConstruction* ctx) : PoolingOp(ctx) {} + MaxPoolOp(OpKernelConstruction* ctx, int num_spatial_dims) + : PoolingOp(ctx, /*num_spatial_dims=*/num_spatial_dims) {} xla::ComputationDataHandle InitValue(xla::ComputationBuilder* b, DataType data_type) override { @@ -107,7 +119,24 @@ class MaxPoolOp : public PoolingOp { } }; -REGISTER_XLA_OP("MaxPool", MaxPoolOp); +class MaxPool2DOp : public MaxPoolOp { + public: + explicit MaxPool2DOp(OpKernelConstruction* ctx) + : MaxPoolOp(ctx, /*num_spatial_dims=*/2) { + string data_format_str; + OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str)); + OP_REQUIRES(ctx, FormatFromString(data_format_str, &data_format_), + errors::InvalidArgument("Invalid data format")); + } +}; +REGISTER_XLA_OP(Name("MaxPool"), MaxPool2DOp); + +class MaxPool3DOp : public MaxPoolOp { + public: + explicit MaxPool3DOp(OpKernelConstruction* ctx) + : MaxPoolOp(ctx, /*num_spatial_dims=*/3) {} +}; +REGISTER_XLA_OP(Name("MaxPool3D"), MaxPool3DOp); // Common computation shared between AvgPool and AvgPoolGrad. Divide each // element of an image by the count of elements that contributed to that @@ -116,7 +145,7 @@ static xla::ComputationDataHandle AvgPoolDivideByCount( XlaOpKernelContext* ctx, const xla::ComputationDataHandle& output, DataType dtype, const TensorShape& input_shape, xla::Padding padding, const std::vector<int64>& ksize, const std::vector<int64>& stride, - TensorFormat data_format) { + int num_spatial_dims, TensorFormat data_format) { if (padding == xla::Padding::kValid) { // In VALID padding, all windows have the same number of elements // contributing to each average. Divide by the window size everywhere to @@ -134,34 +163,37 @@ static xla::ComputationDataHandle AvgPoolDivideByCount( // TODO(phawkins): use a less brute-force way to compute this. Only // the boundary regions will have interesting values here.
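The hunk below generalizes this count computation to N spatial dimensions. In one dimension, the ReduceWindow-over-ones trick yields counts like the following standalone sketch (SAME-padding arithmetic assumed):

```cpp
#include <algorithm>
#include <vector>

// counts[o] = number of real (unpadded) inputs covered by output window o.
std::vector<int> WindowCounts(int input, int ksize, int stride) {
  int output = (input + stride - 1) / stride;  // SAME output size
  int pad_total = std::max((output - 1) * stride + ksize - input, 0);
  int pad_lo = pad_total / 2;
  std::vector<int> counts(output, 0);
  for (int o = 0; o < output; ++o) {
    for (int k = 0; k < ksize; ++k) {
      int idx = o * stride - pad_lo + k;
      if (idx >= 0 && idx < input) ++counts[o];
    }
  }
  return counts;  // e.g. WindowCounts(4, 3, 1) == {2, 3, 3, 2}
}
```

As the TODO notes, only boundary windows have counts below ksize; interior windows all see the full window.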
- int height_dim = GetTensorDimIndex(data_format, 'H'); - int width_dim = GetTensorDimIndex(data_format, 'W'); - CHECK_LT(height_dim, width_dim); + std::vector<int64> input_dim_sizes(num_spatial_dims); + std::vector<int64> window_dims(num_spatial_dims); + std::vector<int64> window_ksize(num_spatial_dims); + std::vector<int64> window_stride(num_spatial_dims); + for (int i = 0; i < num_spatial_dims; ++i) { + int dim = GetTensorSpatialDimIndex(num_spatial_dims + 2, data_format, i); + input_dim_sizes[i] = input_shape.dim_size(dim); + window_dims[i] = dim; + window_ksize[i] = ksize[dim]; + window_stride[i] = stride[dim]; + } // Build a matrix of all 1s, with the same width/height as the input. auto ones = ctx->builder()->Broadcast( - XlaHelpers::One(ctx->builder(), dtype), - {input_shape.dim_size(height_dim), input_shape.dim_size(width_dim)}); + XlaHelpers::One(ctx->builder(), dtype), input_dim_sizes); // Perform a ReduceWindow with the same window size, strides, and padding // to count the number of contributions to each result element. auto counts = ctx->builder()->ReduceWindow( ones, XlaHelpers::Zero(ctx->builder(), dtype), - *ctx->GetOrCreateAdd(dtype), {ksize[height_dim], ksize[width_dim]}, - {stride[height_dim], stride[width_dim]}, xla::Padding::kSame); + *ctx->GetOrCreateAdd(dtype), window_ksize, window_stride, + xla::Padding::kSame); - return ctx->builder()->Div(output, counts, {height_dim, width_dim}); + return ctx->builder()->Div(output, counts, window_dims); } } class AvgPoolOp : public PoolingOp { public: - explicit AvgPoolOp(OpKernelConstruction* ctx) : PoolingOp(ctx) { - string data_format_str; - OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str)); - OP_REQUIRES(ctx, FormatFromString(data_format_str, &data_format_), - errors::InvalidArgument("Invalid data format")); - } + AvgPoolOp(OpKernelConstruction* ctx, int num_spatial_dims) + : PoolingOp(ctx, num_spatial_dims) {} xla::ComputationDataHandle InitValue(xla::ComputationBuilder* b, DataType data_type) override { @@ -177,14 +209,29 @@ class AvgPoolOp : public PoolingOp { XlaOpKernelContext* ctx, const xla::ComputationDataHandle& output, DataType dtype, const TensorShape& input_shape) override { return AvgPoolDivideByCount(ctx, output, dtype, input_shape, padding_, - ksize_, stride_, data_format_); + ksize_, stride_, num_spatial_dims_, + data_format_); } - - private: - TensorFormat data_format_; }; -REGISTER_XLA_OP("AvgPool", AvgPoolOp); +class AvgPool2DOp : public AvgPoolOp { + public: + explicit AvgPool2DOp(OpKernelConstruction* ctx) + : AvgPoolOp(ctx, /*num_spatial_dims=*/2) { + string data_format_str; + OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str)); + OP_REQUIRES(ctx, FormatFromString(data_format_str, &data_format_), + errors::InvalidArgument("Invalid data format")); + } +}; +REGISTER_XLA_OP(Name("AvgPool"), AvgPool2DOp); + +class AvgPool3DOp : public AvgPoolOp { + public: + explicit AvgPool3DOp(OpKernelConstruction* ctx) + : AvgPoolOp(ctx, /*num_spatial_dims=*/3) {} +}; +REGISTER_XLA_OP(Name("AvgPool3D"), AvgPool3DOp); // The operation to compute MaxPool gradients. // It takes three inputs: @@ -194,35 +241,39 @@ REGISTER_XLA_OP("AvgPool", AvgPoolOp); // It produces one output: backprop tensor for input.
class MaxPoolGradOp : public XlaOpKernel { public: - explicit MaxPoolGradOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { - string data_format; - OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format)); - OP_REQUIRES(ctx, FormatFromString(data_format, &data_format_), - errors::InvalidArgument("Invalid data format")); + MaxPoolGradOp(OpKernelConstruction* ctx, int num_spatial_dims) + : XlaOpKernel(ctx), num_spatial_dims_(num_spatial_dims) { OP_REQUIRES_OK(ctx, ctx->GetAttr("ksize", &ksize_)); - OP_REQUIRES(ctx, ksize_.size() == 4, + OP_REQUIRES(ctx, ksize_.size() == num_dims(), errors::InvalidArgument("Sliding window ksize field must " - "specify 4 dimensions")); + "specify ", + num_dims(), " dimensions")); OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &stride_)); - OP_REQUIRES(ctx, stride_.size() == 4, + OP_REQUIRES(ctx, stride_.size() == num_dims(), errors::InvalidArgument("Sliding window strides field must " - "specify 4 dimensions")); + "specify ", + num_dims(), " dimensions")); OP_REQUIRES_OK(ctx, ctx->GetAttr("padding", &padding_)); } + int num_dims() const { return num_spatial_dims_ + 2; } + void Compile(XlaOpKernelContext* ctx) override { const TensorShape tensor_in_shape = ctx->InputShape(0); const TensorShape tensor_out_shape = ctx->InputShape(1); const TensorShape out_backprop_shape = ctx->InputShape(2); - // For maxpooling, tensor_in should have 4 dimensions. - OP_REQUIRES(ctx, tensor_in_shape.dims() == 4, - errors::InvalidArgument("tensor_in must be 4-dimensional")); - OP_REQUIRES(ctx, tensor_out_shape.dims() == 4, - errors::InvalidArgument("tensor_out must be 4-dimensional")); - // For maxpooling, out_backprop should have 4 dimensions. - OP_REQUIRES(ctx, out_backprop_shape.dims() == 4, - errors::InvalidArgument("out_backprop must be 4-dimensional")); + // For maxpooling, tensor_in should have num_dims() dimensions. + OP_REQUIRES(ctx, tensor_in_shape.dims() == num_dims(), + errors::InvalidArgument("tensor_in must be ", num_dims(), + "-dimensional")); + OP_REQUIRES(ctx, tensor_out_shape.dims() == num_dims(), + errors::InvalidArgument("tensor_out must be ", num_dims(), + "-dimensional")); + // For maxpooling, out_backprop should have num_dims() dimensions. + OP_REQUIRES(ctx, out_backprop_shape.dims() == num_dims(), + errors::InvalidArgument("out_backprop must be ", num_dims(), + "-dimensional")); // TODO(phawkins): The XLA version doesn't need tensor_out. Investigate // whether this is a good time/space tradeoff. 
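The length checks above all key off num_dims() == num_spatial_dims_ + 2; for reference, the implied attribute shapes (example values assumed):

```cpp
// MaxPool   (2-D, NHWC):  ksize/strides have 4 entries, e.g. {1, kH, kW, 1}
// MaxPool3D (3-D, NDHWC): ksize/strides have 5 entries, e.g. {1, kD, kH, kW, 1}
```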
@@ -245,55 +296,74 @@ class MaxPoolGradOp : public XlaOpKernel { ctx->SetOutput(0, gradients); } - private: + protected: + const int num_spatial_dims_; std::vector<int32> ksize_; std::vector<int32> stride_; Padding padding_; - TensorFormat data_format_; + TensorFormat data_format_ = FORMAT_NHWC; }; -REGISTER_XLA_OP("MaxPoolGrad", MaxPoolGradOp); - -// Average-pooling gradient -class AvgPoolGradOp : public XlaOpKernel { +class MaxPool2DGradOp : public MaxPoolGradOp { public: - explicit AvgPoolGradOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { + explicit MaxPool2DGradOp(OpKernelConstruction* ctx) + : MaxPoolGradOp(ctx, /*num_spatial_dims=*/2) { string data_format; OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format)); OP_REQUIRES(ctx, FormatFromString(data_format, &data_format_), errors::InvalidArgument("Invalid data format")); + } +}; +REGISTER_XLA_OP(Name("MaxPoolGrad"), MaxPool2DGradOp); + +class MaxPool3DGradOp : public MaxPoolGradOp { + public: + explicit MaxPool3DGradOp(OpKernelConstruction* ctx) + : MaxPoolGradOp(ctx, /*num_spatial_dims=*/3) {} +}; +REGISTER_XLA_OP(Name("MaxPool3DGrad"), MaxPool3DGradOp); + +// Average-pooling gradient +class AvgPoolGradOp : public XlaOpKernel { + public: + AvgPoolGradOp(OpKernelConstruction* ctx, int num_spatial_dims) + : XlaOpKernel(ctx), num_spatial_dims_(num_spatial_dims) { OP_REQUIRES_OK(ctx, ctx->GetAttr("ksize", &ksize_)); - OP_REQUIRES(ctx, ksize_.size() == 4, + OP_REQUIRES(ctx, ksize_.size() == num_dims(), errors::InvalidArgument("Sliding window ksize field must " - "specify 4 dimensions")); + "specify ", + num_dims(), " dimensions")); OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &stride_)); - OP_REQUIRES(ctx, stride_.size() == 4, + OP_REQUIRES(ctx, stride_.size() == num_dims(), errors::InvalidArgument("Sliding window strides field must " - "specify 4 dimensions")); + "specify ", + num_dims(), " dimensions")); OP_REQUIRES_OK(ctx, ctx->GetAttr("padding", &padding_)); OP_REQUIRES(ctx, ksize_[0] == 1 && stride_[0] == 1, errors::Unimplemented( "Pooling is not yet supported on the batch dimension.")); } + int num_dims() const { return num_spatial_dims_ + 2; } + void Compile(XlaOpKernelContext* ctx) override { TensorShape gradients_shape; OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(0, &gradients_shape)); const TensorShape out_backprop_shape = ctx->InputShape(1); - // For avgpooling, tensor_in_shape should have 1 dimension, and 4 elements. - OP_REQUIRES( - ctx, gradients_shape.dims() == 4, - errors::InvalidArgument("orig_input_shape must have 4 elements")); + // For avgpooling, tensor_in_shape should have num_dims() dimensions. + OP_REQUIRES(ctx, gradients_shape.dims() == num_dims(), + errors::InvalidArgument("orig_input_shape must be ", num_dims(), + "-dimensional")); - // For avgpooling, out_backprop should have 4 dimensions. - OP_REQUIRES(ctx, out_backprop_shape.dims() == 4, - errors::InvalidArgument("out_backprop must be 4-dimensional")); + // For avgpooling, out_backprop should have num_dims() dimensions.
+ OP_REQUIRES(ctx, out_backprop_shape.dims() == num_dims(), + errors::InvalidArgument("out_backprop must be ", num_dims(), + "-dimensional")); - int height_dim = GetTensorDimIndex(data_format_, 'H'); - int width_dim = GetTensorDimIndex(data_format_, 'W'); - int depth = GetTensorDim(out_backprop_shape, data_format_, 'C'); + int depth_dim = GetTensorFeatureDimIndex(num_dims(), data_format_); + int64 depth = out_backprop_shape.dim_size(depth_dim); // We can think of average-pooling as: // * a convolution with a kernel consisting entirely of 1s, where the @@ -308,16 +378,23 @@ class AvgPoolGradOp : public XlaOpKernel { // For an explanation of backpropagation for convolution, see the comments // in third_party/tensorflow/core/kernels/conv_grad_ops.h - // TF filter shape is [ H, W, inC, outC ] - TensorShape filter_shape( - {ksize_[height_dim], ksize_[width_dim], depth, depth}); + // TF filter shape is [ H, W, ..., inC, outC ] + std::vector<int64> filter_dims(num_dims()); + for (int i = 0; i < num_spatial_dims_; ++i) { + int dim = GetTensorSpatialDimIndex(num_dims(), data_format_, i); + filter_dims[i] = ksize_[dim]; + } + filter_dims[num_dims() - 2] = depth; + filter_dims[num_dims() - 1] = depth; + TensorShape filter_shape(filter_dims); // Reuse the logic from Conv2DBackpropInput to compute padding. - Conv2DBackpropDimensions dims; + ConvBackpropDimensions dims; OP_REQUIRES_OK( - ctx, Conv2DBackpropComputeDimensions( - "AvgPoolGrad", gradients_shape, filter_shape, - out_backprop_shape, stride_, padding_, data_format_, &dims)); + ctx, ConvBackpropComputeDimensions( + type_string(), /*num_spatial_dims=*/num_spatial_dims_, + gradients_shape, filter_shape, out_backprop_shape, stride_, + padding_, data_format_, &dims)); auto out_backprop = ctx->Input(1); @@ -332,43 +409,60 @@ class AvgPoolGradOp : public XlaOpKernel { // Divide the out_backprop values by the counts for each spatial position. std::vector<int64> stride_int64s(stride_.begin(), stride_.end()); - auto out_backprop_div = - AvgPoolDivideByCount(ctx, out_backprop, dtype, gradients_shape, - xla_padding, ksize_, stride_int64s, data_format_); + auto out_backprop_div = AvgPoolDivideByCount( + ctx, out_backprop, dtype, gradients_shape, xla_padding, ksize_, + stride_int64s, num_spatial_dims_, data_format_); // Pad the gradients in the spatial dimensions. We use the same padding // as Conv2DBackpropInput.
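The padded-gradient-plus-ReduceWindow construction that follows computes the same thing as this naive 1-D reference for the VALID case (a hypothetical sketch, not the XLA graph):

```cpp
#include <vector>

// Every input in window o contributed 1/ksize of that window's average,
// so it receives an equal 1/ksize share of the incoming gradient.
std::vector<float> AvgPoolGrad1D(const std::vector<float>& out_backprop,
                                 int input, int ksize, int stride) {
  std::vector<float> in_backprop(input, 0.0f);
  for (size_t o = 0; o < out_backprop.size(); ++o) {
    for (int k = 0; k < ksize; ++k) {
      in_backprop[o * stride + k] += out_backprop[o] / ksize;
    }
  }
  return in_backprop;
}
// AvgPoolGrad1D({1.0f, 1.0f}, 4, 2, 2) == {0.5f, 0.5f, 0.5f, 0.5f}
```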
- xla::PaddingConfig padding_config = xla::MakeNoPaddingConfig(4); - auto* row_padding = padding_config.mutable_dimensions(height_dim); - row_padding->set_edge_padding_low(dims.rows.pad_before); - row_padding->set_edge_padding_high(dims.rows.pad_after); - row_padding->set_interior_padding(dims.rows.stride - 1); - - auto* col_padding = padding_config.mutable_dimensions(width_dim); - col_padding->set_edge_padding_low(dims.cols.pad_before); - col_padding->set_edge_padding_high(dims.cols.pad_after); - col_padding->set_interior_padding(dims.cols.stride - 1); + xla::PaddingConfig padding_config = xla::MakeNoPaddingConfig(num_dims()); + for (int i = 0; i < num_spatial_dims_; ++i) { + int dim = GetTensorSpatialDimIndex(num_dims(), data_format_, i); + auto* padding = padding_config.mutable_dimensions(dim); + padding->set_edge_padding_low(dims.spatial_dims[i].pad_before); + padding->set_edge_padding_high(dims.spatial_dims[i].pad_after); + padding->set_interior_padding(dims.spatial_dims[i].stride - 1); + } auto zero = XlaHelpers::Zero(ctx->builder(), dtype); auto padded_gradients = ctx->builder()->Pad(out_backprop_div, zero, padding_config); // in_backprop = padded_gradients <conv> ones + std::vector<int64> ones(num_dims(), 1LL); xla::ComputationDataHandle in_backprop = ctx->builder()->ReduceWindow( padded_gradients, zero, *ctx->GetOrCreateAdd(dtype), ksize_, - /* window_strides = */ {1, 1, 1, 1}, xla::Padding::kValid); + /* window_strides=*/ones, xla::Padding::kValid); ctx->SetOutput(0, in_backprop); } - private: + protected: + const int num_spatial_dims_; std::vector<int32> ksize_; std::vector<int32> stride_; Padding padding_; - TensorFormat data_format_; + TensorFormat data_format_ = FORMAT_NHWC; }; -REGISTER_XLA_OP("AvgPoolGrad", AvgPoolGradOp); +class AvgPool2DGradOp : public AvgPoolGradOp { + public: + explicit AvgPool2DGradOp(OpKernelConstruction* ctx) + : AvgPoolGradOp(ctx, /*num_spatial_dims=*/2) { + string data_format; + OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format)); + OP_REQUIRES(ctx, FormatFromString(data_format, &data_format_), + errors::InvalidArgument("Invalid data format")); + } +}; +REGISTER_XLA_OP(Name("AvgPoolGrad"), AvgPool2DGradOp); + +class AvgPool3DGradOp : public AvgPoolGradOp { + public: + explicit AvgPool3DGradOp(OpKernelConstruction* ctx) + : AvgPoolGradOp(ctx, /*num_spatial_dims=*/3) {} +}; +REGISTER_XLA_OP(Name("AvgPool3DGrad"), AvgPool3DGradOp); } // anonymous namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/random_ops.cc b/tensorflow/compiler/tf2xla/kernels/random_ops.cc index 4ffe278d1c4..66b99665cbe 100644 --- a/tensorflow/compiler/tf2xla/kernels/random_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/random_ops.cc @@ -18,9 +18,10 @@ limitations under the License. // TODO(misard,phawkins): add tests.
#include "tensorflow/compiler/tf2xla/shape_util.h" -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" @@ -51,7 +52,7 @@ class RandomUniformOp : public XlaOpKernel { TF_DISALLOW_COPY_AND_ASSIGN(RandomUniformOp); }; -REGISTER_XLA_OP("RandomUniform", RandomUniformOp); +REGISTER_XLA_OP(Name("RandomUniform"), RandomUniformOp); class RandomUniformIntOp : public XlaOpKernel { public: @@ -82,7 +83,7 @@ class RandomUniformIntOp : public XlaOpKernel { TF_DISALLOW_COPY_AND_ASSIGN(RandomUniformIntOp); }; -REGISTER_XLA_OP("RandomUniformInt", RandomUniformIntOp); +REGISTER_XLA_OP(Name("RandomUniformInt"), RandomUniformIntOp); class RandomStandardNormalOp : public XlaOpKernel { public: @@ -110,7 +111,79 @@ class RandomStandardNormalOp : public XlaOpKernel { TF_DISALLOW_COPY_AND_ASSIGN(RandomStandardNormalOp); }; -REGISTER_XLA_OP("RandomStandardNormal", RandomStandardNormalOp); +REGISTER_XLA_OP(Name("RandomStandardNormal"), RandomStandardNormalOp); + +class TruncatedNormalOp : public XlaOpKernel { + public: + explicit TruncatedNormalOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} + + void Compile(XlaOpKernelContext* ctx) override { + const DataType dtype = output_type(0); + + TensorShape shape; + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(0, &shape)); + xla::Shape xla_shape; + OP_REQUIRES_OK(ctx, TensorShapeToXLAShape(dtype, shape, &xla_shape)); + xla::Shape xla_element_shape = + xla::ShapeUtil::MakeShape(xla_shape.element_type(), {}); + + xla::ComputationBuilder* b = ctx->builder(); + xla::ComputationDataHandle mean = XlaHelpers::Zero(b, dtype); + xla::ComputationDataHandle stddev = XlaHelpers::One(b, dtype); + xla::ComputationDataHandle candidate = + b->RngNormal(mean, stddev, xla_shape); + + auto two_sd = [dtype](bool negate, xla::ComputationBuilder* b) { + return XlaHelpers::FloatLiteral(b, dtype, negate ? 
-2.0 : 2.0); + }; + auto out_of_range_mask = [two_sd](xla::ComputationDataHandle candidate, + xla::ComputationBuilder* b) { + xla::ComputationDataHandle too_large = b->Gt(candidate, two_sd(false, b)); + xla::ComputationDataHandle too_small = b->Lt(candidate, two_sd(true, b)); + return b->LogicalOr(too_large, too_small); + }; + + // The algorithm we're using is roughly: + // + // while (any(candidate < mean-2*sd || candidate > mean+2*sd)) { + // out_of_range_mask := candidate < mean-2*sd || candidate > mean+2*sd + // candidate = select(out_of_range_mask, rng_normal(), candidate) + // } + std::unique_ptr<xla::ComputationBuilder> test_builder = + b->CreateSubBuilder("truncated_normal_test"); + { + auto* b = test_builder.get(); + xla::ComputationDataHandle candidate = + b->Parameter(0, xla_shape, "candidate"); + xla::ComputationDataHandle oor_mask = out_of_range_mask(candidate, b); + OP_REQUIRES_OK(ctx, Any(oor_mask, b).status()); + } + + std::unique_ptr<xla::ComputationBuilder> body_builder = + b->CreateSubBuilder("truncated_normal_body"); + { + auto* b = body_builder.get(); + xla::ComputationDataHandle candidate = + b->Parameter(0, xla_shape, "candidate"); + xla::ComputationDataHandle to_resample = out_of_range_mask(candidate, b); + xla::ComputationDataHandle mean = XlaHelpers::Zero(b, dtype); + xla::ComputationDataHandle stddev = XlaHelpers::One(b, dtype); + b->Select(to_resample, b->RngNormal(mean, stddev, xla_shape), candidate); + } + + xla::StatusOr<xla::Computation> test_computation = test_builder->Build(); + OP_REQUIRES_OK(ctx, test_computation.status()); + xla::StatusOr<xla::Computation> body_computation = body_builder->Build(); + OP_REQUIRES_OK(ctx, body_computation.status()); + xla::ComputationDataHandle result = + b->While(test_computation.ValueOrDie(), body_computation.ValueOrDie(), + candidate); + + ctx->SetOutput(0, result); + } +}; + +REGISTER_XLA_OP(Name("TruncatedNormal"), TruncatedNormalOp); } // anonymous namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc b/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc index ac929af2e2b..518a9372c4f 100644 --- a/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc @@ -17,8 +17,8 @@ limitations under the License.
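The While(test, body) computation built above implements rejection resampling: keep the in-range draws and redraw any element outside two standard deviations until none remain. A minimal per-element sketch in plain C++, with std::mt19937/std::normal_distribution standing in for RngNormal and the masked Select (illustrative only, not the XLA code path):

```c++
#include <cstdio>
#include <random>

// Draw from N(0, 1) and redraw any sample outside (-2, 2); this is the
// scalar view of the tensor-wide test/body loop in TruncatedNormalOp.
double TruncatedNormal(std::mt19937& gen) {
  std::normal_distribution<double> dist(0.0, 1.0);
  double candidate = dist(gen);
  while (candidate < -2.0 || candidate > 2.0) {  // out-of-range test
    candidate = dist(gen);                       // resample until in range
  }
  return candidate;
}

int main() {
  std::mt19937 gen(42);
  for (int i = 0; i < 4; ++i) std::printf("%f\n", TruncatedNormal(gen));
}
```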
#include "tensorflow/compiler/tf2xla/kernels/reduction_ops.h" #include "tensorflow/compiler/tf2xla/type_util.h" -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/core/framework/kernel_def_builder.h" @@ -35,7 +35,7 @@ class SumOp : public XlaReductionOp { } }; -REGISTER_XLA_OP("Sum", SumOp); +REGISTER_XLA_OP(Name("Sum"), SumOp); class ProdOp : public XlaReductionOp { public: @@ -53,7 +53,7 @@ class ProdOp : public XlaReductionOp { } }; -REGISTER_XLA_OP("Prod", ProdOp); +REGISTER_XLA_OP(Name("Prod"), ProdOp); class MinOp : public XlaReductionOp { public: @@ -73,7 +73,7 @@ class MinOp : public XlaReductionOp { } }; -REGISTER_XLA_OP("Min", MinOp); +REGISTER_XLA_OP(Name("Min"), MinOp); class MaxOp : public XlaReductionOp { public: @@ -93,7 +93,7 @@ class MaxOp : public XlaReductionOp { } }; -REGISTER_XLA_OP("Max", MaxOp); +REGISTER_XLA_OP(Name("Max"), MaxOp); class MeanOp : public XlaReductionOp { public: @@ -105,17 +105,17 @@ class MeanOp : public XlaReductionOp { builder->Add(scalar_lhs, scalar_rhs); } - bool BuildFinalizer(xla::ComputationBuilder* builder, - const xla::ComputationDataHandle& scalar_argument, - int64 num_elements_reduced) override { + xla::ComputationDataHandle BuildFinalizer( + xla::ComputationBuilder* builder, + const xla::ComputationDataHandle& reduce_output, + int64 num_elements_reduced) override { auto divisor = XlaHelpers::IntegerLiteral(builder, input_type(0), num_elements_reduced); - builder->Div(scalar_argument, divisor); - return true; + return builder->Div(reduce_output, divisor); } }; -REGISTER_XLA_OP("Mean", MeanOp); +REGISTER_XLA_OP(Name("Mean"), MeanOp); class AllOp : public XlaReductionOp { public: @@ -133,7 +133,7 @@ class AllOp : public XlaReductionOp { } }; -REGISTER_XLA_OP("All", AllOp); +REGISTER_XLA_OP(Name("All"), AllOp); class AnyOp : public XlaReductionOp { public: @@ -151,7 +151,7 @@ class AnyOp : public XlaReductionOp { } }; -REGISTER_XLA_OP("Any", AnyOp); +REGISTER_XLA_OP(Name("Any"), AnyOp); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops.h b/tensorflow/compiler/tf2xla/kernels/reduction_ops.h index 7f0dd26f914..9aca6d8fedf 100644 --- a/tensorflow/compiler/tf2xla/kernels/reduction_ops.h +++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops.h @@ -48,16 +48,15 @@ class XlaReductionOp : public XlaOpKernel { const xla::ComputationDataHandle& scalar_lhs, const xla::ComputationDataHandle& scalar_rhs) = 0; - // Implement the scalar->scalar lambda that should be applied to - // each element to be finalized. The desired computation should be - // added to 'builder' and 'scalar_argument' is the function's - // input. 'num_elements_reduced' is the number of elements that contributed - // to the reduction. If the reduction has a finalizer return true, otherwise - // return false and any computation added to builder will be - // ignored. Defaults to return false. - virtual bool BuildFinalizer(xla::ComputationBuilder* builder, - const xla::ComputationDataHandle& scalar_argument, - int64 num_elements_reduced); + // Applies a transformation to the output of the reduction. The desired + // computation should be added to 'builder'. Argument 'reduce_output' is the + // output of the reduction. 'num_elements_reduced' is the number of elements + // that contributed to the reduction. 
Returns the transformed reduction + // output. Defaults to returning 'reduce_output' unchanged. + virtual xla::ComputationDataHandle BuildFinalizer( + xla::ComputationBuilder* builder, + const xla::ComputationDataHandle& reduce_output, + int64 num_elements_reduced); void Compile(XlaOpKernelContext* ctx) override; diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc b/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc index d6b085e8978..8798c80ad53 100644 --- a/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc +++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc @@ -39,11 +39,11 @@ xla::ComputationDataHandle XlaReductionOp::InitialValue( // Unless BuildFinalizer is overridden the reduction has no // finalizer. -bool XlaReductionOp::BuildFinalizer( +xla::ComputationDataHandle XlaReductionOp::BuildFinalizer( xla::ComputationBuilder* builder, - const xla::ComputationDataHandle& scalar_argument, + const xla::ComputationDataHandle& reduce_output, int64 num_elements_reduced) { - return false; + return reduce_output; } void XlaReductionOp::Compile(XlaOpKernelContext* ctx) { @@ -121,28 +121,14 @@ void XlaReductionOp::Compile(XlaOpKernelContext* ctx) { xla::ComputationDataHandle reduce = ctx->builder()->Reduce(data, initial, reduction_computation, xla_axes); - // Construct the builder for the finalizer lambda. - xla::ComputationBuilder f(ctx->builder()->client(), - strings::StrCat(desc, "-finalizer")); - // Make the scalar parameter of the desired type for the lambda. - xla::ComputationDataHandle fx = - f.Parameter(0, xla::ShapeUtil::MakeShape(type, {}), "x"); - // Call virtual method to build the finalizer lambda. - bool has_finalizer = BuildFinalizer(&f, fx, num_elements_reduced); - xla::Computation finalizer_computation = f.Build().ConsumeValueOrDie(); - xla::ComputationDataHandle pre_reshaped_data; - if (has_finalizer) { - // This reduction Op includes a finalizer so run it as a Map. - pre_reshaped_data = ctx->builder()->Map({reduce}, finalizer_computation); - } else { - pre_reshaped_data = reduce; - } + xla::ComputationDataHandle finalized = + BuildFinalizer(ctx->builder(), reduce, num_elements_reduced); xla::ComputationDataHandle result; if (keep_dims_) { - result = ctx->builder()->Reshape(pre_reshaped_data, final_shape); + result = ctx->builder()->Reshape(finalized, final_shape); } else { - result = pre_reshaped_data; + result = finalized; } ctx->SetOutput(0, result); } diff --git a/tensorflow/compiler/tf2xla/kernels/relu_op.cc b/tensorflow/compiler/tf2xla/kernels/relu_op.cc index 8adac23eeec..a137d28118e 100644 --- a/tensorflow/compiler/tf2xla/kernels/relu_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/relu_op.cc @@ -16,8 +16,8 @@ limitations under the License. // Native XLA implementations of XLA Relu Ops #include "tensorflow/compiler/tf2xla/kernels/cwise_ops.h" -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/core/framework/kernel_def_builder.h" @@ -31,7 +31,7 @@ class ReluOp : public XlaOpKernel { public: explicit ReluOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} // Computes the max of the scalar input x and 0.
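The revised BuildFinalizer contract above replaces the old bool-plus-Map protocol with a direct post-processing hook on the reduction output (identity by default, division by the element count for Mean). A minimal sketch of that pattern under assumed names (SumReduction/MeanReduction are hypothetical stand-ins, not the actual XlaReductionOp hierarchy):

```c++
#include <cstdio>
#include <vector>

struct SumReduction {
  // Default finalizer: pass the raw reduction output through unchanged.
  virtual double Finalize(double reduce_output, long num_elements) const {
    return reduce_output;
  }
  double Run(const std::vector<double>& xs) const {
    double acc = 0.0;
    for (double x : xs) acc += x;           // the reduction proper
    return Finalize(acc, static_cast<long>(xs.size()));
  }
  virtual ~SumReduction() = default;
};

struct MeanReduction : SumReduction {
  // Mean overrides the finalizer: Sum divided by the element count.
  double Finalize(double reduce_output, long num_elements) const override {
    return reduce_output / num_elements;
  }
};

int main() {
  std::vector<double> xs = {1, 2, 3, 4};
  std::printf("sum=%g mean=%g\n", SumReduction().Run(xs),
              MeanReduction().Run(xs));  // sum=10 mean=2.5
}
```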
- void Compile(XlaOpKernelContext* ctx) { + void Compile(XlaOpKernelContext* ctx) override { xla::ComputationBuilder* builder = ctx->builder(); auto zero = XlaHelpers::Zero(builder, input_type(0)); ctx->SetOutput(0, builder->Max(zero, ctx->Input(0))); @@ -42,7 +42,7 @@ class Relu6Op : public XlaOpKernel { public: explicit Relu6Op(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} // Clamp the scalar input between 0 and 6. - void Compile(XlaOpKernelContext* ctx) { + void Compile(XlaOpKernelContext* ctx) override { xla::ComputationBuilder* builder = ctx->builder(); auto zero = XlaHelpers::Zero(builder, input_type(0)); auto six = XlaHelpers::IntegerLiteral(builder, input_type(0), 6); @@ -50,43 +50,44 @@ class Relu6Op : public XlaOpKernel { } }; -// A subclass of a XlaBinaryMapOp must build the lambda computation -// that describes the (scalar,scalar)->scalar function to apply to -// each element of the input. We have to use XlaBinaryMapOp instead of -// XlaBinaryOp here because XLA Select does not do automatic -// broadcasting. -class ReluGradOp : public XlaBinaryMapOp { +class ReluGradOp : public XlaOpKernel { public: - explicit ReluGradOp(OpKernelConstruction* ctx) : XlaBinaryMapOp(ctx) {} + explicit ReluGradOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} // Return the lhs (incoming gradient) if the rhs (input feature) > 0, // otherwise return 0. - void BuildMapLambda(xla::ComputationBuilder* b, - const xla::ComputationDataHandle& gradient, - const xla::ComputationDataHandle& feature) override { - const auto zero = XlaHelpers::Zero(b, input_type(0)); - b->Select(b->Gt(feature, zero), gradient, zero); + void Compile(XlaOpKernelContext* ctx) override { + xla::ComputationBuilder* b = ctx->builder(); + const TensorShape shape = ctx->InputShape(0); + const auto zero = + b->Broadcast(XlaHelpers::Zero(b, input_type(0)), shape.dim_sizes()); + const auto pred = b->Gt(ctx->Input(1), zero); + ctx->SetOutput(0, b->Select(pred, ctx->Input(0), zero)); } }; -class Relu6GradOp : public XlaBinaryMapOp { +class Relu6GradOp : public XlaOpKernel { public: - explicit Relu6GradOp(OpKernelConstruction* ctx) : XlaBinaryMapOp(ctx) {} + explicit Relu6GradOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} // Return the lhs (incoming gradient) if the rhs (input feature) > 0, // otherwise return 0. 
- void BuildMapLambda(xla::ComputationBuilder* b, - const xla::ComputationDataHandle& gradient, - const xla::ComputationDataHandle& feature) override { - const auto zero = XlaHelpers::Zero(b, input_type(0)); - auto six = XlaHelpers::IntegerLiteral(b, input_type(0), 6); - b->Select(b->LogicalAnd(b->Lt(feature, six), b->Gt(feature, zero)), - gradient, zero); + void Compile(XlaOpKernelContext* ctx) override { + xla::ComputationBuilder* b = ctx->builder(); + const TensorShape shape = ctx->InputShape(0); + const auto zero = + b->Broadcast(XlaHelpers::Zero(b, input_type(0)), shape.dim_sizes()); + const auto six = b->Broadcast( + XlaHelpers::IntegerLiteral(b, input_type(0), 6), shape.dim_sizes()); + auto out = b->Select( + b->LogicalAnd(b->Lt(ctx->Input(1), six), b->Gt(ctx->Input(1), zero)), + ctx->Input(0), zero); + ctx->SetOutput(0, out); } }; -REGISTER_XLA_OP("Relu", ReluOp); -REGISTER_XLA_OP("Relu6", Relu6Op); -REGISTER_XLA_OP("ReluGrad", ReluGradOp); -REGISTER_XLA_OP("Relu6Grad", Relu6GradOp); +REGISTER_XLA_OP(Name("Relu"), ReluOp); +REGISTER_XLA_OP(Name("Relu6"), Relu6Op); +REGISTER_XLA_OP(Name("ReluGrad"), ReluGradOp); +REGISTER_XLA_OP(Name("Relu6Grad"), Relu6GradOp); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/reshape_op.cc b/tensorflow/compiler/tf2xla/kernels/reshape_op.cc index febce0e1267..df542350b44 100644 --- a/tensorflow/compiler/tf2xla/kernels/reshape_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/reshape_op.cc @@ -16,9 +16,9 @@ limitations under the License. // XLA-specific reshape Op. #include "tensorflow/compiler/tf2xla/type_util.h" -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" @@ -95,7 +95,7 @@ class ReshapeOp : public XlaOpKernel { } }; -REGISTER_XLA_OP("Reshape", ReshapeOp); +REGISTER_XLA_OP(Name("Reshape"), ReshapeOp); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/retval_op.cc b/tensorflow/compiler/tf2xla/kernels/retval_op.cc index 87d11a38d4c..462267d1504 100644 --- a/tensorflow/compiler/tf2xla/kernels/retval_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/retval_op.cc @@ -13,9 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_context.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/op_kernel.h" @@ -43,7 +43,7 @@ class RetvalOp : public XlaOpKernel { if (frame) { // If 'frame' is non-null, this is an inner function call inside a JIT // compilation. 
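The ReluGrad/Relu6Grad rewrites above drop the scalar map lambda in favor of a broadcast zero tensor and a Select over the whole input. Element by element, the two gradient rules reduce to the conditionals in this plain-C++ sketch (illustrative, not the XLA code path):

```c++
#include <cstdio>

// Pass the incoming gradient through where the forward input was active,
// zero elsewhere; Relu6 additionally zeroes the gradient at the upper clamp.
float ReluGrad(float dy, float x) { return x > 0.0f ? dy : 0.0f; }
float Relu6Grad(float dy, float x) {
  return (x > 0.0f && x < 6.0f) ? dy : 0.0f;
}

int main() {
  std::printf("%g %g %g\n", ReluGrad(1.0f, -1.0f), Relu6Grad(1.0f, 3.0f),
              Relu6Grad(1.0f, 7.0f));  // 0 1 0
}
```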
- frame->SetRetval(index_, input); + OP_REQUIRES_OK(ctx, frame->SetRetval(index_, input)); } else { xla::ComputationDataHandle input = ctx->Input(0); const TensorShape input_shape = ctx->InputShape(0); @@ -58,9 +58,9 @@ class RetvalOp : public XlaOpKernel { if (input_shape.num_elements() == 0 || is_constant.ValueOrDie()) { xla::Literal literal; OP_REQUIRES_OK(ctx, ctx->ConstantInput(0, &literal)); - tc.AddConstRetval(index_, dtype_, literal); + OP_REQUIRES_OK(ctx, tc.AddConstRetval(index_, dtype_, literal)); } else { - tc.AddRetval(index_, input); + tc.AddRetval(index_, dtype_, input); } } } @@ -73,7 +73,7 @@ class RetvalOp : public XlaOpKernel { TF_DISALLOW_COPY_AND_ASSIGN(RetvalOp); }; -REGISTER_XLA_OP("_Retval", RetvalOp); +REGISTER_XLA_OP(Name("_Retval"), RetvalOp); } // anonymous namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/reverse_op.cc b/tensorflow/compiler/tf2xla/kernels/reverse_op.cc new file mode 100644 index 00000000000..7489321f72f --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/reverse_op.cc @@ -0,0 +1,110 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// XLA-specific reverse Op. + +#include "tensorflow/compiler/tf2xla/type_util.h" +#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" + +namespace tensorflow { +namespace { + +class ReverseOp : public XlaOpKernel { + public: + explicit ReverseOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} + + void Compile(XlaOpKernelContext* ctx) override { + // r = tf.reverse(x, revdims) + const TensorShape x_shape = ctx->InputShape(0); + const TensorShape revd_shape = ctx->InputShape(1); + // Validate input sizes. + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(revd_shape), + errors::InvalidArgument("axes must be a vector, not shape ", + revd_shape.DebugString())); + OP_REQUIRES(ctx, revd_shape.num_elements() == x_shape.dims(), + errors::InvalidArgument("axes ", revd_shape.DebugString(), + " must have same number of elements as" + " input tensor has dimensions ", + x_shape.DebugString(), ".")); + if (revd_shape.num_elements() == 0) { + ctx->SetOutput(0, ctx->Input(0)); + return; + } + // ComputationBuilder::Rev() requires concrete values for dimensions arg.
+ xla::Literal lax; + OP_REQUIRES_OK(ctx, ctx->ConstantInputReshaped(1, {x_shape.dims()}, &lax)); + std::vector<bool> revdims(x_shape.dims()); + std::copy(lax.preds().begin(), lax.preds().end(), revdims.begin()); + std::vector<int64> dimensions; + + for (int d = 0; d < x_shape.dims(); ++d) { + if (revdims[d]) { + dimensions.push_back(d); + } + } + + ctx->SetOutput(0, ctx->builder()->Rev(ctx->Input(0), dimensions)); + } +}; + +REGISTER_XLA_OP(Name("Reverse"), ReverseOp); + +class ReverseV2Op : public XlaOpKernel { + public: + explicit ReverseV2Op(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} + + void Compile(XlaOpKernelContext* ctx) override { + // r = tf.reverse(x, axes) + const TensorShape x_shape = ctx->InputShape(0); + const TensorShape axes_shape = ctx->InputShape(1); + // Validate input sizes. + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(axes_shape), + errors::InvalidArgument("axes must be a vector, not shape ", + axes_shape.DebugString())); + OP_REQUIRES(ctx, axes_shape.num_elements() <= x_shape.dims(), + errors::InvalidArgument("axes ", axes_shape.DebugString(), + " can not have more elements" + " than input tensor has dimensions ", + x_shape.DebugString(), ".")); + // Reverse is a no-op if axes argument is empty. + if (axes_shape.num_elements() == 0) { + ctx->SetOutput(0, ctx->Input(0)); + return; + } + // ComputationBuilder::Rev() requires concrete values for dimensions arg. + std::vector<int64> axes; + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1, &axes)); + + for (int d = 0; d < axes.size(); ++d) { + OP_REQUIRES(ctx, (0 <= axes[d]) && (axes[d] < x_shape.dims()), + errors::InvalidArgument(axes[d], " is out of range [0, ", + x_shape.dims(), ").")); + } + + ctx->SetOutput(0, ctx->builder()->Rev(ctx->Input(0), axes)); + } +}; + +REGISTER_XLA_OP(Name("ReverseV2"), ReverseV2Op); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/select_op.cc b/tensorflow/compiler/tf2xla/kernels/select_op.cc index 0fecc338ca5..8081d3c41c4 100644 --- a/tensorflow/compiler/tf2xla/kernels/select_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/select_op.cc @@ -16,9 +16,9 @@ limitations under the License. #include #include "tensorflow/compiler/tf2xla/type_util.h" -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/kernels/bounds_check.h" @@ -40,12 +40,19 @@ class SelectOp : public XlaOpKernel { "'then' and 'else' must have the same size. but received: ", then_shape.DebugString(), " vs.
", else_shape.DebugString())); + xla::ComputationBuilder* builder = ctx->builder(); + + auto cond_handle = ctx->Input(0); + auto then_handle = ctx->Input(1); + auto else_handle = ctx->Input(2); + bool broadcasting = !cond_shape.IsSameSize(then_shape); - if (broadcasting) { - OP_REQUIRES( - ctx, TensorShapeUtils::IsVector(cond_shape), - errors::InvalidArgument("'cond' must be a vector, but saw shape: ", - cond_shape.DebugString())); + bool cond_is_scalar = TensorShapeUtils::IsScalar(cond_shape); + if (broadcasting && !cond_is_scalar) { + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(cond_shape), + errors::InvalidArgument( + "'cond' must be a scalar or a vector, but saw shape: ", + cond_shape.DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsVectorOrHigher(then_shape), errors::InvalidArgument( "'then' must be at least a vector, but saw shape: ", @@ -55,15 +62,7 @@ class SelectOp : public XlaOpKernel { "match size of 'cond', but saw: ", then_shape.dim_size(0), " vs. ", cond_shape.num_elements())); - } - xla::ComputationBuilder* builder = ctx->builder(); - - auto cond_handle = ctx->Input(0); - auto then_handle = ctx->Input(1); - auto else_handle = ctx->Input(2); - - if (broadcasting) { // TODO(phawkins): broadcasting on the right seems pretty awkward in // XLA. It seems we have to broadcast on the left and then Reshape // to get the dimensions in the right order. @@ -84,7 +83,7 @@ class SelectOp : public XlaOpKernel { TF_DISALLOW_COPY_AND_ASSIGN(SelectOp); }; -REGISTER_XLA_OP("Select", SelectOp); +REGISTER_XLA_OP(Name("Select"), SelectOp); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc b/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc index 42ae978c3ce..5b6fa64fa82 100644 --- a/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc @@ -15,9 +15,9 @@ limitations under the License. // XLA-specific sequence and range Ops. -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" @@ -138,7 +138,7 @@ class RangeOp : public XlaOpKernel { } }; -REGISTER_XLA_OP("Range", RangeOp); +REGISTER_XLA_OP(Name("Range"), RangeOp); class LinSpaceOp : public XlaOpKernel { public: @@ -207,7 +207,7 @@ class LinSpaceOp : public XlaOpKernel { } }; -REGISTER_XLA_OP("LinSpace", LinSpaceOp); +REGISTER_XLA_OP(Name("LinSpace"), LinSpaceOp); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/shape_op.cc b/tensorflow/compiler/tf2xla/kernels/shape_op.cc index e7eec1cefda..24a99f253d6 100644 --- a/tensorflow/compiler/tf2xla/kernels/shape_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/shape_op.cc @@ -16,9 +16,9 @@ limitations under the License. // XLA-specific Shape Ops. 
#include "tensorflow/compiler/tf2xla/type_util.h" -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/kernels/bounds_check.h" @@ -48,7 +48,7 @@ class ShapeOp : public XlaOpKernel { } }; -REGISTER_XLA_OP("Shape", ShapeOp); +REGISTER_XLA_OP(Name("Shape"), ShapeOp); class ShapeNOp : public XlaOpKernel { public: @@ -78,7 +78,7 @@ class ShapeNOp : public XlaOpKernel { bool IsExpensive() override { return false; } }; -REGISTER_XLA_OP("ShapeN", ShapeNOp); +REGISTER_XLA_OP(Name("ShapeN"), ShapeNOp); class RankOp : public XlaOpKernel { public: @@ -94,7 +94,7 @@ class RankOp : public XlaOpKernel { } }; -REGISTER_XLA_OP("Rank", RankOp); +REGISTER_XLA_OP(Name("Rank"), RankOp); class SizeOp : public XlaOpKernel { public: @@ -113,7 +113,7 @@ class SizeOp : public XlaOpKernel { } }; -REGISTER_XLA_OP("Size", SizeOp); +REGISTER_XLA_OP(Name("Size"), SizeOp); class ExpandDimsOp : public XlaOpKernel { public: @@ -163,7 +163,7 @@ class ExpandDimsOp : public XlaOpKernel { ctx->SetOutput(0, ctx->builder()->Reshape(ctx->Input(0), new_shape)); } }; -REGISTER_XLA_OP("ExpandDims", ExpandDimsOp); +REGISTER_XLA_OP(Name("ExpandDims"), ExpandDimsOp); class SqueezeOp : public XlaOpKernel { public: @@ -225,7 +225,7 @@ class SqueezeOp : public XlaOpKernel { std::unordered_set squeeze_dims_; }; -REGISTER_XLA_OP("Squeeze", SqueezeOp); +REGISTER_XLA_OP(Name("Squeeze"), SqueezeOp); class ZerosLikeOp : public XlaOpKernel { public: @@ -239,7 +239,21 @@ class ZerosLikeOp : public XlaOpKernel { } }; -REGISTER_XLA_OP("ZerosLike", ZerosLikeOp); +REGISTER_XLA_OP(Name("ZerosLike"), ZerosLikeOp); + +class OnesLikeOp : public XlaOpKernel { + public: + explicit OnesLikeOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} + + void Compile(XlaOpKernelContext* ctx) override { + const TensorShape input_shape = ctx->InputShape(0); + + auto one = XlaHelpers::One(ctx->builder(), input_type(0)); + ctx->SetOutput(0, ctx->builder()->Broadcast(one, input_shape.dim_sizes())); + } +}; + +REGISTER_XLA_OP(Name("OnesLike"), OnesLikeOp); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/slice_op.cc b/tensorflow/compiler/tf2xla/kernels/slice_op.cc index 8ec77e04afe..482c54a40cf 100644 --- a/tensorflow/compiler/tf2xla/kernels/slice_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/slice_op.cc @@ -16,9 +16,9 @@ limitations under the License. // XLA-specific Slice Op. #include "tensorflow/compiler/tf2xla/type_util.h" -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" @@ -50,10 +50,13 @@ class SliceOp : public XlaOpKernel { // slice will be an empty handle if the output has no elements. 
CHECK_EQ(begin.size(), size.size()); std::vector<int64> limits; + limits.reserve(begin.size()); for (int i = 0; i < begin.size(); ++i) { limits.push_back(begin[i] + size[i]); } - ctx->SetOutput(0, ctx->builder()->Slice(ctx->Input(0), begin, limits)); + std::vector<int64> strides(begin.size(), 1); + ctx->SetOutput(0, ctx->builder()->Slice(ctx->Input(0), begin, limits, + strides)); } private: @@ -115,7 +118,7 @@ void SliceOp::SharedValidation(XlaOpKernelContext* ctx, bool* is_identity, } } -REGISTER_XLA_OP("Slice", SliceOp); +REGISTER_XLA_OP(Name("Slice"), SliceOp); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/softmax_op.cc b/tensorflow/compiler/tf2xla/kernels/softmax_op.cc index 06ee5201633..a0d8ab4d73f 100644 --- a/tensorflow/compiler/tf2xla/kernels/softmax_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/softmax_op.cc @@ -15,9 +15,9 @@ limitations under the License. // XLA-specific Ops for softmax. -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" @@ -74,8 +74,53 @@ class SoftmaxOp : public XlaOpKernel { bool log_; }; -REGISTER_XLA_OP("Softmax", SoftmaxOp); -REGISTER_XLA_OP("LogSoftmax", SoftmaxOp); +REGISTER_XLA_OP(Name("Softmax"), SoftmaxOp); +REGISTER_XLA_OP(Name("LogSoftmax"), SoftmaxOp); + +std::pair<xla::ComputationDataHandle, xla::ComputationDataHandle> +CrossEntropyWithLogits(XlaOpKernelContext* ctx, DataType type, + const xla::ComputationDataHandle& logits, + const xla::ComputationDataHandle& labels) { + const xla::Computation& max_func = *ctx->GetOrCreateMax(type); + const xla::Computation& add_func = *ctx->GetOrCreateAdd(type); + + const int kBatchDim = 0; + const int kClassDim = 1; + + xla::ComputationBuilder* b = ctx->builder(); + // Find the max in each batch, resulting in a tensor of shape [batch] + auto logits_max = + b->Reduce(logits, XlaHelpers::MinValue(b, type), max_func, {kClassDim}); + + // Subtract the max in batch b from every element in batch b. + // Broadcasts along the batch dimension. + auto shifted_logits = b->Sub(logits, logits_max, {kBatchDim}); + + // exp(logits - max_logits) + auto exp_shifted_logits = b->Exp(shifted_logits); + + // sum_{class} (exp(logits - max_logits)) + auto sum_exp = b->Reduce(exp_shifted_logits, XlaHelpers::Zero(b, type), + add_func, {kClassDim}); + + // log(sum(exp(logits - max_logits))) + auto log_sum_exp = b->Log(sum_exp); + + // sum(-labels * + // ((logits - max_logits) - log(sum(exp(logits - max_logits))))) + // along classes + // (The subtraction broadcasts along the batch dimension.)
+ xla::ComputationDataHandle loss = b->Reduce( + b->Mul(b->Neg(labels), b->Sub(shifted_logits, log_sum_exp, {kBatchDim})), + XlaHelpers::Zero(b, type), add_func, {kClassDim}); + + // backprop: prob - labels, where + // prob = exp(logits - max_logits) / sum(exp(logits - max_logits)) + // (where the division broadcasts along the batch dimension) + xla::ComputationDataHandle backprop = + b->Sub(b->Div(exp_shifted_logits, sum_exp, {kBatchDim}), labels); + return {loss, backprop}; +} class SoftmaxXentWithLogitsOp : public XlaOpKernel { public: @@ -88,65 +133,95 @@ class SoftmaxXentWithLogitsOp : public XlaOpKernel { OP_REQUIRES(ctx, logits_shape.IsSameSize(labels_shape), errors::InvalidArgument( "logits and labels must be same size: logits_size=", - logits_shape.DebugString(), " labels_size=", - labels_shape.DebugString())); + logits_shape.DebugString(), + " labels_size=", labels_shape.DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(logits_shape), errors::InvalidArgument("logits must be 2-dimensional")); // As we already tested that both inputs have the same shape no need to // check that "labels" is a matrix too. - // loss is 1-D (one per example), and size is batch_size. - - const int kBatchDim = 0; - const int kClassDim = 1; - const DataType type = input_type(0); - xla::ComputationBuilder* b = ctx->builder(); auto logits = ctx->Input(0); auto labels = ctx->Input(1); - const xla::Computation& max_func = *ctx->GetOrCreateMax(type); - const xla::Computation& add_func = *ctx->GetOrCreateAdd(type); - - // Find the max in each batch, resulting in a tensor of shape [batch] - auto logits_max = - b->Reduce(logits, XlaHelpers::MinValue(b, type), max_func, {kClassDim}); - - // Subtract the max in batch b from every element in batch b. - // Broadcasts along the batch dimension. - auto shifted_logits = b->Sub(logits, logits_max, {kBatchDim}); - - // exp(logits - max_logits) - auto exp_shifted_logits = b->Exp(shifted_logits); - - // sum_{class} (exp(logits - max_logits)) - auto sum_exp = b->Reduce(exp_shifted_logits, XlaHelpers::Zero(b, type), - add_func, {kClassDim}); - - // log(sum(exp(logits - max_logits))) - auto log_sum_exp = b->Log(sum_exp); - - // sum(-labels * - // ((logits - max_logits) - log(sum(exp(logits - max_logits))))) - // along classes - // (The subtraction broadcasts along the batch dimension.) 
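The shared CrossEntropyWithLogits above is the standard numerically stable softmax cross-entropy: shift by the per-row max, compute the loss via log-sum-exp, and obtain the gradient as softmax minus labels. A single-row plain-C++ sketch of the same arithmetic (names are illustrative):

```c++
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// loss = -sum(labels * (shifted - log(sum(exp(shifted)))))
// backprop = softmax(logits) - labels, with shifted = logits - max(logits).
void XentWithLogits(const std::vector<double>& logits,
                    const std::vector<double>& labels, double* loss,
                    std::vector<double>* backprop) {
  double max_logit = logits[0];
  for (double l : logits) max_logit = std::max(max_logit, l);
  double sum_exp = 0.0;
  for (double l : logits) sum_exp += std::exp(l - max_logit);
  const double log_sum_exp = std::log(sum_exp);
  *loss = 0.0;
  backprop->resize(logits.size());
  for (size_t i = 0; i < logits.size(); ++i) {
    const double shifted = logits[i] - max_logit;
    *loss += -labels[i] * (shifted - log_sum_exp);
    (*backprop)[i] = std::exp(shifted) / sum_exp - labels[i];
  }
}

int main() {
  double loss;
  std::vector<double> bp;
  XentWithLogits({2.0, 1.0, 0.1}, {1.0, 0.0, 0.0}, &loss, &bp);
  std::printf("loss=%f grad0=%f\n", loss, bp[0]);  // loss ~= 0.417
}
```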
- xla::ComputationDataHandle loss = - b->Reduce(b->Mul(b->Neg(labels), - b->Sub(shifted_logits, log_sum_exp, {kBatchDim})), - XlaHelpers::Zero(b, type), add_func, {kClassDim}); - - // backprop: prob - labels, where - // prob = exp(logits - max_logits) / sum(exp(logits - max_logits)) - // (where the division broadcasts along the batch dimension) - xla::ComputationDataHandle backprop = - b->Sub(b->Div(exp_shifted_logits, sum_exp, {kBatchDim}), labels); - + xla::ComputationDataHandle loss, backprop; + std::tie(loss, backprop) = + CrossEntropyWithLogits(ctx, type, logits, labels); ctx->SetOutput(0, loss); ctx->SetOutput(1, backprop); } }; -REGISTER_XLA_OP("SoftmaxCrossEntropyWithLogits", SoftmaxXentWithLogitsOp); +REGISTER_XLA_OP(Name("SoftmaxCrossEntropyWithLogits"), SoftmaxXentWithLogitsOp); + +class SparseSoftmaxXentWithLogitsOp : public XlaOpKernel { + public: + explicit SparseSoftmaxXentWithLogitsOp(OpKernelConstruction* ctx) + : XlaOpKernel(ctx) {} + + void Compile(XlaOpKernelContext* ctx) override { + const TensorShape logits_shape = ctx->InputShape(0); + const TensorShape labels_shape = ctx->InputShape(1); + OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(logits_shape), + errors::InvalidArgument("logits must be 2-D, but got shape ", + logits_shape.DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(labels_shape), + errors::InvalidArgument("labels must be 1-D, but got shape ", + labels_shape.DebugString())); + OP_REQUIRES(ctx, logits_shape.dim_size(0) == labels_shape.dim_size(0), + errors::InvalidArgument( + "logits and labels must have the same first dimension, " + "got logits shape ", + logits_shape.DebugString(), " and labels shape ", + labels_shape.DebugString())); + OP_REQUIRES(ctx, logits_shape.dim_size(1) > 0, + errors::InvalidArgument( + "Must have at least one class, but got logits shape ", + logits_shape.DebugString())); + + int64 batch_size = logits_shape.dim_size(0); + int64 depth = logits_shape.dim_size(1); + + DataType logits_type = input_type(0); + DataType indices_type = input_type(1); + + xla::ComputationDataHandle indices = ctx->Input(1); + + xla::ComputationBuilder* builder = ctx->builder(); + xla::ComputationDataHandle labels; + OP_REQUIRES_OK(ctx, + XlaHelpers::OneHot( + builder, depth, /*axis=*/1, input_type(1), labels_shape, + indices, XlaHelpers::One(builder, logits_type), + XlaHelpers::Zero(builder, logits_type), &labels)); + + // If any of the indices are out of range, we must populate the labels with + // NaNs to obey the interface contract of + // tf.nn.sparse_softmax_cross_entropy_with_logits. + // Builds a vector of {batch_size} that is 0 if the index is in range, or + // NaN otherwise; then add that vector to the labels to force out-of-range + // values to NaNs. 
+ xla::ComputationDataHandle nan_or_zero = builder->Select( + builder->LogicalAnd( + builder->Le(XlaHelpers::Zero(builder, indices_type), indices), + builder->Lt(indices, XlaHelpers::IntegerLiteral( + builder, indices_type, depth))), + builder->Broadcast(XlaHelpers::Zero(builder, logits_type), + {batch_size}), + builder->Broadcast(XlaHelpers::FloatLiteral(builder, logits_type, NAN), + {batch_size})); + labels = builder->Add(labels, nan_or_zero, {0}); + + xla::ComputationDataHandle loss, backprop; + std::tie(loss, backprop) = + CrossEntropyWithLogits(ctx, logits_type, ctx->Input(0), labels); + ctx->SetOutput(0, loss); + ctx->SetOutput(1, backprop); + } +}; + +REGISTER_XLA_OP(Name("SparseSoftmaxCrossEntropyWithLogits"), + SparseSoftmaxXentWithLogitsOp); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc b/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc new file mode 100644 index 00000000000..f15b354cb26 --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc @@ -0,0 +1,190 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" + +namespace tensorflow { +namespace { + +void SpaceToBatch(XlaOpKernelContext* ctx, + const xla::ComputationDataHandle& input, DataType input_dtype, + const TensorShape& input_tensor_shape, + gtl::ArraySlice<int64> block_shape, + const xla::Literal& paddings) { + const int input_rank = input_tensor_shape.dims(); + const gtl::InlinedVector<int64, 4> input_shape = + input_tensor_shape.dim_sizes(); + const int block_rank = block_shape.size(); + + OP_REQUIRES( + ctx, input_rank >= 1 + block_rank, + errors::InvalidArgument("input rank should be >= ", 1 + block_rank, + " instead of ", input_rank)); + gtl::ArraySlice<int64> remainder_shape(input_shape); + remainder_shape.remove_prefix(1 + block_rank); + + OP_REQUIRES( + ctx, + xla::ShapeUtil::Rank(paddings.shape()) == 2 && + block_rank == xla::ShapeUtil::GetDimension(paddings.shape(), 0) && + 2 == xla::ShapeUtil::GetDimension(paddings.shape(), 1), + errors::InvalidArgument("paddings should have shape [", block_rank, + ", 2] instead of ", + xla::ShapeUtil::HumanString(paddings.shape()))); + + xla::ComputationBuilder* b = ctx->builder(); + + // 1. Zero-pad the start and end of dimensions `[1, ..., M]` of the + // input according to `paddings` to produce `padded` of shape `padded_shape`. + xla::PaddingConfig padding_config; + std::vector<int64> padded_shape(input_shape.begin(), input_shape.end()); + int64 block_num_elems = 1LL; + padding_config.add_dimensions(); // Don't pad the batch dimension.
+ for (int i = 0; i < block_rank; ++i) { + auto* dim = padding_config.add_dimensions(); + int64 pad_start = xla::LiteralUtil::Get<int64>(paddings, {i, 0}); + int64 pad_end = xla::LiteralUtil::Get<int64>(paddings, {i, 1}); + OP_REQUIRES(ctx, pad_start >= 0 && pad_end >= 0, + errors::InvalidArgument("Paddings must be non-negative")); + dim->set_edge_padding_low(pad_start); + dim->set_edge_padding_high(pad_end); + padded_shape[1 + i] += pad_start + pad_end; + block_num_elems *= block_shape[i]; + } + // Don't pad the remainder dimensions. + for (int i = 0; i < remainder_shape.size(); ++i) { + padding_config.add_dimensions(); + } + OP_REQUIRES(ctx, block_num_elems > 0, + errors::InvalidArgument( + "The product of the block dimensions must be positive")); + + xla::ComputationDataHandle padded = + b->Pad(input, XlaHelpers::Zero(b, input_dtype), padding_config); + + // 2. Reshape `padded` to `reshaped_padded` of shape: + // + // [batch] + + // [padded_shape[1] / block_shape[0], + // block_shape[0], + // ..., + // padded_shape[M] / block_shape[M-1], + // block_shape[M-1]] + + // remaining_shape + const int64 batch_size = input_shape[0]; + std::vector<int64> reshaped_padded_shape(input_rank + block_rank); + reshaped_padded_shape[0] = batch_size; + for (int i = 0; i < block_rank; ++i) { + OP_REQUIRES(ctx, padded_shape[1 + i] % block_shape[i] == 0, + errors::InvalidArgument("padded_shape[", 1 + i, + "]=", padded_shape[1 + i], + " is not divisible by block_shape[", i, + "]=", block_shape[i])); + + reshaped_padded_shape[1 + i * 2] = padded_shape[1 + i] / block_shape[i]; + reshaped_padded_shape[1 + i * 2 + 1] = block_shape[i]; + } + std::copy(remainder_shape.begin(), remainder_shape.end(), + reshaped_padded_shape.begin() + 1 + 2 * block_rank); + + xla::ComputationDataHandle reshaped_padded = + b->Reshape(padded, reshaped_padded_shape); + + // 3. Permute dimensions of `reshaped_padded` to produce + // `permuted_reshaped_padded` of shape: + // + // block_shape + + // [batch] + + // [padded_shape[1] / block_shape[0], + // ..., + // padded_shape[M] / block_shape[M-1]] + + // remaining_shape + std::vector<int64> permutation(reshaped_padded_shape.size()); + for (int i = 0; i < block_rank; ++i) { + permutation[i] = 1 + 2 * i + 1; + permutation[block_rank + 1 + i] = 1 + 2 * i; + } + permutation[block_rank] = 0; + std::iota(permutation.begin() + 1 + block_rank * 2, permutation.end(), + 1 + block_rank * 2); + xla::ComputationDataHandle permuted_reshaped_padded = + b->Transpose(reshaped_padded, permutation); + + // 4. Reshape `permuted_reshaped_padded` to flatten `block_shape` into the + // batch dimension, producing an output tensor of shape: + // + // [batch * prod(block_shape)] + + // [padded_shape[1] / block_shape[0], + // ..., + // padded_shape[M] / block_shape[M-1]] + + // remaining_shape + // Determine the length of the prefix of block dims that can be combined + // into the batch dimension due to having no padding and block_shape=1.
+ std::vector<int64> output_shape(input_rank); + output_shape[0] = batch_size * block_num_elems; + for (int i = 0; i < block_rank; ++i) { + output_shape[1 + i] = padded_shape[1 + i] / block_shape[i]; + } + std::copy(remainder_shape.begin(), remainder_shape.end(), + output_shape.begin() + 1 + block_rank); + + xla::ComputationDataHandle output = + b->Reshape(permuted_reshaped_padded, output_shape); + ctx->SetOutput(0, output); +} + +class SpaceToBatchNDOp : public XlaOpKernel { + public: + explicit SpaceToBatchNDOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} + + void Compile(XlaOpKernelContext* ctx) override { + std::vector<int64> block_shape; + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1, &block_shape)); + + xla::Literal paddings; + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsInt64Literal(2, &paddings)); + + SpaceToBatch(ctx, ctx->Input(0), input_type(0), ctx->InputShape(0), + block_shape, paddings); + } +}; +REGISTER_XLA_OP(Name("SpaceToBatchND"), SpaceToBatchNDOp); + +class SpaceToBatchOp : public XlaOpKernel { + public: + explicit SpaceToBatchOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("block_size", &block_size_)); + OP_REQUIRES( + ctx, block_size_ > 1, + errors::InvalidArgument("Block size should be > 1: ", block_size_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + xla::Literal paddings; + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsInt64Literal(1, &paddings)); + + SpaceToBatch(ctx, ctx->Input(0), input_type(0), ctx->InputShape(0), + {block_size_, block_size_}, paddings); + } + + private: + int block_size_; +}; +REGISTER_XLA_OP(Name("SpaceToBatch"), SpaceToBatchOp); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/split_op.cc b/tensorflow/compiler/tf2xla/kernels/split_op.cc index 18c4c648db1..42bde900422 100644 --- a/tensorflow/compiler/tf2xla/kernels/split_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/split_op.cc @@ -16,9 +16,9 @@ limitations under the License. // XLA-specific Ops for split. #include "tensorflow/compiler/tf2xla/type_util.h" -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" @@ -77,14 +77,14 @@ class SplitOp : public XlaOpKernel { // The vectors we will use to define the slice. The entry for the // split dimensions varies for each output. - std::vector<int64> begin; - std::vector<int64> limits; + std::vector<int64> begin(input_shape.dims(), 0); + std::vector<int64> limits(input_shape.dims()); + std::vector<int64> strides(input_shape.dims(), 1); for (int i = 0; i < input_shape.dims(); ++i) { // Initially set up the limits to be the full size of the input: // the split dimension is filled in below. int64 dim = input_shape.dim_size(i); - begin.push_back(0); - limits.push_back(dim); + limits[i] = dim; } auto input = ctx->Input(1); @@ -94,12 +94,12 @@ class SplitOp : public XlaOpKernel { // Slice out the ith split from the split dimension.
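Taken together, steps 1-4 of SpaceToBatch above pad the spatial dimensions, factor each into (padded/block, block), transpose the block factors into the batch dimension, and flatten. For a single spatial dimension the whole pipeline collapses to output[b][w] = input[w*block + b], which this plain-C++ sketch checks (batch 1, block 2, no padding; illustrative only):

```c++
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> input = {1, 2, 3, 4};  // shape [1, 4]
  const int block = 2, out_width = 4 / block;
  // Output shape [batch * block, out_width]; each output batch b holds the
  // elements whose within-block offset is b.
  for (int b = 0; b < block; ++b) {
    for (int w = 0; w < out_width; ++w)
      std::printf("%d ", input[w * block + b]);
    std::printf("\n");  // rows: "1 3" and "2 4"
  }
}
```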
begin[split_dim] = i * slice_size; limits[split_dim] = (i + 1) * slice_size; - ctx->SetOutput(i, ctx->builder()->Slice(input, begin, limits)); + ctx->SetOutput(i, ctx->builder()->Slice(input, begin, limits, strides)); } } }; -REGISTER_XLA_OP("Split", SplitOp); +REGISTER_XLA_OP(Name("Split"), SplitOp); class SplitVOp : public XlaOpKernel { public: @@ -188,7 +188,7 @@ class SplitVOp : public XlaOpKernel { std::vector<int64> begin(input_shape.dims(), 0); auto dim_sizes = input_shape.dim_sizes(); std::vector<int64> limits(dim_sizes.begin(), dim_sizes.end()); - + std::vector<int64> strides(input_shape.dims(), 1); for (int i = 0; i < num_split; ++i) { TensorShape output_shape(input_shape); int slice_size = split_sizes_vec[i]; @@ -196,13 +196,13 @@ class SplitVOp : public XlaOpKernel { // Slice out the ith split from the split dimension. limits[split_dim] = begin[split_dim] + slice_size; - ctx->SetOutput(i, ctx->builder()->Slice(input, begin, limits)); + ctx->SetOutput(i, ctx->builder()->Slice(input, begin, limits, strides)); begin[split_dim] = limits[split_dim]; } } }; -REGISTER_XLA_OP("SplitV", SplitVOp); +REGISTER_XLA_OP(Name("SplitV"), SplitVOp); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc index 83bf24814f4..9eb68998310 100644 --- a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc @@ -16,9 +16,9 @@ limitations under the License. #include "tensorflow/core/util/strided_slice_op.h" #include "tensorflow/compiler/tf2xla/literal_util.h" #include "tensorflow/compiler/tf2xla/type_util.h" -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" @@ -76,29 +76,30 @@ class StridedSliceOp : public XlaOpKernel { &dummy, &dummy, &begin, &end, &strides)); gtl::InlinedVector<int64, 4> dimensions_to_reverse; - gtl::InlinedVector<int64, 4> slice_begin, slice_end; + gtl::InlinedVector<int64, 4> slice_begin, slice_end, slice_strides; + for (int i = 0; i < begin.size(); ++i) { - // TODO(phawkins): implement strides != 1 when b/30878775 is fixed. - OP_REQUIRES( - ctx, strides[i] == 1 || strides[i] == -1, - errors::Unimplemented("Strides != 1 or -1 are not yet implemented")); if (strides[i] > 0) { slice_begin.push_back(begin[i]); slice_end.push_back(end[i]); + slice_strides.push_back(strides[i]); } else { // Negative stride: swap begin and end, add 1 because the interval // is semi-open, and mark the dimension to be reversed.
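Both Split kernels above now emit each output as a strided Slice: only the begin/limit of the split dimension changes per output, and every stride is 1. A 1-D plain-C++ sketch of that indexing (illustrative only):

```c++
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> input = {1, 2, 3, 4, 5, 6};
  const int num_split = 3;
  const int slice_size = static_cast<int>(input.size()) / num_split;
  // The i-th output is the half-open range [i*slice_size, (i+1)*slice_size).
  for (int i = 0; i < num_split; ++i) {
    const int begin = i * slice_size, limit = (i + 1) * slice_size;
    for (int j = begin; j < limit; ++j) std::printf("%d ", input[j]);
    std::printf("\n");  // outputs: "1 2", "3 4", "5 6"
  }
}
```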
- slice_begin.push_back(end[i] + 1); - slice_end.push_back(begin[i] + 1); + slice_begin.push_back(input_shape.dim_size(i) - begin[i] - 1); + slice_end.push_back(input_shape.dim_size(i) - end[i] - 1); + slice_strides.push_back(-strides[i]); dimensions_to_reverse.push_back(i); } } - xla::ComputationDataHandle slice = - ctx->builder()->Slice(ctx->Input(0), slice_begin, slice_end); + + xla::ComputationDataHandle slice = ctx->Input(0); if (!dimensions_to_reverse.empty()) { slice = ctx->builder()->Rev(slice, dimensions_to_reverse); } + slice = ctx->builder()->Slice(slice, slice_begin, slice_end, slice_strides); + slice = ctx->builder()->Reshape(slice, final_shape.dim_sizes()); ctx->SetOutput(0, slice); } @@ -109,7 +110,7 @@ class StridedSliceOp : public XlaOpKernel { DataType index_type_; }; -REGISTER_XLA_OP("StridedSlice", StridedSliceOp); +REGISTER_XLA_OP(Name("StridedSlice"), StridedSliceOp); class StridedSliceGradOp : public XlaOpKernel { public: @@ -217,7 +218,120 @@ class StridedSliceGradOp : public XlaOpKernel { DataType index_type_; }; -REGISTER_XLA_OP("StridedSliceGrad", StridedSliceGradOp); +REGISTER_XLA_OP(Name("StridedSliceGrad"), StridedSliceGradOp); + +class StridedSliceAssignOp : public XlaOpKernel { + public: + explicit StridedSliceAssignOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("begin_mask", &begin_mask_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("end_mask", &end_mask_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("ellipsis_mask", &ellipsis_mask_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("new_axis_mask", &new_axis_mask_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("shrink_axis_mask", &shrink_axis_mask_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("Index", &index_type_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + TensorShape final_shape; + gtl::InlinedVector<int64, 4> begin; + gtl::InlinedVector<int64, 4> end; + gtl::InlinedVector<int64, 4> strides; + + xla::Literal begin_literal, end_literal, strides_literal; + OP_REQUIRES_OK(ctx, ctx->ConstantInput(1, &begin_literal)); + OP_REQUIRES_OK(ctx, ctx->ConstantInput(2, &end_literal)); + OP_REQUIRES_OK(ctx, ctx->ConstantInput(3, &strides_literal)); + + Tensor begin_tensor, end_tensor, strides_tensor; + OP_REQUIRES_OK( + ctx, LiteralToHostTensor(begin_literal, index_type_, &begin_tensor)); + OP_REQUIRES_OK(ctx, + LiteralToHostTensor(end_literal, index_type_, &end_tensor)); + OP_REQUIRES_OK(ctx, LiteralToHostTensor(strides_literal, index_type_, + &strides_tensor)); + + DataType lhs_type; + TensorShape lhs_shape; + OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &lhs_type, &lhs_shape)); + + const TensorShape rhs_shape = ctx->InputShape(4); + + TensorShape dummy_processing_shape; + ShapeReadWriteFromTensorShape wrapped_final_shape(&final_shape); + ShapeReadWriteFromTensorShape wrapped_dummy_processing_shape( + &dummy_processing_shape); + bool dummy = false; + OP_REQUIRES_OK( + ctx, ValidateStridedSliceOp( + &begin_tensor, &end_tensor, strides_tensor, + ShapeReadWriteFromTensorShape(&lhs_shape), begin_mask_, + end_mask_, ellipsis_mask_, new_axis_mask_, shrink_axis_mask_, + &wrapped_dummy_processing_shape, &wrapped_final_shape, &dummy, + &dummy, &dummy, &begin, &end, &strides)); + + if (final_shape.num_elements() == 0 && rhs_shape.num_elements() == 0) { + // DynamicUpdateSlice does not allow 0-element updates. We should probably + // check that rhs_shape can be broadcast to final_shape, but that is + // probably better handled when implementing broadcasting more generally.
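The negative-stride handling above rewrites x[begin:end:stride] with stride < 0 as a positive-stride slice over the reversed axis, remapping begin/end to dim-1-begin and dim-1-end. This plain-C++ sketch checks the equivalence on a 1-D array (illustrative only):

```c++
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> x = {0, 1, 2, 3, 4, 5};
  const int dim = static_cast<int>(x.size());
  const int begin = 4, end = 0, stride = -2;  // x[4:0:-2] -> {4, 2}
  // Reverse the axis, then take a positive-stride slice with the
  // remapped bounds, exactly as the kernel does with Rev + Slice.
  std::vector<int> rev(x.rbegin(), x.rend());
  const int rbegin = dim - begin - 1, rend = dim - end - 1, rstride = -stride;
  for (int i = rbegin; i < rend; i += rstride) std::printf("%d ", rev[i]);
  // Prints: 4 2
}
```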
+ return; + } + + // TODO(aselle): This check is too strong, we only should need + // input_shape to be broadcastable to final_shape + OP_REQUIRES(ctx, final_shape == rhs_shape, + errors::Unimplemented( + "sliced l-value shape ", final_shape.DebugString(), + " does not match r-value shape ", rhs_shape.DebugString(), + ". Automatic broadcasting not yet implemented.")); + + xla::ComputationDataHandle lhs; + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &lhs)); + + xla::ComputationDataHandle rhs = ctx->Input(4); + + gtl::InlinedVector<int64, 4> dimensions_to_reverse; + gtl::InlinedVector<int64, 4> slice_begin, slice_dims; + for (int i = 0; i < begin.size(); ++i) { + // TODO(phawkins): implement strides != 1 + OP_REQUIRES( + ctx, strides[i] == 1 || strides[i] == -1, + errors::Unimplemented("Strides != 1 or -1 are not yet implemented")); + if (strides[i] > 0) { + slice_begin.push_back(begin[i]); + slice_dims.push_back(end[i] - begin[i]); + } else { + // Negative stride: swap begin and end, add 1 because the interval + // is semi-open, and mark the dimension to be reversed. + slice_begin.push_back(end[i] + 1); + slice_dims.push_back(begin[i] - end[i]); + dimensions_to_reverse.push_back(i); + } + } + + if (!dimensions_to_reverse.empty()) { + rhs = ctx->builder()->Rev(rhs, dimensions_to_reverse); + } + rhs = ctx->builder()->Reshape(rhs, slice_dims); + + if (lhs_shape.dims() == 0) { + // TODO(b/38323843): DynamicUpdateSlice crashes on rank 0 inputs. Fix + // and remove this workaround. + lhs = rhs; + } else { + lhs = ctx->builder()->DynamicUpdateSlice( + lhs, rhs, ctx->builder()->ConstantR1<int64>(slice_begin)); + } + + OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, lhs_type, lhs)); + } + + private: + int32 begin_mask_, end_mask_; + int32 ellipsis_mask_, new_axis_mask_, shrink_axis_mask_; + DataType index_type_; +}; + +REGISTER_XLA_OP(Name("ResourceStridedSliceAssign"), StridedSliceAssignOp); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc new file mode 100644 index 00000000000..deee7dd44db --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc @@ -0,0 +1,540 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// XLA TensorArray operators.
+ +#include +#include + +#include "tensorflow/compiler/tf2xla/shape_util.h" +#include "tensorflow/compiler/tf2xla/type_util.h" +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/partial_tensor_shape.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/bounds_check.h" +#include "tensorflow/core/kernels/concat_lib.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace { + +// Since the element shape is not always provided to the TensorArrayV3 operator, +// we must support lazy initialization of the TensorArray at the time of the +// first write. +// If a TensorArray `var` has not been initialized, constructs storage for the +// TensorArray with elements of `elem_shape`. For both initialized and +// uninitialized TensorArrays, checks that the tensor has a type compatible with +// 'dtype' and shape compatible with 'elem_shape'. +Status MaybeInitializeTensorArray(xla::ComputationBuilder* builder, + XlaVariable* var, DataType dtype, + const TensorShape& elem_shape) { + if (var->type != dtype) { + return errors::InvalidArgument( + "TensorArray dtype is ", DataTypeString(var->type), + " but op has dtype ", DataTypeString(dtype), "."); + } + + TF_RET_CHECK(var->tensor_array_size >= 0) + << var->name << " size " << var->tensor_array_size; + TensorShape ta_shape; + ta_shape.AddDim(var->tensor_array_size); + ta_shape.AppendShape(elem_shape); + + if (var->value.handle() == 0) { + // TensorArray has not been initialized. + xla::ComputationDataHandle zero = XlaHelpers::Zero(builder, var->type); + var->value = builder->Broadcast(zero, ta_shape.dim_sizes()); + } else { + // Checks the elem_shape matches the TensorArray shape. + auto shape_or_status = builder->GetShape(var->value); + if (!shape_or_status.ok()) { + return shape_or_status.status(); + } + TensorShape shape = XLAShapeToTensorShape(*shape_or_status.ValueOrDie()); + if (ta_shape != shape) { + return errors::InvalidArgument( + "Mismatched TensorArray sizes: ", ta_shape.DebugString(), " vs ", + shape.DebugString()); + } + } + return Status::OK(); +} + +// Pads 'x' with 'count' zero indices. 'x' must have 1 element. +xla::ComputationDataHandle PadIndexWithZeros( + xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x, + int count) { + xla::ComputationDataHandle zero = builder->ConstantR1<int32>({0}); + std::vector<xla::ComputationDataHandle> xs(count + 1, zero); + xs[0] = builder->Reshape(x, {1}); + return builder->ConcatInDim(xs, 0); +} + +// Like ComputationBuilder::DynamicUpdateSlice, but adds 'update' to the +// relevant slice of 'operand'.
+xla::ComputationDataHandle DynamicAddSlice( + xla::ComputationBuilder* builder, const xla::ComputationDataHandle& operand, + const xla::ComputationDataHandle& update, + const gtl::ArraySlice& update_dims, + const xla::ComputationDataHandle& start_indices) { + xla::ComputationDataHandle current = + builder->DynamicSlice(operand, start_indices, update_dims); + xla::ComputationDataHandle sum = builder->Add(current, update); + return builder->DynamicUpdateSlice(operand, sum, start_indices); +} + +class TensorArrayOp : public XlaOpKernel { + public: + explicit TensorArrayOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("element_shape", &element_shape_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_)); + bool dynamic_size; + OP_REQUIRES_OK(ctx, ctx->GetAttr("dynamic_size", &dynamic_size)); + OP_REQUIRES( + ctx, !dynamic_size, + errors::Unimplemented( + "TensorArrays with dynamic size are not supported by XLA.")); + + OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_array_name", &tensor_array_name_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + int64 size; + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(0, &size)); + OP_REQUIRES(ctx, size >= 0, + errors::InvalidArgument("TensorArray size must be >= 0")); + + xla::ComputationBuilder* b = ctx->builder(); + b->set_die_immediately_on_error(true); + + // Initializes the TensorArray value if we know the element shape. + // Otherwise, defer initialization to the first write. + xla::ComputationDataHandle value; + if (element_shape_.IsFullyDefined()) { + TensorShape shape; + CHECK(element_shape_.AsTensorShape(&shape)); + TensorShape ta_shape; + ta_shape.AddDim(size); + ta_shape.AppendShape(shape); + xla::ComputationDataHandle zero = XlaHelpers::Zero(b, dtype_); + value = b->Broadcast(zero, ta_shape.dim_sizes()); + } + + XlaContext& xc = XlaContext::Get(ctx); + XlaVariable* var; + string name = strings::StrCat("TensorArray: ", tensor_array_name_); + OP_REQUIRES_OK(ctx, + xc.CreateVariable(-1, std::move(name), dtype_, value, &var)); + var->tensor_array_size = size; + ctx->SetVariableOutput(0, var); + ctx->SetConstantOutput(1, Tensor(DT_FLOAT)); + } + + private: + PartialTensorShape element_shape_; + DataType dtype_; + string tensor_array_name_; + + TF_DISALLOW_COPY_AND_ASSIGN(TensorArrayOp); +}; + +REGISTER_XLA_OP(Name("TensorArrayV3"), TensorArrayOp); + +class TensorArrayWriteOp : public XlaOpKernel { + public: + explicit TensorArrayWriteOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + xla::ComputationBuilder* b = ctx->builder(); + + TensorShape elem_shape = ctx->InputShape(2); + + // Initializes the TensorArray, if the element shape was not known at + // construction time. + XlaVariable* var; + OP_REQUIRES_OK(ctx, ctx->GetVariableInput(0, &var)); + OP_REQUIRES_OK(ctx, MaybeInitializeTensorArray(b, var, dtype_, elem_shape)); + + xla::ComputationDataHandle ta = var->value; + xla::ComputationDataHandle index = ctx->Input(1); + xla::ComputationDataHandle value = ctx->Input(2); + + // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0]. 
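The start-index vector named in the comment above is built the same way by every element access in this file. A host-side sketch of what PadIndexWithZeros materializes as a constant concat in XLA (hypothetical helper name; int64 index type assumed for illustration):

```c++
#include <cstdint>
#include <vector>

// Builds the start-index vector [index, 0, 0, ..., 0] that positions a
// [1, elem...] window at element `index` along the leading dimension.
std::vector<int64_t> MakeStartIndices(int64_t index, int trailing_zeros) {
  std::vector<int64_t> start(trailing_zeros + 1, 0);
  start[0] = index;
  return start;
}
```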
+ auto start_indices = PadIndexWithZeros(b, index, elem_shape.dims()); + + TensorShape slice_shape = elem_shape; + slice_shape.InsertDim(0, 1LL); + auto update = b->Reshape(value, slice_shape.dim_sizes()); + + xla::ComputationDataHandle written = + DynamicAddSlice(b, ta, update, slice_shape.dim_sizes(), start_indices); + + OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, dtype_, written)); + ctx->SetConstantOutput(0, Tensor(DT_FLOAT)); + } + + private: + DataType dtype_; + + TF_DISALLOW_COPY_AND_ASSIGN(TensorArrayWriteOp); +}; + +REGISTER_XLA_OP(Name("TensorArrayWriteV3"), TensorArrayWriteOp); + +class TensorArrayReadOp : public XlaOpKernel { + public: + explicit TensorArrayReadOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + DataType ta_type; + TensorShape ta_shape; + OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &ta_type, &ta_shape)); + OP_REQUIRES(ctx, ta_type == dtype_, + errors::InvalidArgument( + "TensorArray dtype is ", DataTypeString(ta_type), + " but Op requested dtype ", DataTypeString(dtype_), ".")); + OP_REQUIRES(ctx, ta_shape.dims() >= 1, + errors::InvalidArgument("TensorArray rank must be >= 1")); + + xla::ComputationBuilder* b = ctx->builder(); + + xla::ComputationDataHandle ta; + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &ta)); + xla::ComputationDataHandle index = ctx->Input(1); + + // start_indices of the DynamicSlice are [index, 0, 0, ..., 0]. + auto start_indices = PadIndexWithZeros(b, index, ta_shape.dims() - 1); + + auto slice_shape = ta_shape.dim_sizes(); + slice_shape[0] = 1LL; + + xla::ComputationDataHandle read = + b->DynamicSlice(ta, start_indices, slice_shape); + + // Remove the leading '1' dimension. + std::vector value_shape(slice_shape.begin() + 1, slice_shape.end()); + ctx->SetOutput(0, b->Reshape(read, value_shape)); + } + + private: + DataType dtype_; + + TF_DISALLOW_COPY_AND_ASSIGN(TensorArrayReadOp); +}; + +REGISTER_XLA_OP(Name("TensorArrayReadV3"), TensorArrayReadOp); + +class TensorArrayGatherOp : public XlaOpKernel { + public: + explicit TensorArrayGatherOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + DataType ta_type; + TensorShape ta_shape; + OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &ta_type, &ta_shape)); + OP_REQUIRES(ctx, ta_type == dtype_, + errors::InvalidArgument("TensorArray type mismatch")); + OP_REQUIRES(ctx, ta_shape.dims() >= 1, + errors::InvalidArgument("TensorArray rank must be >= 1")); + + const TensorShape indices_shape = ctx->InputShape(1); + OP_REQUIRES(ctx, indices_shape.dims() >= 1, + errors::InvalidArgument("indices must be rank 1")); + const int num_indices = indices_shape.dim_size(0); + auto indices = ctx->Input(1); + + xla::ComputationBuilder* b = ctx->builder(); + + xla::ComputationDataHandle ta; + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &ta)); + + // For each index in `indices`, add the corresponding slice to `slices`. + std::vector slices(num_indices); + for (int i = 0; i < num_indices; ++i) { + // Slices the i-th index out of `indices`, and pads it with zeros in the + // minor dimensions to form an index into the TensorArray storage. + auto index = b->Slice(indices, {i}, {i + 1}, {1}); + + // start_indices of the DynamicSlice are [index, 0, 0, ..., 0]. 
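Note that the write path above accumulates into the array rather than overwriting it: DynamicAddSlice composes DynamicSlice, Add, and DynamicUpdateSlice, so two writes to the same index sum. A rank-1 host-side analogue, with hypothetical names, for intuition only:

```c++
#include <cstddef>
#include <vector>

// Rank-1 analogue of DynamicAddSlice: read the current window, add the
// update, and write the sum back at the same offset.
void DynamicAddSlice1D(std::vector<float>* operand,
                       const std::vector<float>& update, std::size_t start) {
  for (std::size_t i = 0; i < update.size(); ++i) {
    (*operand)[start + i] += update[i];
  }
}
```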
+ auto start_indices = PadIndexWithZeros(b, index, ta_shape.dims() - 1); + + auto slice_shape = ta_shape.dim_sizes(); + slice_shape[0] = 1LL; + + slices[i] = b->DynamicSlice(ta, start_indices, slice_shape); + } + + xla::ComputationDataHandle gather; + if (slices.empty()) { + auto shape = ta_shape.dim_sizes(); + shape[0] = 0; + gather = b->Broadcast(XlaHelpers::Zero(b, dtype_), shape); + } else { + gather = b->ConcatInDim(slices, 0); + } + ctx->SetOutput(0, gather); + } + + private: + DataType dtype_; + + TF_DISALLOW_COPY_AND_ASSIGN(TensorArrayGatherOp); +}; + +REGISTER_XLA_OP(Name("TensorArrayGatherV3"), TensorArrayGatherOp); + +class TensorArrayScatterOp : public XlaOpKernel { + public: + explicit TensorArrayScatterOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + xla::ComputationBuilder* b = ctx->builder(); + + const TensorShape value_shape = ctx->InputShape(2); + + XlaVariable* var; + OP_REQUIRES_OK(ctx, ctx->GetVariableInput(0, &var)); + TensorShape elem_shape = value_shape; + elem_shape.RemoveDim(0); + OP_REQUIRES_OK(ctx, MaybeInitializeTensorArray(b, var, dtype_, elem_shape)); + + const TensorShape indices_shape = ctx->InputShape(1); + OP_REQUIRES(ctx, indices_shape.dims() >= 1, + errors::InvalidArgument("indices must be rank 1")); + const int num_indices = indices_shape.dim_size(0); + const xla::ComputationDataHandle indices = ctx->Input(1); + + xla::ComputationDataHandle ta = var->value; + const xla::ComputationDataHandle value = ctx->Input(2); + + auto slice_dims = value_shape.dim_sizes(); + slice_dims[0] = 1LL; + + std::vector value_starts(value_shape.dims(), 0); + auto value_ends = value_shape.dim_sizes(); + + std::vector value_strides(value_shape.dims(), 1); + + // For every (index, value) pair, update the corresponding TensorArray + // storage. + for (int i = 0; i < num_indices; ++i) { + // Slice out part of the value. + value_starts[0] = i; + value_ends[0] = i + 1; + auto slice = b->Slice(value, value_starts, value_ends, value_strides); + + // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0]. 
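Both the gather loop above and the scatter loop here reduce to per-index row copies over the [size, elem...] storage. A rank-2 host-side analogue of the scatter, keeping the accumulate semantics of DynamicAddSlice (hypothetical types, not the patch's own):

```c++
#include <cstddef>
#include <cstdint>
#include <vector>

using Matrix = std::vector<std::vector<float>>;

// Host-side analogue of the scatter loop: add row i of `value` into
// row indices[i] of the TensorArray storage (accumulate, not replace).
void ScatterAdd(Matrix* ta, const std::vector<int64_t>& indices,
                const Matrix& value) {
  for (std::size_t i = 0; i < indices.size(); ++i) {
    std::vector<float>& dst = (*ta)[indices[i]];
    for (std::size_t j = 0; j < dst.size(); ++j) {
      dst[j] += value[i][j];
    }
  }
}
```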
+ auto index = b->Slice(indices, {i}, {i + 1}, {1}); + auto start_indices = PadIndexWithZeros(b, index, elem_shape.dims()); + ta = DynamicAddSlice(b, ta, slice, slice_dims, start_indices); + } + + OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, dtype_, ta)); + ctx->SetConstantOutput(0, Tensor(DT_FLOAT)); + } + + private: + DataType dtype_; + + TF_DISALLOW_COPY_AND_ASSIGN(TensorArrayScatterOp); +}; + +REGISTER_XLA_OP(Name("TensorArrayScatterV3"), TensorArrayScatterOp); + +class TensorArrayConcatOp : public XlaOpKernel { + public: + explicit TensorArrayConcatOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + DataType ta_type; + TensorShape ta_shape; + OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &ta_type, &ta_shape)); + OP_REQUIRES(ctx, ta_type == dtype_, + errors::InvalidArgument("TensorArray type mismatch")); + OP_REQUIRES(ctx, ta_shape.dims() >= 1, + errors::InvalidArgument("TensorArray rank must be >= 1")); + + xla::ComputationBuilder* b = ctx->builder(); + + xla::ComputationDataHandle ta; + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &ta)); + + auto ta_dims = ta_shape.dim_sizes(); + std::vector shape(ta_dims.begin() + 1, ta_dims.end()); + shape[0] *= ta_shape.dim_size(0); + ctx->SetOutput(0, b->Reshape(ta, shape)); + + Tensor lengths(DT_INT64, {ta_dims[0]}); + auto lengths_vec = lengths.vec(); + for (int i = 0; i < ta_dims[0]; ++i) { + lengths_vec(i) = ta_dims[1]; + } + ctx->SetConstantOutput(1, lengths); + } + + private: + DataType dtype_; + + TF_DISALLOW_COPY_AND_ASSIGN(TensorArrayConcatOp); +}; + +REGISTER_XLA_OP(Name("TensorArrayConcatV3"), TensorArrayConcatOp); + +class TensorArraySplitOp : public XlaOpKernel { + public: + explicit TensorArraySplitOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + std::vector lengths; + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(2, &lengths)); + + int64 length = 0; + if (!lengths.empty()) { + length = lengths[0]; + for (int i = 1; i < lengths.size(); ++i) { + OP_REQUIRES(ctx, lengths[i] == length, + errors::InvalidArgument("lengths must be equal: ", length, + " vs. ", lengths[i])); + } + } + + TensorShape value_shape = ctx->InputShape(1); + OP_REQUIRES(ctx, value_shape.dims() >= 1, + errors::InvalidArgument("value must have rank >= 1, got ", + value_shape.DebugString())); + TensorShape elem_shape = value_shape; + elem_shape.set_dim(0, length); + + xla::ComputationBuilder* b = ctx->builder(); + XlaVariable* var; + OP_REQUIRES_OK(ctx, ctx->GetVariableInput(0, &var)); + OP_REQUIRES_OK(ctx, MaybeInitializeTensorArray(b, var, dtype_, elem_shape)); + xla::ComputationDataHandle ta = var->value; + + TensorShape ta_shape; + ta_shape.AddDim(var->tensor_array_size); + ta_shape.AppendShape(elem_shape); + + OP_REQUIRES(ctx, lengths.size() == var->tensor_array_size, + errors::InvalidArgument( + "TensorArray's size is not equal to the size of lengths (", + lengths.size(), " vs. ", var->tensor_array_size, ")")); + + const xla::ComputationDataHandle value = ctx->Input(1); + + OP_REQUIRES(ctx, value_shape.num_elements() == ta_shape.num_elements(), + errors::InvalidArgument("mismatched element count ", + value_shape.DebugString(), " vs. 
", + ta_shape.DebugString())); + + ta = b->Add(ta, b->Reshape(value, ta_shape.dim_sizes())); + OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, dtype_, ta)); + + ctx->SetConstantOutput(0, Tensor(DT_FLOAT)); + } + + private: + DataType dtype_; + + TF_DISALLOW_COPY_AND_ASSIGN(TensorArraySplitOp); +}; + +REGISTER_XLA_OP(Name("TensorArraySplitV3"), TensorArraySplitOp); + +class TensorArraySizeOp : public XlaOpKernel { + public: + explicit TensorArraySizeOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} + + void Compile(XlaOpKernelContext* ctx) override { + XlaVariable* var; + OP_REQUIRES_OK(ctx, ctx->GetVariableInput(0, &var)); + Tensor size_tensor(DT_INT32, {}); + size_tensor.scalar()() = static_cast(var->tensor_array_size); + ctx->SetConstantOutput(0, size_tensor); + } + + private: + TF_DISALLOW_COPY_AND_ASSIGN(TensorArraySizeOp); +}; + +REGISTER_XLA_OP(Name("TensorArraySizeV3"), TensorArraySizeOp); + +class TensorArrayGradOp : public XlaOpKernel { + public: + explicit TensorArrayGradOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("source", &source_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + xla::ComputationBuilder* b = ctx->builder(); + + XlaVariable* var; + OP_REQUIRES_OK(ctx, ctx->GetVariableInput(0, &var)); + + DataType ta_type; + TensorShape ta_shape; + OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &ta_type, &ta_shape)); + OP_REQUIRES(ctx, ta_shape.dims() >= 1, + errors::InvalidArgument("TensorArray rank must be >= 1")); + + // Finds or looks up the corresponding gradient TensorArray, which stores + // gradients computed during backpropagation. + XlaVariable*& gradient = var->tensor_array_gradient[source_]; + if (!gradient) { + xla::ComputationDataHandle zero = XlaHelpers::Zero(b, ta_type); + xla::ComputationDataHandle value = + b->Broadcast(zero, ta_shape.dim_sizes()); + + XlaContext& xc = XlaContext::Get(ctx); + string name = strings::StrCat("TensorArrayGrad: ", var->name); + OP_REQUIRES_OK(ctx, xc.CreateVariable(-1, std::move(name), var->type, + value, &gradient)); + gradient->tensor_array_size = var->tensor_array_size; + } + + ctx->SetVariableOutput(0, gradient); + ctx->SetConstantOutput(1, Tensor(DT_FLOAT)); + } + + private: + string source_; + + TF_DISALLOW_COPY_AND_ASSIGN(TensorArrayGradOp); +}; + +REGISTER_XLA_OP(Name("TensorArrayGradV3"), TensorArrayGradOp); + +} // anonymous namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/tile_ops.cc b/tensorflow/compiler/tf2xla/kernels/tile_ops.cc index 45ac5e12c74..4cc2eb8f877 100644 --- a/tensorflow/compiler/tf2xla/kernels/tile_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/tile_ops.cc @@ -17,9 +17,9 @@ limitations under the License. 
#include #include "tensorflow/compiler/tf2xla/type_util.h" -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" @@ -122,7 +122,7 @@ class TileOp : public XlaOpKernel { TF_DISALLOW_COPY_AND_ASSIGN(TileOp); }; -REGISTER_XLA_OP("Tile", TileOp); +REGISTER_XLA_OP(Name("Tile"), TileOp); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/training_ops.cc b/tensorflow/compiler/tf2xla/kernels/training_ops.cc new file mode 100644 index 00000000000..e9ac1ee91b8 --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/training_ops.cc @@ -0,0 +1,475 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/kernels/cwise_ops.h" +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/core/framework/kernel_def_builder.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/no_op.h" + +namespace tensorflow { +namespace { + +class ResourceApplyGradientDescent : public XlaOpKernel { + public: + explicit ResourceApplyGradientDescent(OpKernelConstruction* ctx) + : XlaOpKernel(ctx) {} + void Compile(XlaOpKernelContext* ctx) override { + xla::ComputationDataHandle handle; + xla::ComputationBuilder* b = ctx->builder(); + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &handle)); + handle = b->Sub(handle, b->Mul(ctx->Input(1), ctx->Input(2))); + OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, ctx->input_type(1), handle)); + } +}; +REGISTER_XLA_OP(Name("ResourceApplyGradientDescent"), + ResourceApplyGradientDescent); + +class ResourceApplyMomentum : public XlaOpKernel { + public: + explicit ResourceApplyMomentum(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("use_nesterov", &use_nesterov_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + xla::ComputationBuilder* b = ctx->builder(); + + DataType type = ctx->input_type(2); + + DataType var_type, accum_type; + TensorShape var_shape, accum_shape; + OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &var_type, &var_shape)); + OP_REQUIRES_OK(ctx, + ctx->GetVariableTypeAndShape(1, &accum_type, &accum_shape)); + + OP_REQUIRES( + ctx, type == var_type && type == accum_type, + errors::InvalidArgument( + "Types of variable arguments to ResourceApplyMomentum must match: ", + DataTypeString(type), " vs. 
", DataTypeString(var_type), " and ", + DataTypeString(accum_type))); + + OP_REQUIRES(ctx, var_shape.IsSameSize(accum_shape), + errors::InvalidArgument( + "var and accum do not have the same shape", + var_shape.DebugString(), " ", accum_shape.DebugString())); + + TensorShape lr_shape = ctx->InputShape(2); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr_shape), + errors::InvalidArgument("lr is not a scalar: ", + lr_shape.DebugString())); + + TensorShape grad_shape = ctx->InputShape(3); + OP_REQUIRES(ctx, var_shape.IsSameSize(grad_shape), + errors::InvalidArgument( + "var and grad do not have the same shape", + var_shape.DebugString(), " ", grad_shape.DebugString())); + + TensorShape momentum_shape = ctx->InputShape(4); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(momentum_shape), + errors::InvalidArgument("momentum is not a scalar: ", + momentum_shape.DebugString())); + + xla::ComputationDataHandle var, accum; + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &var)); + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, &accum)); + + xla::ComputationDataHandle lr = ctx->Input(2); + xla::ComputationDataHandle grad = ctx->Input(3); + xla::ComputationDataHandle momentum = ctx->Input(4); + + accum = b->Add(b->Mul(accum, momentum), grad); + if (use_nesterov_) { + // See https://github.com/tensorflow/tensorflow/pull/2798 for an + // explanation of the reparameterization used here. + var = b->Sub( + var, b->Add(b->Mul(grad, lr), b->Mul(b->Mul(accum, momentum), lr))); + } else { + var = b->Sub(var, b->Mul(accum, lr)); + } + OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, type, var)); + OP_REQUIRES_OK(ctx, ctx->AssignVariable(1, type, accum)); + } + + private: + bool use_nesterov_; +}; +REGISTER_XLA_OP(Name("ResourceApplyMomentum"), ResourceApplyMomentum); + +class ResourceApplyAdagrad : public XlaOpKernel { + public: + explicit ResourceApplyAdagrad(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} + + void Compile(XlaOpKernelContext* ctx) override { + xla::ComputationBuilder* b = ctx->builder(); + + DataType type = ctx->input_type(2); + + DataType var_type, accum_type; + TensorShape var_shape, accum_shape; + OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &var_type, &var_shape)); + OP_REQUIRES_OK(ctx, + ctx->GetVariableTypeAndShape(1, &accum_type, &accum_shape)); + + OP_REQUIRES( + ctx, type == var_type && type == accum_type, + errors::InvalidArgument( + "Types of variable arguments to ResourceApplyAdagrad must match: ", + DataTypeString(type), " vs. 
", DataTypeString(var_type), " and ", + DataTypeString(accum_type))); + + OP_REQUIRES(ctx, var_shape.IsSameSize(accum_shape), + errors::InvalidArgument( + "var and accum do not have the same shape", + var_shape.DebugString(), " ", accum_shape.DebugString())); + + TensorShape lr_shape = ctx->InputShape(2); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr_shape), + errors::InvalidArgument("lr is not a scalar: ", + lr_shape.DebugString())); + + TensorShape grad_shape = ctx->InputShape(3); + OP_REQUIRES(ctx, var_shape.IsSameSize(grad_shape), + errors::InvalidArgument( + "var and grad do not have the same shape", + var_shape.DebugString(), " ", grad_shape.DebugString())); + + xla::ComputationDataHandle var, accum; + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &var)); + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, &accum)); + xla::ComputationDataHandle lr = ctx->Input(2); + xla::ComputationDataHandle grad = ctx->Input(3); + + accum = b->Add(accum, b->Pow(grad, XlaHelpers::FloatLiteral(b, type, 2.0))); + var = b->Sub( + var, b->Mul(b->Mul(grad, lr), + b->Pow(accum, XlaHelpers::FloatLiteral(b, type, -0.5)))); + OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, type, var)); + OP_REQUIRES_OK(ctx, ctx->AssignVariable(1, type, accum)); + } +}; +REGISTER_XLA_OP(Name("ResourceApplyAdagrad"), ResourceApplyAdagrad); + +class ResourceApplyAdam : public XlaOpKernel { + public: + explicit ResourceApplyAdam(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + DataType var_type, m_type, v_type; + TensorShape var_shape, m_shape, v_shape; + OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &var_type, &var_shape)); + OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(1, &m_type, &m_shape)); + OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(2, &v_type, &v_shape)); + + OP_REQUIRES( + ctx, dtype_ == var_type && dtype_ == m_type && dtype_ == v_type, + errors::InvalidArgument( + "Types of variable arguments to ResourceApplyRMSProp must match: ", + DataTypeString(dtype_), " vs. ", DataTypeString(var_type), " vs. ", + DataTypeString(m_type), " vs. 
", DataTypeString(v_type))); + + TensorShape beta1_power_shape = ctx->InputShape(3); + TensorShape beta2_power_shape = ctx->InputShape(4); + TensorShape lr_shape = ctx->InputShape(5); + TensorShape beta1_shape = ctx->InputShape(6); + TensorShape beta2_shape = ctx->InputShape(7); + TensorShape epsilon_shape = ctx->InputShape(8); + TensorShape grad_shape = ctx->InputShape(9); + + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1_power_shape), + errors::InvalidArgument("beta1_power is not a scalar: ", + beta1_power_shape.DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2_power_shape), + errors::InvalidArgument("beta2_power is not a scalar: ", + beta2_power_shape.DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr_shape), + errors::InvalidArgument("lr is not a scalar : ", + lr_shape.DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1_shape), + errors::InvalidArgument("beta1 is not a scalar: ", + beta1_shape.DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2_shape), + errors::InvalidArgument("beta2 is not a scalar: ", + beta2_shape.DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(epsilon_shape), + errors::InvalidArgument("epsilon is not a scalar: ", + epsilon_shape.DebugString())); + + OP_REQUIRES(ctx, var_shape.IsSameSize(m_shape), + errors::InvalidArgument("var and m do not have the same shape", + var_shape.DebugString(), " ", + m_shape.DebugString())); + OP_REQUIRES(ctx, var_shape.IsSameSize(v_shape), + errors::InvalidArgument("var and v do not have the same shape", + var_shape.DebugString(), " ", + v_shape.DebugString())); + OP_REQUIRES(ctx, var_shape.IsSameSize(grad_shape), + errors::InvalidArgument( + "var and grad do not have the same shape", + var_shape.DebugString(), " ", grad_shape.DebugString())); + + xla::ComputationDataHandle var, m, v; + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &var)); + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, &m)); + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(2, &v)); + xla::ComputationDataHandle beta1_power = ctx->Input(3); + xla::ComputationDataHandle beta2_power = ctx->Input(4); + xla::ComputationDataHandle lr = ctx->Input(5); + xla::ComputationDataHandle beta1 = ctx->Input(6); + xla::ComputationDataHandle beta2 = ctx->Input(7); + xla::ComputationDataHandle epsilon = ctx->Input(8); + xla::ComputationDataHandle grad = ctx->Input(9); + + // alpha <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t) + // m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t + // v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t + // variable <- variable - alpha * m_t / (sqrt(v_t) + epsilon) + + xla::ComputationBuilder* b = ctx->builder(); + xla::ComputationDataHandle half = XlaHelpers::FloatLiteral(b, dtype_, 0.5); + xla::ComputationDataHandle one = XlaHelpers::FloatLiteral(b, dtype_, 1.0); + xla::ComputationDataHandle two = XlaHelpers::FloatLiteral(b, dtype_, 2.0); + + xla::ComputationDataHandle alpha = + b->Div(b->Mul(lr, b->Pow(b->Sub(one, beta2_power), half)), + b->Sub(one, beta1_power)); + m = b->Add(m, b->Mul(b->Sub(grad, m), b->Sub(one, beta1))); + v = b->Add(v, b->Mul(b->Sub(b->Pow(grad, two), v), b->Sub(one, beta2))); + var = + b->Sub(var, b->Div(b->Mul(m, alpha), b->Add(b->Pow(v, half), epsilon))); + + OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, dtype_, var)); + OP_REQUIRES_OK(ctx, ctx->AssignVariable(1, dtype_, m)); + OP_REQUIRES_OK(ctx, ctx->AssignVariable(2, dtype_, v)); + } + + private: + DataType dtype_; +}; +REGISTER_XLA_OP(Name("ResourceApplyAdam"), ResourceApplyAdam); + +class 
ResourceApplyRMSProp : public XlaOpKernel { + public: + explicit ResourceApplyRMSProp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} + + void Compile(XlaOpKernelContext* ctx) override { + xla::ComputationBuilder* b = ctx->builder(); + + DataType type = ctx->input_type(3); + + DataType var_type, ms_type, mom_type; + TensorShape var_shape, ms_shape, mom_shape; + OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &var_type, &var_shape)); + OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(1, &ms_type, &ms_shape)); + OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(2, &mom_type, &mom_shape)); + + OP_REQUIRES( + ctx, type == var_type && type == ms_type && type == mom_type, + errors::InvalidArgument( + "Types of variable arguments to ResourceApplyRMSProp must match: ", + DataTypeString(type), " vs. ", DataTypeString(var_type), " vs. ", + DataTypeString(ms_type), " vs. ", DataTypeString(mom_type))); + + TensorShape lr_shape = ctx->InputShape(3); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr_shape), + errors::InvalidArgument("lr is not a scalar: ", + lr_shape.DebugString())); + TensorShape rho_shape = ctx->InputShape(4); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(rho_shape), + errors::InvalidArgument("rho is not a scalar: ", + rho_shape.DebugString())); + TensorShape momentum_shape = ctx->InputShape(5); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(momentum_shape), + errors::InvalidArgument("momentum is not a scalar: ", + momentum_shape.DebugString())); + TensorShape epsilon_shape = ctx->InputShape(6); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(epsilon_shape), + errors::InvalidArgument("epsilon is not a scalar: ", + epsilon_shape.DebugString())); + TensorShape grad_shape = ctx->InputShape(7); + + // var should be the same shape as mom and ms. + OP_REQUIRES(ctx, var_shape.IsSameSize(ms_shape), + errors::InvalidArgument("var and ms do not have the same shape", + var_shape.DebugString(), " ", + ms_shape.DebugString())); + OP_REQUIRES(ctx, var_shape.IsSameSize(mom_shape), + errors::InvalidArgument( + "var and mom do not have the same shape", + var_shape.DebugString(), " ", mom_shape.DebugString())); + OP_REQUIRES(ctx, var_shape.IsSameSize(grad_shape), + errors::InvalidArgument( + "var and grad do not have the same shape", + var_shape.DebugString(), " ", grad_shape.DebugString())); + + xla::ComputationDataHandle var, ms, mom; + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &var)); + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, &ms)); + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(2, &mom)); + xla::ComputationDataHandle lr = ctx->Input(3); + xla::ComputationDataHandle rho = ctx->Input(4); + xla::ComputationDataHandle momentum = ctx->Input(5); + xla::ComputationDataHandle epsilon = ctx->Input(6); + xla::ComputationDataHandle grad = ctx->Input(7); + + // ms <- rho * ms_{t-1} + (1-rho) * grad * grad + // mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) + // var <- var - mom + // + // We use an alternate formulation of the ms equation: + // + // ms <- ms + (grad**2 - ms) * (1 - rho) + // + // Which expands to: + // + // ms <- ms + grad**2 - rho * grad ** 2 - ms + ms * rho + // + // Which simplifies to: + // + // ms <- grad**2 (1 - rho) + ms * rho + // + // Which is the equation listed above. 
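A quick host-side check of the rearrangement derived in the comment above, in plain C++ and not part of the patch; both forms of the ms update agree up to rounding error:

```c++
#include <cassert>
#include <cmath>

int main() {
  const double ms = 0.3, grad = 1.7, rho = 0.9;
  // Canonical form: ms <- rho * ms + (1 - rho) * grad^2.
  const double canonical = rho * ms + (1.0 - rho) * grad * grad;
  // Rearranged form used below: ms <- ms + (grad^2 - ms) * (1 - rho).
  const double rearranged = ms + (grad * grad - ms) * (1.0 - rho);
  assert(std::abs(canonical - rearranged) < 1e-12);
  return 0;
}
```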
+ xla::ComputationDataHandle new_ms = b->Add( + ms, + b->Mul(b->Sub(b->Pow(grad, XlaHelpers::FloatLiteral(b, type, 2.0)), ms), + b->Sub(XlaHelpers::FloatLiteral(b, type, 1.0), rho))); + xla::ComputationDataHandle new_mom = + b->Add(b->Mul(mom, momentum), + b->Div(b->Mul(grad, lr), + b->Pow(b->Add(new_ms, epsilon), + XlaHelpers::FloatLiteral(b, type, 0.5)))); + xla::ComputationDataHandle new_var = b->Sub(var, new_mom); + + OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, type, new_var)); + OP_REQUIRES_OK(ctx, ctx->AssignVariable(1, type, new_ms)); + OP_REQUIRES_OK(ctx, ctx->AssignVariable(2, type, new_mom)); + } +}; +REGISTER_XLA_OP(Name("ResourceApplyRMSProp"), ResourceApplyRMSProp); + +class ResourceApplyFtrl : public XlaOpKernel { + public: + explicit ResourceApplyFtrl(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + xla::ComputationBuilder* b = ctx->builder(); + + DataType var_type, accum_type, linear_type; + TensorShape var_shape, accum_shape, linear_shape; + OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &var_type, &var_shape)); + OP_REQUIRES_OK(ctx, + ctx->GetVariableTypeAndShape(1, &accum_type, &accum_shape)); + OP_REQUIRES_OK( + ctx, ctx->GetVariableTypeAndShape(2, &linear_type, &linear_shape)); + + OP_REQUIRES( + ctx, + dtype_ == var_type && dtype_ == accum_type && dtype_ == linear_type, + errors::InvalidArgument( + "Types of variable arguments to ResourceApplyFtrl must match: ", + DataTypeString(dtype_), " vs. ", DataTypeString(var_type), " and ", + DataTypeString(accum_type), " and ", DataTypeString(linear_type))); + + OP_REQUIRES(ctx, var_shape.IsSameSize(accum_shape), + errors::InvalidArgument( + "var and accum do not have the same shape", + var_shape.DebugString(), " ", accum_shape.DebugString())); + + OP_REQUIRES(ctx, var_shape.IsSameSize(linear_shape), + errors::InvalidArgument( + "var and linear do not have the same shape", + var_shape.DebugString(), " ", linear_shape.DebugString())); + + TensorShape grad_shape = ctx->InputShape(3); + TensorShape lr_shape = ctx->InputShape(4); + TensorShape l1_shape = ctx->InputShape(5); + TensorShape l2_shape = ctx->InputShape(6); + TensorShape lr_power_shape = ctx->InputShape(7); + + OP_REQUIRES(ctx, var_shape.IsSameSize(grad_shape), + errors::InvalidArgument( + "var and grad do not have the same shape", + var_shape.DebugString(), " ", grad_shape.DebugString())); + + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr_shape), + errors::InvalidArgument("lr is not a scalar: ", + lr_shape.DebugString())); + + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(l1_shape), + errors::InvalidArgument("l1 is not a scalar: ", + l1_shape.DebugString())); + + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(l2_shape), + errors::InvalidArgument("l2 is not a scalar: ", + l2_shape.DebugString())); + + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr_power_shape), + errors::InvalidArgument("lr_power is not a scalar: ", + lr_power_shape.DebugString())); + + xla::ComputationDataHandle var, accum, linear; + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &var)); + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, &accum)); + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(2, &linear)); + xla::ComputationDataHandle grad = ctx->Input(3); + xla::ComputationDataHandle lr = ctx->Input(4); + xla::ComputationDataHandle l1 = ctx->Input(5); + xla::ComputationDataHandle l2 = ctx->Input(6); + xla::ComputationDataHandle lr_power = ctx->Input(7); + + // new_accum = accum + grad * grad + // 
linear += grad - (new_accum^(-lr_power) - accum^(-lr_power)) / lr * var + // quadratic = (new_accum^(-lr_power) / lr) + 2 * l2 + // var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 + // accum = new_accum + + xla::ComputationDataHandle zero_broadcast = b->Broadcast( + XlaHelpers::FloatLiteral(b, dtype_, 0.0), var_shape.dim_sizes()); + xla::ComputationDataHandle two = XlaHelpers::FloatLiteral(b, dtype_, 2.0); + + xla::ComputationDataHandle new_accum = b->Add(accum, b->Pow(grad, two)); + xla::ComputationDataHandle new_accum_lr_pow = + b->Pow(new_accum, b->Neg(lr_power)); + xla::ComputationDataHandle accum_lr_pow = b->Pow(accum, b->Neg(lr_power)); + linear = b->Add( + linear, + b->Sub(grad, b->Mul(b->Div(b->Sub(new_accum_lr_pow, accum_lr_pow), lr), + var))); + xla::ComputationDataHandle quadratic = + b->Add(b->Div(new_accum_lr_pow, lr), b->Mul(two, l2)); + xla::ComputationDataHandle pre_shrink = + b->Div(b->Sub(b->Mul(l1, b->Sign(linear)), linear), quadratic); + var = b->Select(b->Gt(b->Abs(linear), l1), pre_shrink, zero_broadcast); + accum = new_accum; + + OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, dtype_, var)); + OP_REQUIRES_OK(ctx, ctx->AssignVariable(1, dtype_, accum)); + OP_REQUIRES_OK(ctx, ctx->AssignVariable(2, dtype_, linear)); + } + + private: + DataType dtype_; +}; +REGISTER_XLA_OP(Name("ResourceApplyFtrl"), ResourceApplyFtrl); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/transpose_op.cc b/tensorflow/compiler/tf2xla/kernels/transpose_op.cc index 2840abc8782..2fc5d40d105 100644 --- a/tensorflow/compiler/tf2xla/kernels/transpose_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/transpose_op.cc @@ -20,9 +20,9 @@ limitations under the License. #include "tensorflow/core/kernels/transpose_op.h" #include "tensorflow/compiler/tf2xla/type_util.h" -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/kernels/bounds_check.h" @@ -87,7 +87,7 @@ class TransposeOp : public XlaOpKernel { } }; -REGISTER_XLA_OP("Transpose", TransposeOp); +REGISTER_XLA_OP(Name("Transpose"), TransposeOp); // InvertPermutation frequently forms part of the gradient of Transpose. // @@ -128,7 +128,8 @@ class InvertPermutationOp : public XlaOpKernel { } }; -REGISTER_XLA_OP("InvertPermutation", InvertPermutationOp); +REGISTER_XLA_OP(Name("InvertPermutation").TypeConstraint("T", DT_INT32), + InvertPermutationOp); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc index c3ba1a7a8b0..abe4949f5db 100644 --- a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc @@ -16,8 +16,8 @@ limitations under the License. 
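The final Select in ResourceApplyFtrl above is a soft-threshold: any weight whose |linear| stays at or below l1 is pinned to exactly zero, which is how FTRL yields sparse models. A scalar sketch with a hypothetical helper name:

```c++
#include <cmath>

// Scalar form of the Select step above: below the l1 threshold the
// weight is exactly zero; otherwise it is the shrunk value
// (sign(linear) * l1 - linear) / quadratic.
double FtrlWeight(double linear, double l1, double quadratic) {
  if (std::abs(linear) <= l1) return 0.0;
  const double sign = linear > 0.0 ? 1.0 : -1.0;
  return (sign * l1 - linear) / quadratic;
}
```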
// Native XLA implementations of simple unary Ops #include "tensorflow/compiler/tf2xla/kernels/cwise_ops.h" -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/core/framework/kernel_def_builder.h" @@ -28,10 +28,10 @@ namespace { // A subclass of a TlaUnaryOp must build the lambda computation that // describes the scalar->scalar function to apply to each element of // the input. -#define XLAJIT_MAKE_UNARY(Name, COMPUTATION) \ - class Name##Op : public XlaOpKernel { \ +#define XLAJIT_MAKE_UNARY(NAME, COMPUTATION) \ + class NAME##Op : public XlaOpKernel { \ public: \ - explicit Name##Op(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} \ + explicit NAME##Op(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} \ void Compile(XlaOpKernelContext* ctx) { \ xla::ComputationBuilder* b = ctx->builder(); \ xla::ComputationDataHandle x = ctx->Input(0); \ @@ -39,7 +39,7 @@ namespace { ctx->SetOutput(0, y); \ } \ }; \ - REGISTER_XLA_OP(#Name, Name##Op); + REGISTER_XLA_OP(Name(#NAME), NAME##Op); // Return x if x>0, otherwise -x. XLAJIT_MAKE_UNARY(Abs, b->Abs(x)); @@ -58,6 +58,27 @@ XLAJIT_MAKE_UNARY(Log1p, b->Log(b->Add(XlaHelpers::One(b, input_type(0)), x))); XLAJIT_MAKE_UNARY(LogicalNot, b->LogicalNot(x)); XLAJIT_MAKE_UNARY(Neg, b->Neg(x)); + +// Implements Banker's rounding: numbers that are equidistant between two +// integers are rounded towards even. +static xla::ComputationDataHandle Round(xla::ComputationBuilder* b, + DataType dtype, + const xla::ComputationDataHandle& x) { + auto half = XlaHelpers::FloatLiteral(b, dtype, 0.5); + auto one = XlaHelpers::FloatLiteral(b, dtype, 1.0); + auto two = XlaHelpers::FloatLiteral(b, dtype, 2.0); + + auto round_val = b->Floor(x); + auto fraction = b->Sub(x, round_val); + auto nearest_even_int = + b->Sub(round_val, b->Mul(two, b->Floor(b->Mul(half, x)))); + auto is_odd = b->Eq(nearest_even_int, one); + return b->Select(b->LogicalOr(b->Gt(fraction, half), + b->LogicalAnd(b->Eq(fraction, half), is_odd)), + b->Add(round_val, one), round_val); +} +XLAJIT_MAKE_UNARY(Round, Round(b, input_type(0), x)); + XLAJIT_MAKE_UNARY(Rsqrt, b->Pow(x, XlaHelpers::FloatLiteral(b, input_type(0), -0.5))); XLAJIT_MAKE_UNARY(Sigmoid, diff --git a/tensorflow/compiler/tf2xla/kernels/unpack_op.cc b/tensorflow/compiler/tf2xla/kernels/unpack_op.cc index c5b2bdaf2dc..f87586ba578 100644 --- a/tensorflow/compiler/tf2xla/kernels/unpack_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/unpack_op.cc @@ -19,9 +19,9 @@ limitations under the License. 
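The Round lowering above implements round-half-to-even: fractions above one half round up, and exact halves go to the even neighbour, so 2.5 rounds to 2 while 3.5 rounds to 4. A scalar restatement in plain C++ for checking that tie behaviour (not part of the patch):

```c++
#include <cmath>

double RoundHalfToEven(double x) {
  const double floor_val = std::floor(x);
  const double fraction = x - floor_val;
  // floor(x) - 2 * floor(x / 2) is 1 exactly when floor(x) is odd,
  // mirroring the nearest_even_int test in the XLA lowering above.
  const bool floor_is_odd = (floor_val - 2.0 * std::floor(x / 2.0)) == 1.0;
  if (fraction > 0.5 || (fraction == 0.5 && floor_is_odd)) {
    return floor_val + 1.0;
  }
  return floor_val;
}
```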
#include #include "tensorflow/compiler/tf2xla/type_util.h" -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" @@ -66,6 +66,7 @@ class UnpackOp : public XlaOpKernel { std::vector start_indices(input_shape.dims(), 0); std::vector limit_indices(input_shape.dims()); + std::vector strides(input_shape.dims(), 1); for (int i = 0; i < input_shape.dims(); ++i) { limit_indices[i] = input_shape.dim_size(i); } @@ -73,7 +74,8 @@ class UnpackOp : public XlaOpKernel { for (int i = 0; i < num; ++i) { start_indices[axis] = i; limit_indices[axis] = i + 1; - auto slice = ctx->builder()->Slice(input, start_indices, limit_indices); + auto slice = ctx->builder()->Slice(input, start_indices, limit_indices, + strides); // Reshape to drop the 'axis' dimension. auto result = ctx->builder()->Reshape(slice, output_shape.dim_sizes()); ctx->SetOutput(i, result); @@ -84,7 +86,7 @@ class UnpackOp : public XlaOpKernel { int axis_; }; -REGISTER_XLA_OP("Unpack", UnpackOp); +REGISTER_XLA_OP(Name("Unpack"), UnpackOp); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/variable_ops.cc b/tensorflow/compiler/tf2xla/kernels/variable_ops.cc new file mode 100644 index 00000000000..1b04b8b802c --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/variable_ops.cc @@ -0,0 +1,90 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/kernels/cwise_ops.h" +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/core/framework/kernel_def_builder.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/no_op.h" + +namespace tensorflow { +namespace { + +class VarIsInitializedOp : public XlaOpKernel { + public: + explicit VarIsInitializedOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} + void Compile(XlaOpKernelContext* ctx) override { + xla::ComputationDataHandle handle; + bool initialized = ctx->ReadVariableInput(0, &handle).ok(); + ctx->SetOutput(0, ctx->builder()->ConstantR0(initialized)); + } +}; +REGISTER_XLA_OP(Name("VarIsInitializedOp"), VarIsInitializedOp); + +class ReadVariableOp : public XlaOpKernel { + public: + explicit ReadVariableOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} + void Compile(XlaOpKernelContext* ctx) override { + xla::ComputationDataHandle handle; + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &handle)); + ctx->SetOutput(0, handle); + } +}; +REGISTER_XLA_OP(Name("ReadVariableOp"), ReadVariableOp); +REGISTER_XLA_OP(Name("_UnsafeReadVariable"), ReadVariableOp); + +class AssignVariableOp : public XlaOpKernel { + public: + explicit AssignVariableOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} + void Compile(XlaOpKernelContext* ctx) override { + OP_REQUIRES_OK(ctx, + ctx->AssignVariable(0, ctx->input_type(1), ctx->Input(1))); + } +}; +REGISTER_XLA_OP(Name("AssignVariableOp"), AssignVariableOp); + +class AssignAddVariableOp : public XlaOpKernel { + public: + explicit AssignAddVariableOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} + void Compile(XlaOpKernelContext* ctx) override { + xla::ComputationDataHandle handle; + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &handle)); + handle = ctx->builder()->Add(handle, ctx->Input(1)); + OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, ctx->input_type(1), handle)); + } +}; +REGISTER_XLA_OP( + Name("AssignAddVariableOp").TypeConstraint("dtype", kNumericTypes), + AssignAddVariableOp); + +class AssignSubVariableOp : public XlaOpKernel { + public: + explicit AssignSubVariableOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} + void Compile(XlaOpKernelContext* ctx) override { + xla::ComputationDataHandle handle; + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &handle)); + handle = ctx->builder()->Sub(handle, ctx->Input(1)); + OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, ctx->input_type(1), handle)); + } +}; +REGISTER_XLA_OP( + Name("AssignSubVariableOp").TypeConstraint("dtype", kNumericTypes), + AssignSubVariableOp); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/literal_util.h b/tensorflow/compiler/tf2xla/literal_util.h index 3e509375efb..fe08e83c239 100644 --- a/tensorflow/compiler/tf2xla/literal_util.h +++ b/tensorflow/compiler/tf2xla/literal_util.h @@ -18,14 +18,15 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_TF2XLA_LITERAL_UTIL_H_ #define TENSORFLOW_COMPILER_TF2XLA_LITERAL_UTIL_H_ +#include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/lib/core/status.h" namespace tensorflow { -// Copies 'host_tensor' to an XLA Literal. 
Fails if the host_tensor has zero -// elements or is of an unsupported type. +// Copies 'host_tensor' to an XLA Literal. Fails if host_tensor is of an +// unsupported type. Status HostTensorToLiteral(const Tensor& host_tensor, xla::Literal* literal); // Copies 'literal' to 'host_tensor', which is allocated of type . diff --git a/tensorflow/compiler/tf2xla/op_registrations.cc b/tensorflow/compiler/tf2xla/op_registrations.cc deleted file mode 100644 index e32070efa32..00000000000 --- a/tensorflow/compiler/tf2xla/op_registrations.cc +++ /dev/null @@ -1,510 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// Kernel registrations for XLA JIT devices. - -#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" -#include "tensorflow/core/framework/op_kernel.h" - -namespace tensorflow { -namespace { - -// CPU JIT device registrations. - -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("_Arg").TypeConstraint("T", kCpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, Name("_ArrayToList")); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, Name("_ListToArray")); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("_Retval").TypeConstraint("T", kCpuAllTypes)); - -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Abs").TypeConstraint("T", kCpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Add").TypeConstraint("T", kCpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("AddN").TypeConstraint("T", kCpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, Name("All")); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, Name("Any")); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("AvgPool").TypeConstraint("T", kCpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("AvgPoolGrad").TypeConstraint("T", kCpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("BatchMatMul").TypeConstraint("T", kCpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("BiasAdd").TypeConstraint("T", kCpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("BiasAddV1").TypeConstraint("T", kCpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("BiasAddGrad").TypeConstraint("T", kCpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, Name("BroadcastGradientArgs")); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Cast") - .TypeConstraint("SrcT", kCpuAllTypes) - .TypeConstraint("DstT", kCpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Ceil").TypeConstraint("T", kCpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Concat").TypeConstraint("T", kCpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("ConcatV2") - .TypeConstraint("T", kCpuAllTypes) - .TypeConstraint("Tidx", DT_INT32)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, Name("ConcatOffset")); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Conv2D").TypeConstraint("T", kCpuFloatTypes)); -REGISTER_XLA_KERNEL( - DEVICE_CPU_XLA_JIT, - 
Name("Conv2DBackpropFilter").TypeConstraint("T", kCpuFloatTypes)); -REGISTER_XLA_KERNEL( - DEVICE_CPU_XLA_JIT, - Name("Conv2DBackpropInput").TypeConstraint("T", kCpuFloatTypes)); -REGISTER_XLA_KERNEL( - DEVICE_CPU_XLA_JIT, - Name("DepthwiseConv2dNative").TypeConstraint("T", kCpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Diag").TypeConstraint("T", kCpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("DiagPart").TypeConstraint("T", kCpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Div").TypeConstraint("T", kCpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("DynamicStitch").TypeConstraint("T", kCpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Equal").TypeConstraint("T", kCpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Exp").TypeConstraint("T", kCpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("ExpandDims").TypeConstraint("T", kCpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Fill").TypeConstraint("T", kCpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Floor").TypeConstraint("T", kCpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("FloorDiv").TypeConstraint("T", kCpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("FloorMod").TypeConstraint("T", kCpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Greater").TypeConstraint("T", kCpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("GreaterEqual").TypeConstraint("T", kCpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Inv").TypeConstraint("T", kCpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Reciprocal").TypeConstraint("T", kCpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("InvertPermutation").TypeConstraint("T", DT_INT32)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("L2Loss").TypeConstraint("T", kCpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Less").TypeConstraint("T", kCpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("LessEqual").TypeConstraint("T", kCpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("LinSpace").TypeConstraint("T", kCpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Log").TypeConstraint("T", kCpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Log1p").TypeConstraint("T", kCpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, Name("LogicalAnd")); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, Name("LogicalNot")); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, Name("LogicalOr")); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("LogSoftmax").TypeConstraint("T", kCpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("LRN").TypeConstraint("T", kCpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("LRNGrad").TypeConstraint("T", kCpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Maximum").TypeConstraint("T", kCpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("MatMul").TypeConstraint("T", kCpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("MatrixDiag").TypeConstraint("T", kCpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("MatrixDiagPart").TypeConstraint("T", kCpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Max").TypeConstraint("T", kCpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("MaxPool").TypeConstraint("T", kCpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - 
Name("MaxPoolGrad").TypeConstraint("T", kCpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Mean").TypeConstraint("T", kCpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Min").TypeConstraint("T", kCpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Minimum").TypeConstraint("T", kCpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Mod").TypeConstraint("T", kCpuIntTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Mul").TypeConstraint("T", kCpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Neg").TypeConstraint("T", kCpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("NotEqual").TypeConstraint("T", kCpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Pack").TypeConstraint("T", kCpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Pad").TypeConstraint("T", kCpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Pow").TypeConstraint("T", kCpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("PreventGradient").TypeConstraint("T", kCpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Prod").TypeConstraint("T", kCpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Range").TypeConstraint("Tidx", kCpuNumericTypes)); -// TODO(b/34339814): implement inverse erf for double types and update the -// type constraint. -REGISTER_XLA_KERNEL( - DEVICE_CPU_XLA_JIT, - Name("RandomStandardNormal").TypeConstraint("dtype", DT_FLOAT)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, Name("RandomUniform")); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, Name("RandomUniformInt")); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, Name("Rank")); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("RealDiv").TypeConstraint("T", kCpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Relu").TypeConstraint("T", kCpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Relu6").TypeConstraint("T", kCpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("ReluGrad").TypeConstraint("T", kCpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Relu6Grad").TypeConstraint("T", kCpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Reshape").TypeConstraint("T", kCpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Rsqrt").TypeConstraint("T", kCpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("RsqrtGrad").TypeConstraint("T", kCpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Select").TypeConstraint("T", kCpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, Name("Shape")); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, Name("ShapeN")); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Sigmoid").TypeConstraint("T", kCpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("SigmoidGrad").TypeConstraint("T", kCpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Sign").TypeConstraint("T", kCpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, Name("Size")); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Slice").TypeConstraint("T", kCpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Softmax").TypeConstraint("T", kCpuFloatTypes)); -REGISTER_XLA_KERNEL( - DEVICE_CPU_XLA_JIT, - Name("SoftmaxCrossEntropyWithLogits").TypeConstraint("T", kCpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Softplus").TypeConstraint("T", kCpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("SoftplusGrad").TypeConstraint("T", kCpuFloatTypes)); 
-REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("SparseMatMul") - .TypeConstraint("Ta", kCpuFloatTypes) - .TypeConstraint("Tb", kCpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Split").TypeConstraint("T", kCpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("SplitV").TypeConstraint("T", kCpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Square").TypeConstraint("T", kCpuNumericTypes)); -REGISTER_XLA_KERNEL( - DEVICE_CPU_XLA_JIT, - Name("SquaredDifference").TypeConstraint("T", kCpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Squeeze").TypeConstraint("T", kCpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Sqrt").TypeConstraint("T", kCpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("StopGradient").TypeConstraint("T", kCpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("StridedSlice").TypeConstraint("T", kCpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("StridedSliceGrad").TypeConstraint("T", kCpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Sub").TypeConstraint("T", kCpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Sum").TypeConstraint("T", kCpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, Name("SymbolicGradient")); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Tanh").TypeConstraint("T", kCpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("TanhGrad").TypeConstraint("T", kCpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Tile").TypeConstraint("T", kCpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Transpose").TypeConstraint("T", kCpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("TruncateDiv").TypeConstraint("T", kCpuIntTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("TruncateMod").TypeConstraint("T", kCpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Unpack").TypeConstraint("T", kCpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_CPU_XLA_JIT, - Name("ZerosLike").TypeConstraint("T", kCpuNumericTypes)); - -REGISTER_XLA_JIT_ONLY_KERNEL(DEVICE_CPU_XLA_JIT, - Name("Const").TypeConstraint("dtype", - kCpuAllTypes)); -REGISTER_XLA_JIT_ONLY_KERNEL( - DEVICE_CPU_XLA_JIT, Name("Identity").TypeConstraint("T", kCpuAllTypes)); -REGISTER_XLA_JIT_ONLY_KERNEL(DEVICE_CPU_XLA_JIT, Name("NoOp")); - -// GPU JIT device registrations - -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("_Arg").TypeConstraint("T", kGpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, Name("_ArrayToList")); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, Name("_ListToArray")); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("_Retval").TypeConstraint("T", kGpuAllTypes)); - -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Abs").TypeConstraint("T", kGpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Add").TypeConstraint("T", kGpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("AddN").TypeConstraint("T", kGpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, Name("All")); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, Name("Any")); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("AvgPool").TypeConstraint("T", kGpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("AvgPoolGrad").TypeConstraint("T", kGpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("BatchMatMul").TypeConstraint("T", kGpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("BiasAdd").TypeConstraint("T", kGpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - 
Name("BiasAddV1").TypeConstraint("T", kGpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("BiasAddGrad").TypeConstraint("T", kGpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, Name("BroadcastGradientArgs")); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Cast") - .TypeConstraint("SrcT", kGpuAllTypes) - .TypeConstraint("DstT", kGpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Ceil").TypeConstraint("T", kGpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Concat").TypeConstraint("T", kGpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("ConcatV2").TypeConstraint("T", kGpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, Name("ConcatOffset")); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Conv2D").TypeConstraint("T", kGpuFloatTypes)); -REGISTER_XLA_KERNEL( - DEVICE_GPU_XLA_JIT, - Name("Conv2DBackpropFilter").TypeConstraint("T", kGpuFloatTypes)); -REGISTER_XLA_KERNEL( - DEVICE_GPU_XLA_JIT, - Name("Conv2DBackpropInput").TypeConstraint("T", kGpuFloatTypes)); -REGISTER_XLA_KERNEL( - DEVICE_GPU_XLA_JIT, - Name("DepthwiseConv2dNative").TypeConstraint("T", kGpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Diag").TypeConstraint("T", kGpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("DiagPart").TypeConstraint("T", kGpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Div").TypeConstraint("T", kGpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("DynamicStitch").TypeConstraint("T", kGpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Equal").TypeConstraint("T", kGpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Exp").TypeConstraint("T", kGpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("ExpandDims").TypeConstraint("T", kGpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Fill").TypeConstraint("T", kGpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Floor").TypeConstraint("T", kGpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("FloorDiv").TypeConstraint("T", kGpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("FloorMod").TypeConstraint("T", kGpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Greater").TypeConstraint("T", kGpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("GreaterEqual").TypeConstraint("T", kGpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Inv").TypeConstraint("T", kGpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Reciprocal").TypeConstraint("T", kGpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("InvertPermutation").TypeConstraint("T", DT_INT32)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("L2Loss").TypeConstraint("T", kGpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Less").TypeConstraint("T", kGpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("LessEqual").TypeConstraint("T", kGpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("LinSpace").TypeConstraint("T", kGpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Log").TypeConstraint("T", kGpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Log1p").TypeConstraint("T", kGpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, Name("LogicalAnd")); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, Name("LogicalNot")); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, Name("LogicalOr")); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("LogSoftmax").TypeConstraint("T", 
kGpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("LRN").TypeConstraint("T", kGpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("LRNGrad").TypeConstraint("T", kGpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Maximum").TypeConstraint("T", kGpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("MatMul").TypeConstraint("T", kGpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("MatrixDiag").TypeConstraint("T", kGpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("MatrixDiagPart").TypeConstraint("T", kGpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Max").TypeConstraint("T", kGpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("MaxPool").TypeConstraint("T", kGpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("MaxPoolGrad").TypeConstraint("T", kGpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Mean").TypeConstraint("T", kGpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Min").TypeConstraint("T", kGpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Minimum").TypeConstraint("T", kGpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Mod").TypeConstraint("T", kGpuIntTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Mul").TypeConstraint("T", kGpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Neg").TypeConstraint("T", kGpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("NotEqual").TypeConstraint("T", kGpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Pack").TypeConstraint("T", kGpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Pad").TypeConstraint("T", kGpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Pow").TypeConstraint("T", kGpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("PreventGradient").TypeConstraint("T", kGpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Prod").TypeConstraint("T", kGpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Range").TypeConstraint("Tidx", kGpuNumericTypes)); -// TODO(b/31361304): disabled because of XLA bugs. 
-// REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, Name("RandomStandardNormal")); -// REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, Name("RandomUniform")); -// REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, Name("RandomUniformInt")); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, Name("Rank")); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("RealDiv").TypeConstraint("T", kGpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Relu").TypeConstraint("T", kGpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Relu6").TypeConstraint("T", kGpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("ReluGrad").TypeConstraint("T", kGpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Relu6Grad").TypeConstraint("T", kGpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Reshape").TypeConstraint("T", kGpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Rsqrt").TypeConstraint("T", kGpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("RsqrtGrad").TypeConstraint("T", kGpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Select").TypeConstraint("T", kGpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, Name("Shape")); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, Name("ShapeN")); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Sigmoid").TypeConstraint("T", kGpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("SigmoidGrad").TypeConstraint("T", kGpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Sign").TypeConstraint("T", kGpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, Name("Size")); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Slice").TypeConstraint("T", kGpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Softmax").TypeConstraint("T", kGpuFloatTypes)); -REGISTER_XLA_KERNEL( - DEVICE_GPU_XLA_JIT, - Name("SoftmaxCrossEntropyWithLogits").TypeConstraint("T", kGpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Softplus").TypeConstraint("T", kGpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("SoftplusGrad").TypeConstraint("T", kGpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("SparseMatMul") - .TypeConstraint("Ta", kGpuFloatTypes) - .TypeConstraint("Tb", kGpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Split").TypeConstraint("T", kGpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("SplitV").TypeConstraint("T", kGpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Square").TypeConstraint("T", kGpuNumericTypes)); -REGISTER_XLA_KERNEL( - DEVICE_GPU_XLA_JIT, - Name("SquaredDifference").TypeConstraint("T", kGpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Squeeze").TypeConstraint("T", kGpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Sqrt").TypeConstraint("T", kGpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("StopGradient").TypeConstraint("T", kGpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("StridedSlice").TypeConstraint("T", kGpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("StridedSliceGrad").TypeConstraint("T", kGpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Sub").TypeConstraint("T", kGpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Sum").TypeConstraint("T", kGpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, Name("SymbolicGradient")); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Tanh").TypeConstraint("T", kGpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - 
Name("TanhGrad").TypeConstraint("T", kGpuFloatTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Tile").TypeConstraint("T", kGpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Transpose").TypeConstraint("T", kGpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("TruncateDiv").TypeConstraint("T", kGpuIntTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("TruncateMod").TypeConstraint("T", kGpuNumericTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Unpack").TypeConstraint("T", kGpuAllTypes)); -REGISTER_XLA_KERNEL(DEVICE_GPU_XLA_JIT, - Name("ZerosLike").TypeConstraint("T", kGpuNumericTypes)); - -REGISTER_XLA_JIT_ONLY_KERNEL(DEVICE_GPU_XLA_JIT, - Name("Const").TypeConstraint("dtype", - kGpuAllTypes)); -REGISTER_XLA_JIT_ONLY_KERNEL( - DEVICE_GPU_XLA_JIT, Name("Identity").TypeConstraint("T", kGpuAllTypes)); -REGISTER_XLA_JIT_ONLY_KERNEL(DEVICE_GPU_XLA_JIT, Name("NoOp")); - -} // anonymous namespace -} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/str_util.cc b/tensorflow/compiler/tf2xla/str_util.cc index ce25d631271..2b0834fe7b6 100644 --- a/tensorflow/compiler/tf2xla/str_util.cc +++ b/tensorflow/compiler/tf2xla/str_util.cc @@ -22,7 +22,7 @@ limitations under the License. namespace tensorflow { namespace str_util { -void ReplaceAll(string* text, StringPiece from, StringPiece to) { +static void ReplaceAll(string* text, StringPiece from, StringPiece to) { size_t pos = 0; while ((pos = text->find(from.data(), pos, from.size())) != string::npos) { text->replace(pos, from.size(), to.data(), to.size()); diff --git a/tensorflow/compiler/tf2xla/str_util.h b/tensorflow/compiler/tf2xla/str_util.h index 4920b1a4d48..51f25009d70 100644 --- a/tensorflow/compiler/tf2xla/str_util.h +++ b/tensorflow/compiler/tf2xla/str_util.h @@ -29,10 +29,6 @@ limitations under the License. namespace tensorflow { namespace str_util { -// Replace all non-overlapping occurrences of from with to in-place in text. If -// from is empty, it matches at the beginning of the text and after every byte. -void ReplaceAll(string* text, StringPiece from, StringPiece to); - // Replace all non-overlapping occurrences of the given (from,to) pairs in-place // in text. If from is empty, it matches at the beginning of the text and after // every byte. Each (from,to) replacement pair is processed in the order it is diff --git a/tensorflow/compiler/tf2xla/str_util_test.cc b/tensorflow/compiler/tf2xla/str_util_test.cc index f992007a345..8817f6902a8 100644 --- a/tensorflow/compiler/tf2xla/str_util_test.cc +++ b/tensorflow/compiler/tf2xla/str_util_test.cc @@ -25,36 +25,6 @@ limitations under the License. 
namespace tensorflow { namespace str_util { -class ReplaceAllTest : public ::testing::Test { - protected: - void ExpectReplaceAll(string text, StringPiece from, StringPiece to, - StringPiece want) { - ReplaceAll(&text, from, to); - EXPECT_EQ(text, want); - } -}; - -TEST_F(ReplaceAllTest, Simple) { - ExpectReplaceAll("", "", "", ""); - ExpectReplaceAll("", "", "X", "X"); - ExpectReplaceAll("", "", "XYZ", "XYZ"); - ExpectReplaceAll("banana", "", "", "banana"); - ExpectReplaceAll("banana", "", "_", "_b_a_n_a_n_a_"); - ExpectReplaceAll("banana", "", "__", "__b__a__n__a__n__a__"); - ExpectReplaceAll("banana", "a", "a", "banana"); - ExpectReplaceAll("banana", "a", "", "bnn"); - ExpectReplaceAll("banana", "a", "X", "bXnXnX"); - ExpectReplaceAll("banana", "a", "XX", "bXXnXXnXX"); - ExpectReplaceAll("banana", "an", "an", "banana"); - ExpectReplaceAll("banana", "an", "", "ba"); - ExpectReplaceAll("banana", "an", "X", "bXXa"); - ExpectReplaceAll("banana", "an", "XY", "bXYXYa"); - ExpectReplaceAll("banana", "an", "XYZ", "bXYZXYZa"); - ExpectReplaceAll("foo {{bar}} baz {{bar}}", "{{bar}}", "X", "foo X baz X"); - ExpectReplaceAll("foo {{bar}} baz {{bar}}", "{{bar}}", "ABCDEFGHIJKLMNOP", - "foo ABCDEFGHIJKLMNOP baz ABCDEFGHIJKLMNOP"); -} - class ReplaceAllPairsTest : public ::testing::Test { protected: void ExpectReplaceAllPairs( diff --git a/tensorflow/compiler/tf2xla/xla_compilation_device.cc b/tensorflow/compiler/tf2xla/xla_compilation_device.cc index ad3c9217440..1d0098591e3 100644 --- a/tensorflow/compiler/tf2xla/xla_compilation_device.cc +++ b/tensorflow/compiler/tf2xla/xla_compilation_device.cc @@ -18,20 +18,13 @@ limitations under the License. #include #include -#include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_context.h" -#include "tensorflow/compiler/xla/client/client_library.h" -#include "tensorflow/core/common_runtime/device_factory.h" #include "tensorflow/core/common_runtime/local_device.h" #include "tensorflow/core/framework/device_base.h" #include "tensorflow/core/platform/mem.h" -#include "tensorflow/core/platform/stream_executor_no_cuda.h" namespace tensorflow { -const char* const DEVICE_CPU_XLA_JIT = "XLA_CPU_JIT"; -const char* const DEVICE_GPU_XLA_JIT = "XLA_GPU_JIT"; - // The XlaCompilationAllocator doesn't actually back any Tensors with storage // buffers of values: instead for each Tensor it stores a // XlaExpression which corresponds to the XLA computation @@ -41,13 +34,12 @@ class XlaCompilationAllocator : public Allocator { XlaCompilationAllocator() {} ~XlaCompilationAllocator() override {} - string Name() override { return "tla_jit"; } + string Name() override { return "xla_compilation"; } void* AllocateRaw(size_t alignment, size_t num_bytes) override { - // Regardless of the size requested, always allocate a - // XlaExpression. Respect the aligment request because there is - // alignment checking even for Tensors whose data is never - // accessed. + // Regardless of the size requested, always allocates an XlaExpression. + // Respects the alignment request because there is alignment checking even + // for Tensors whose data is never accessed. 
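The rewritten comment above describes the core trick of the compilation device: the allocator ignores the requested byte count and always hands back a single fixed-size symbolic-expression object, so no real tensor storage is ever created during symbolic execution. A minimal standalone analog, not TensorFlow code, using posix_memalign in place of port::AlignedMalloc:

```c++
// Standalone analog of the allocation trick described above: every "tensor
// buffer" allocation actually returns one fixed-size symbolic-expression
// object, so no real data storage is ever created.
#include <cstdlib>
#include <new>

struct Expr {       // Stand-in for XlaExpression.
  int handle = 0;   // Would hold an xla::ComputationDataHandle.
};

void* AllocateRaw(std::size_t alignment, std::size_t /*num_bytes*/) {
  // Ignore the requested size; honor alignment because callers may check it
  // even for tensors whose data is never accessed.
  void* p = nullptr;
  if (posix_memalign(&p, alignment, sizeof(Expr)) != 0) return nullptr;
  return new (p) Expr();  // Placement-new the expression into the raw buffer.
}

void DeallocateRaw(void* p) {
  static_cast<Expr*>(p)->~Expr();
  std::free(p);
}

int main() {
  void* p = AllocateRaw(64, 1 << 20);  // Caller asks for 1 MiB, gets an Expr.
  DeallocateRaw(p);
}
```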
void* p = port::AlignedMalloc(sizeof(XlaExpression), alignment); XlaExpression* expression = reinterpret_cast<XlaExpression*>(p); new (expression) XlaExpression(); @@ -80,11 +72,11 @@ class XlaCompilationAllocator : public Allocator { XlaCompilationDevice::XlaCompilationDevice(const SessionOptions& options, DeviceType type) - : LocalDevice(options, - Device::BuildDeviceAttributes( - "", type, Bytes(256 << 20), DeviceLocality(), - strings::StrCat("device: XLA JIT device ", type.type())), - cpu_allocator()), + : LocalDevice( + options, + Device::BuildDeviceAttributes( + "", type, Bytes(256 << 20), DeviceLocality(), + strings::StrCat("device: XLA compilation device ", type.type()))), allocator_(new XlaCompilationAllocator()) {} XlaCompilationDevice::~XlaCompilationDevice() {} @@ -93,112 +85,38 @@ Allocator* XlaCompilationDevice::GetAllocator(AllocatorAttributes attr) { return allocator_.get(); } +void XlaCompilationDevice::Compute(OpKernel* op_kernel, + OpKernelContext* context) { + VLOG(1) << "XlaCompilationDevice::Compute " + << SummarizeNodeDef(op_kernel->def()); + auto* b = XlaContext::Get(context).builder(); + xla::OpMetadata metadata; + metadata.set_op_type(op_kernel->type_string()); + metadata.set_op_name(op_kernel->name()); + b->SetOpMetadata(metadata); + op_kernel->Compute(context); + b->ClearOpMetadata(); + VLOG(2) << "Done"; +} + Status XlaCompilationDevice::Sync() { return Status::OK(); } Status XlaCompilationDevice::MakeTensorFromProto( const TensorProto& tensor_proto, const AllocatorAttributes alloc_attrs, Tensor* tensor) { return errors::InvalidArgument( - "Tla JIT Device should not parse tensor from proto"); + "XLACompilationDevice::MakeTensorFromProto should not be called"); } -// Is platform 'id' supported by XLA? -static bool IsPlatformSupported(perftools::gputools::Platform::Id id) { - auto platform = perftools::gputools::MultiPlatformManager::PlatformWithId(id); - if (!platform.ok()) return false; - return xla::ClientLibrary::GetOrCreateLocalClient(platform.ValueOrDie()).ok(); +XlaExpression::XlaExpression() = default; + +void XlaExpression::set_handle(const xla::ComputationDataHandle& h) { + handle_ = h; } -XlaOpRegistry::XlaOpRegistry() = default; -XlaOpRegistry::~XlaOpRegistry() = default; - -/* static */ void XlaOpRegistry::RegisterJitDevice( - const string& device_name, const string& jit_device_name, - bool requires_jit) { - XlaOpRegistry& registry = Instance(); - mutex_lock lock(registry.mutex_); - auto result = registry.jit_devices_.emplace( - device_name, std::make_pair(jit_device_name, requires_jit)); - CHECK(result.second || result.first->second.first == jit_device_name); -} - -/* static */ bool XlaOpRegistry::GetJitDevice(const string& device_name, - const string** jit_device_name, - bool* requires_jit) { - XlaOpRegistry& registry = Instance(); - - // Lazily register the CPU and GPU JIT devices the first time GetJitDevice is - // called. 
- static void* registration = [&registry]() { - mutex_lock lock(registry.mutex_); - if (IsPlatformSupported(perftools::gputools::host::kHostPlatformId)) { - registry.jit_devices_[DEVICE_CPU] = {DEVICE_CPU_XLA_JIT, false}; - } - if (IsPlatformSupported(perftools::gputools::cuda::kCudaPlatformId)) { - registry.jit_devices_[DEVICE_GPU] = {DEVICE_GPU_XLA_JIT, false}; - } - return nullptr; - }(); - (void)registration; - - mutex_lock lock(registry.mutex_); - auto it = registry.jit_devices_.find(device_name); - if (it == registry.jit_devices_.end()) return false; - if (jit_device_name) *jit_device_name = &it->second.first; - if (requires_jit) *requires_jit = it->second.second; - return true; -} - -void XlaOpRegistry::RegisterJitKernels() { - XlaOpRegistry& registry = Instance(); - mutex_lock lock(registry.mutex_); - - if (registry.jit_kernels_registered_) return; - registry.jit_kernels_registered_ = true; - - for (const auto& entry : registry.kernels_) { - for (const XlaKernel& k : entry.second) { - auto it = registry.ops_.find(k.kernel_def->op()); - CHECK(it != registry.ops_.end()) << "Missing XLA op registration for op " - << k.kernel_def->op(); - registry.kernel_registrars_.emplace_back( - new kernel_factory::OpKernelRegistrar(new KernelDef(*k.kernel_def), - "XlaJitOp", it->second)); - } - } -} - -std::vector<const KernelDef*> XlaOpRegistry::DeviceKernels( - const string& jit_device_type) { - std::vector<const KernelDef*> kernels; - XlaOpRegistry& registry = Instance(); - mutex_lock lock(registry.mutex_); - for (const XlaKernel& k : registry.kernels_.at(jit_device_type)) { - if (!k.jit_only) { - kernels.push_back(k.kernel_def.get()); - } - } - return kernels; -} - -XlaOpRegistry& XlaOpRegistry::Instance() { - static XlaOpRegistry* r = new XlaOpRegistry; - return *r; -} - -XlaOpRegistrar::XlaOpRegistrar(StringPiece name, - XlaOpRegistry::Factory factory) { - XlaOpRegistry& registry = XlaOpRegistry::Instance(); - mutex_lock lock(registry.mutex_); - CHECK(registry.ops_.emplace(name.ToString(), factory).second) - << "Duplicate XLA op registration " << name; -} - -XlaKernelRegistrar::XlaKernelRegistrar(bool jit_only, const KernelDef* def) { - XlaOpRegistry& registry = XlaOpRegistry::Instance(); - mutex_lock lock(registry.mutex_); - registry.kernels_[def->device_type()].push_back(XlaOpRegistry::XlaKernel{ - jit_only, std::unique_ptr<const KernelDef>(def)}); +void XlaExpression::set_constant_value(Tensor value) { + has_constant_value_ = true; + constant_value_ = std::move(value); } } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/xla_compilation_device.h b/tensorflow/compiler/tf2xla/xla_compilation_device.h index f4b95b874b6..75630bee396 100644 --- a/tensorflow/compiler/tf2xla/xla_compilation_device.h +++ b/tensorflow/compiler/tf2xla/xla_compilation_device.h @@ -16,44 +16,21 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_TF2XLA_XLA_COMPILATION_DEVICE_H_ #define TENSORFLOW_COMPILER_TF2XLA_XLA_COMPILATION_DEVICE_H_ -#include #include -#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/common_runtime/local_device.h" #include "tensorflow/core/framework/device_base.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/mem.h" -#include "tensorflow/core/platform/mutex.h" -#include "tensorflow/core/platform/thread_annotations.h" #include "tensorflow/core/public/session_options.h" namespace tensorflow { -// Names of the XLA JIT devices.
These are not user-visible, and are used -// internally by the JIT to perform symbolic execution of a Tensorflow graph. - -extern const char* const DEVICE_CPU_XLA_JIT; // "CPU_XLA_JIT" -extern const char* const DEVICE_GPU_XLA_JIT; // "GPU_XLA_JIT" - -constexpr std::array<DataType, 5> kCpuAllTypes = { - {DT_INT32, DT_INT64, DT_FLOAT, DT_DOUBLE, DT_BOOL}}; -constexpr std::array<DataType, 2> kCpuIntTypes = {{DT_INT32, DT_INT64}}; -constexpr std::array<DataType, 2> kCpuFloatTypes = {{DT_FLOAT, DT_DOUBLE}}; -constexpr std::array<DataType, 4> kCpuNumericTypes = { - {DT_INT32, DT_INT64, DT_FLOAT, DT_DOUBLE}}; - -constexpr std::array<DataType, 5> kGpuAllTypes = { - {DT_INT32, DT_INT64, DT_FLOAT, DT_DOUBLE, DT_BOOL}}; -constexpr std::array<DataType, 2> kGpuIntTypes = {{DT_INT32, DT_INT64}}; -constexpr std::array<DataType, 2> kGpuFloatTypes = {{DT_FLOAT, DT_DOUBLE}}; -constexpr std::array<DataType, 4> kGpuNumericTypes = { - {DT_INT32, DT_INT64, DT_FLOAT, DT_DOUBLE}}; - -// Class is declared and defined in tla_jit_device.cc, reference +// Class is defined in xla_compilation_device.cc, reference // included here only so the XlaCompilationDevice allocator_ member can be -// defined. +// declared. class XlaCompilationAllocator; // Deliberately don't register the device factory because we *never* @@ -75,6 +52,8 @@ class XlaCompilationDevice : public LocalDevice { Allocator* GetAllocator(AllocatorAttributes attr) override; + void Compute(OpKernel* op_kernel, OpKernelContext* context) override; + Status Sync() override; Status MakeTensorFromProto(const TensorProto& tensor_proto, @@ -85,130 +64,75 @@ class XlaCompilationDevice : public LocalDevice { std::unique_ptr<XlaCompilationAllocator> allocator_; }; -// Class that manages registrations of operators and devices for the XLA JIT. -// Not thread-safe. -class XlaOpRegistry { +struct XlaVariable { + // If this variable is visible externally, what was its argument number? + int arg_num = -1; + + // A descriptive name for the variable, used in error messages. + string name; + + // Current type and value of the variable. Uninitialized variables are + // represented by a default (zero) handle and type DT_INVALID. + // While the type of a variable is notionally fixed during execution, when + // a variable is first initialized we do not yet know its type, so we keep + // track of its type dynamically. + DataType type = DT_INVALID; + xla::ComputationDataHandle value; + + // Value of the variable at computation entry. Used to detect which + // variables have new values that need to be written back. + xla::ComputationDataHandle initial_value; + + // We treat TensorArrays as a Variable with some extra metadata. + + // 'tensor_array_size' stores the expected size of the TensorArray. We need + // to store this because TensorArrays are sometimes initialized lazily, as we + // do not know the element shape at construction time. + int64 tensor_array_size = -1; + + // 'tensor_array_gradient' is a map from TensorArrayGradV3 'source' attributes + // to an XlaVariable containing the gradient TensorArrays. We store a pointer + // here since there should only be one gradient TensorArray per 'source' + // string, irrespective of the number of calls to TensorArrayGrad. + std::unordered_map<string, XlaVariable*> tensor_array_gradient; +}; + +// An XlaExpression wraps an XLA computation. Each Tensor on an +// XlaCompilationDevice contains an XlaExpression, and the shape of the Tensor +// matches the shape of the subcomputation in the ComputationDataHandle. Each +// expression is either a constant, or a function of previously-compiled +// expressions. 
+class XlaExpression { public: - typedef OpKernel* (*Factory)(OpKernelConstruction*); + XlaExpression(); - // Registers 'jit_device_name' as the JIT device corresponding to - // 'device_name'. If 'requires_jit' is true, then operators placed on this - // device must be JIT-compiled. Dies if a conflicting registration already - // exists. - static void RegisterJitDevice(const string& device_name, - const string& jit_device_name, - bool requires_jit); + // handle() stores the XLA handle of the computation that the + // expression represents. + void set_handle(const xla::ComputationDataHandle& h); + const xla::ComputationDataHandle& handle() const { return handle_; } - // Returns the JIT device name associated with 'device_name', setting - // 'jit_device_name' and 'requires_jit', if they are not null. Returns false - // and leaves 'jit_device_name' and 'requires_jit' unchanged if no matching - // JIT device is registered. - static bool GetJitDevice(const string& device_name, - const string** jit_device_name, bool* requires_jit); + void set_constant_value(Tensor value); + bool has_constant_value() const { return has_constant_value_; } + const Tensor& constant_value() const { return constant_value_; } - // Registers all JIT kernels on JIT devices, if not already registered. - // Does nothing otherwise. - static void RegisterJitKernels(); - - // Returns KernelDefs for JIT ops registered on 'jit_device_type'. - // Does not include kernels registered using REGISTER_XLA_JIT_ONLY_KERNEL. - static std::vector<const KernelDef*> DeviceKernels( - const string& jit_device_type); + void set_variable(XlaVariable* variable) { variable_ = variable; } + XlaVariable* variable() const { return variable_; } private: - friend class XlaKernelRegistrar; - friend class XlaOpRegistrar; + // The XLA handle of the expression's computation. + xla::ComputationDataHandle handle_; - static XlaOpRegistry& Instance(); + // If this expression is a constant with a known value, 'constant_value' is a + // host-memory Tensor containing the value. Used to avoid invoking XLA for + // expressions that are trivially constant. + bool has_constant_value_ = false; + Tensor constant_value_; - XlaOpRegistry(); - ~XlaOpRegistry(); + XlaVariable* variable_ = nullptr; // Not owned. - mutex mutex_; - - // Map from Tensorflow device names to the corresponding JIT device names. - std::unordered_map<string, std::pair<string, bool>> jit_devices_ - GUARDED_BY(mutex_); - - // Map from operator name to OpKernel factory, populated by REGISTER_XLA_OP. - std::unordered_map<string, Factory> ops_ GUARDED_BY(mutex_); - - // Have we already registered the JIT kernels on the JIT devices? - bool jit_kernels_registered_ = false; - - struct XlaKernel { - // Should this kernel be registered only on JIT devices, without a dummy - // kernel registered on the corresponding XLA device? - bool jit_only; - - // KernelDef as built by REGISTER_XLA_KERNEL. - std::unique_ptr<const KernelDef> kernel_def; - }; - - // Map from JIT device name to a vector of XLA kernel descriptors. - std::unordered_map<string, std::vector<XlaKernel>> kernels_ - GUARDED_BY(mutex_); - - // Holds ownership of OpKernelRegistrars that represent the Tensorflow kernel - // registrations created by RegisterJitKernels() and RegisterDeviceKernels(). - std::vector<std::unique_ptr<kernel_factory::OpKernelRegistrar>> - kernel_registrars_ GUARDED_BY(mutex_); + TF_DISALLOW_COPY_AND_ASSIGN(XlaExpression); }; -// REGISTER_XLA_OP() registers an XLA OpKernel by name, for example: -// REGISTER_XLA_OP("Add", AddOp); -// where 'AddOp' is the name of a JIT OpKernel class that implements "Add". 
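The XlaExpression class added above is a tagged value: either a compile-time constant held as a host tensor, or a handle naming a node in the symbolic XLA computation built so far. A minimal standalone analog, not TensorFlow code, with a double standing in for the host Tensor:

```c++
// Standalone analog of the XlaExpression added above: a value that is either
// a known compile-time constant or a handle naming a node of the symbolic
// computation built so far.
#include <cassert>

struct Handle { long id = 0; };  // Stand-in for xla::ComputationDataHandle.

class Expression {
 public:
  void set_handle(Handle h) { handle_ = h; }
  Handle handle() const { return handle_; }

  void set_constant_value(double v) {  // The real code stores a host Tensor.
    has_constant_value_ = true;
    constant_value_ = v;
  }
  bool has_constant_value() const { return has_constant_value_; }
  double constant_value() const {
    assert(has_constant_value_);
    return constant_value_;
  }

 private:
  Handle handle_;
  bool has_constant_value_ = false;
  double constant_value_ = 0.0;
};

int main() {
  Expression e;
  e.set_constant_value(3.0);  // Constant-folded results can skip XLA entirely.
  assert(e.has_constant_value() && e.constant_value() == 3.0);
}
```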
-// -// We don't use a variadic macro here because we don't expect JIT operators to -// be templated. - -#define REGISTER_XLA_OP(NAME, OP) \ - REGISTER_XLA_OP_UNIQ_HELPER(__COUNTER__, NAME, OP) - -// REGISTER_XLA_KERNEL() associates an XLA OpKernel with a particular device and -// set of type constraints, e.g., -// REGISTER_XLA_KERNEL(DEVICE_XLA_CPU_JIT, -// Name("Relu").TypeConstraint("T", DT_FLOAT)); -// -// REGISTER_XLA_JIT_ONLY_KERNEL is similar to REGISTER_XLA_KERNEL(), but causes -// XlaOpRegistry::RegisterDeviceKernels() to ignore the kernel. - -#define REGISTER_XLA_KERNEL(DEVICE, BUILDER) \ - REGISTER_XLA_KERNEL_UNIQ_HELPER(__COUNTER__, DEVICE, BUILDER, false) - -#define REGISTER_XLA_JIT_ONLY_KERNEL(DEVICE, BUILDER) \ - REGISTER_XLA_KERNEL_UNIQ_HELPER(__COUNTER__, DEVICE, BUILDER, true) - -// Implementation details. - -class XlaOpRegistrar { - public: - XlaOpRegistrar(StringPiece name, XlaOpRegistry::Factory factory); -}; - -#define REGISTER_XLA_OP_UNIQ_HELPER(COUNTER, NAME, OP) \ - REGISTER_XLA_OP_UNIQ(COUNTER, NAME, OP) - -#define REGISTER_XLA_OP_UNIQ(CTR, NAME, OP) \ - static ::tensorflow::XlaOpRegistrar xla_op_registrar__body__##CTR##__object( \ - NAME, [](::tensorflow::OpKernelConstruction* context) \ - -> ::tensorflow::OpKernel* { return new OP(context); }); - -// Implementation details. -class XlaKernelRegistrar { - public: - XlaKernelRegistrar(bool jit_only, const KernelDef* def); -}; - -#define REGISTER_XLA_KERNEL_UNIQ_HELPER(COUNTER, DEVICE, BUILDER, JIT_ONLY) \ - REGISTER_XLA_KERNEL_UNIQ(COUNTER, DEVICE, BUILDER, JIT_ONLY) - -#define REGISTER_XLA_KERNEL_UNIQ(CTR, DEVICE, BUILDER, JIT_ONLY) \ - static ::tensorflow::XlaKernelRegistrar \ - xla_kernel_registrar__body__##CTR##__object( \ - JIT_ONLY, \ - ::tensorflow::register_kernel::BUILDER.Device(DEVICE).Build()); - } // namespace tensorflow #endif // TENSORFLOW_COMPILER_TF2XLA_XLA_COMPILATION_DEVICE_H_ diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc index 17adb9b1fdd..580ce3d802e 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler.cc @@ -27,6 +27,7 @@ limitations under the License. 
#include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/common_runtime/graph_optimizer.h" #include "tensorflow/core/framework/attr_value_util.h" +#include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/graph/node_builder.h" #include "tensorflow/core/lib/hash/hash.h" @@ -37,36 +38,18 @@ namespace tensorflow { namespace { -Status CheckSignature(const DataTypeVector& tf_types, - const xla::Shape& xla_shape) { - if (xla::ShapeUtil::IsTuple(xla_shape)) { - if (xla::ShapeUtil::TupleElementCount(xla_shape) != tf_types.size()) { - return errors::Internal("XLA shape has ", - xla::ShapeUtil::TupleElementCount(xla_shape), - " elements while function has ", tf_types.size()); - } - for (int i = 0; i < tf_types.size(); ++i) { - xla::PrimitiveType type; - TF_RETURN_IF_ERROR(DataTypeToPrimitiveType(tf_types[i], &type)); - if (type != - xla::ShapeUtil::GetTupleElementShape(xla_shape, i).element_type()) { - return errors::Internal( - "element ", i, " has XLA type ", - xla::ShapeUtil::GetTupleElementShape(xla_shape, i).element_type(), - " and TensorFlow type ", DataTypeString(tf_types[i])); - } - } - } else { - if (tf_types.size() != 1) { - return errors::Internal("Expected singleton type, got ", tf_types.size(), - " types"); - } - xla::PrimitiveType type; - TF_RETURN_IF_ERROR(DataTypeToPrimitiveType(tf_types[0], &type)); - if (type != xla_shape.element_type()) { - return errors::Internal("singleton element has XLA type ", - xla_shape.element_type(), " and TensorFlow type ", - DataTypeString(tf_types[0])); +// Checks that arguments `args` match types `types`. +Status CheckSignature(const DataTypeVector& types, + const std::vector& args) { + if (args.size() != types.size()) { + return errors::Internal("Compilation arguments have ", args.size(), + " elements while function has ", types.size()); + } + for (int i = 0; i < types.size(); ++i) { + if (types[i] != args[i].type && types[i] != DT_RESOURCE) { + return errors::Internal( + "Argument ", i, " has declared type ", DataTypeString(args[i].type), + " but function parameter has type ", DataTypeString(types[i])); } } return Status::OK(); @@ -74,15 +57,39 @@ Status CheckSignature(const DataTypeVector& tf_types, } // namespace -XlaCompiler::XlaCompiler(const XlaCompiler::Options& options) - : client_(options.client), - allow_cpu_custom_calls_(options.allow_cpu_custom_calls), - local_executable_has_hybrid_result_( - options.local_executable_has_hybrid_result), - resolve_compile_time_constants_(options.resolve_compile_time_constants), +bool XlaCompiler::Argument::operator==( + const XlaCompiler::Argument& other) const { + if (std::tie(kind, type, shape, name, tensor_array_size) != + std::tie(other.kind, other.type, other.shape, other.name, + other.tensor_array_size)) { + return false; + } + if (constant_value.shape() != other.constant_value.shape()) { + return false; + } + return constant_value.tensor_data() == other.constant_value.tensor_data(); +} + +XlaCompiler::XlaCompiler(XlaCompiler::Options options) + : options_(options), + initialization_status_(Status::OK()), next_step_id_(1), - device_(new XlaCompilationDevice(SessionOptions(), options.device_type)), - device_mgr_({device_}) {} + device_( + new XlaCompilationDevice(SessionOptions(), *options_.device_type)), + device_mgr_({device_}) { + // We no longer need the device_type. 
+ options_.device_type = nullptr; + + if (options_.populate_resource_manager) { + initialization_status_ = + (*options_.populate_resource_manager)(device_->resource_manager()); + } + + flib_runtime_.reset(NewFunctionLibraryRuntime( + &device_mgr_, Env::Default(), device_, options.graph_def_version, + options.flib_def, OptimizerOptions(), + nullptr /* custom_kernel_creator */)); +} XlaCompiler::~XlaCompiler() = default; @@ -91,102 +98,63 @@ int64 XlaCompiler::NextStepId() { return next_step_id_++; } +uint64 XlaCompiler::SignatureHash::operator()( + const std::pair>& signature) const { + return std::hash()(signature.first); +} + Status XlaCompiler::CompileFunction( - FunctionLibraryRuntime* flr, const NameAttrList& function, + const XlaCompiler::CompileOptions& options, const NameAttrList& function, const std::vector& args, XlaCompiler::CompilationResult* result) { - const string function_id = Canonicalize(function.name(), function.attr()); + const string function_id = + Canonicalize(function.name(), AttrSlice(&function.attr())); VLOG(1) << "XlaCompiler::CompileFunction " << function_id; - FunctionLibraryRuntime::Handle handle; - TF_RETURN_IF_ERROR( - flr->Instantiate(function.name(), function.attr(), &handle)); - - const FunctionBody* fbody = flr->GetFunctionBody(handle); - CHECK(fbody); - - return CompileFunctionBody(flr, *fbody, function_id, args, - /*use_tuple_arg=*/false, result); -} - -Status XlaCompiler::CompileSubComputation(FunctionLibraryRuntime* flr, - const NameAttrList& function, - const xla::Shape& input_shape, - const xla::Shape& output_shape, - xla::Computation* computation) { - const string function_id = Canonicalize(function.name(), function.attr()); - VLOG(1) << "XlaCompiler::CompileSubComputation " << function_id; - - FunctionLibraryRuntime::Handle handle; - TF_RETURN_IF_ERROR( - flr->Instantiate(function.name(), function.attr(), &handle)); - - const FunctionBody* fbody = flr->GetFunctionBody(handle); - CHECK(fbody); - - TF_RETURN_IF_ERROR(CheckSignature(fbody->arg_types, input_shape)); - TF_RETURN_IF_ERROR(CheckSignature(fbody->ret_types, output_shape)); - - const bool use_tuple_arg = xla::ShapeUtil::IsTuple(input_shape); - - std::vector args(fbody->arg_types.size()); - if (use_tuple_arg) { - for (int i = 0; i < args.size(); ++i) { - xla::Shape xla_shape = - xla::ShapeUtil::GetTupleElementShape(input_shape, i); - args[i].type = fbody->arg_types[i]; - args[i].shape = XLAShapeToTensorShape(xla_shape); - args[i].parameter = i; - } - } else { - args[0].type = fbody->arg_types[0]; - args[0].shape = XLAShapeToTensorShape(input_shape); - args[0].parameter = 0; + auto it = cache_.find({function_id, args}); + if (it != cache_.end()) { + *result = it->second; + return Status::OK(); } - CompilationResult result; - TF_RETURN_IF_ERROR(CompileFunctionBody(flr, *fbody, function_id, args, - use_tuple_arg, &result)); + FunctionLibraryRuntime::Handle handle; + TF_RETURN_IF_ERROR(flib_runtime_->Instantiate( + function.name(), AttrSlice(&function.attr()), &handle)); - if (!xla::ShapeUtil::Compatible(result.xla_output_shape, output_shape)) { - return errors::Internal("output shape mismatch from compilation"); - } - *computation = std::move(result.computation); + const FunctionBody* fbody = flib_runtime_->GetFunctionBody(handle); + CHECK(fbody); - return Status::OK(); -} + TF_RETURN_IF_ERROR(CheckSignature(fbody->arg_types, args)); -Status XlaCompiler::CompileFunctionBody( - FunctionLibraryRuntime* flr, const FunctionBody& fbody, - const string& function_id, const std::vector& args, - bool 
use_tuple_arg, XlaCompiler::CompilationResult* result) { - VLOG(1) << "XlaCompiler::CompileFunctionBody " << function_id; - - std::unique_ptr graph(new Graph(flr->GetFunctionLibraryDefinition())); - CopyGraph(*fbody.graph, graph.get()); + std::unique_ptr graph(new Graph(options_.flib_def)); + CopyGraph(*fbody->graph, graph.get()); if (VLOG_IS_ON(1)) { dump_graph::DumpGraphToFile( - strings::StrCat("xla_jit_raw_input_", function_id), *graph); + strings::StrCat("xla_compile_function_input_", function_id), *graph); } // Optimize the graph before running the compiler. - // TODO(pbar): The constant folder currently does not simplify int32 - // operations for devices other than CPU. OptimizerOptions opts; + opts.set_do_common_subexpression_elimination(true); + opts.set_do_function_inlining(true); + opts.set_do_constant_folding(true); GraphOptimizer optimizer(opts); - OptimizeGraph(flr, &graph); + optimizer.Optimize(flib_runtime_.get(), flib_runtime_->env(), + /*device=*/nullptr, &graph); if (VLOG_IS_ON(1)) { dump_graph::DumpGraphToFile( - strings::StrCat("xla_jit_final_graph_", function_id), *graph); + strings::StrCat("xla_compile_function_optimized_", function_id), + *graph); } VLOG(1) << "===================================================="; - TF_RETURN_IF_ERROR(CompileGraph(function_id, std::move(graph), flr, args, - use_tuple_arg, result)); + TF_RETURN_IF_ERROR( + CompileGraph(options, function_id, std::move(graph), args, result)); VLOG(1) << "===================================================="; + cache_[{function_id, args}] = *result; return Status::OK(); } @@ -199,7 +167,7 @@ Status XlaCompiler::BuildExecutable( std::vector argument_layouts( result.xla_input_shapes.size()); for (int i = 0; i < result.xla_input_shapes.size(); ++i) { - argument_layouts[i] = &result.xla_input_shapes[i].second; + argument_layouts[i] = &result.xla_input_shapes[i]; } if (result.requires_runtime_context) { // The final arg is the XlaLocalRuntimeContext*. @@ -210,9 +178,10 @@ Status XlaCompiler::BuildExecutable( build_options.set_device_ordinal(local_client->default_device_ordinal()); build_options.set_platform(local_client->platform()); build_options.set_result_layout(result.xla_output_shape); - build_options.set_has_hybrid_result(local_executable_has_hybrid_result_); + build_options.set_has_hybrid_result( + options_.local_executable_has_hybrid_result); - auto compile_result = local_client->Compile(result.computation, + auto compile_result = local_client->Compile(*result.computation, argument_layouts, build_options); if (!compile_result.ok()) { return compile_result.status(); @@ -256,24 +225,12 @@ Status ExecuteGraph(XlaContext* xla_context, std::unique_ptr graph, std::unique_ptr exec(exec_ptr); // At this point ownership of the graph has been transferred to exec. - auto runner = [](Executor::Args::Closure c) { - // TODO(misard) Temporarily just schedule c eagerly while we - // decide what to do about the fact that the ComputationBuilder is - // thread-compatible, but we don't really want Op writers to have - // to remember to acquire a lock around every call to - // ComputationBuilder. One possibility is to add the (generally - // useful) ability to run a single-threaded Executor based on an - // option in LocalExecutorParams. Another is to automagically - // acquire a lock around ComputationBuilder calls using some - // wrapper or RAII funny business. - c(); - }; - // Run the graph symbolically, turning the graph into an XLA computation. 
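The rewritten CompileFunction a few hunks above memoizes compilation results in cache_, keyed by the pair (canonicalized function name, argument list). SignatureHash hashes only the name, which is cheap and still valid because the hash container resolves collisions with the full key comparison, including Argument::operator==. A standalone analog of that cache design, not TensorFlow code:

```c++
// Standalone analog of the compilation cache used by CompileFunction above:
// the hash covers only the function name, and the full (name, argument-list)
// equality check resolves collisions, mirroring SignatureHash and
// Argument::operator==.
#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

struct Argument {
  int type = 0;             // Stand-in for DataType.
  std::vector<long> shape;  // Stand-in for TensorShape.
  bool operator==(const Argument& o) const {
    return type == o.type && shape == o.shape;
  }
};

using Signature = std::pair<std::string, std::vector<Argument>>;

struct SignatureHash {
  std::size_t operator()(const Signature& s) const {
    return std::hash<std::string>()(s.first);  // Name only; cheap but valid.
  }
};

int main() {
  std::unordered_map<Signature, std::string, SignatureHash> cache;
  Signature key{"my_fn", {{1, {2, 3}}}};
  cache[key] = "compiled-result";
  // Same name but different shapes: a hash collision, disambiguated by ==.
  Signature other{"my_fn", {{1, {4, 4}}}};
  std::cout << cache.count(key) << " " << cache.count(other) << "\n";  // 1 0
}
```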
Executor::Args exec_args; exec_args.step_id = step_id; exec_args.step_container = step_container.get(); - exec_args.runner = runner; + // Run all compilation kernels on the main thread. + exec_args.runner = [](Executor::Args::Closure c) { c(); }; TF_RETURN_WITH_CONTEXT_IF_ERROR( exec->Run(exec_args), "Conversion from TensorFlow graph to XLA computation failed."); @@ -283,84 +240,245 @@ Status ExecuteGraph(XlaContext* xla_context, std::unique_ptr graph, return cleanup_status; } +// Builds XLA computations for each of the arguments to the computation. +// `args` are the arguments to the computation. +Status BuildArguments(const std::vector& args, + bool use_tuple_arg, xla::ComputationBuilder* builder, + std::vector* context_args, + std::vector* input_mapping, + std::vector* input_shapes) { + context_args->resize(args.size()); + + // Argument numbers of arguments and variables that are to be passed to the + // XLA computation as runtime parameters. + std::vector parameters, variables; + parameters.reserve(args.size()); + variables.reserve(args.size()); + + for (std::vector::size_type i = 0; i < args.size(); + ++i) { + XlaContext::Argument& context_arg = (*context_args)[i]; + context_arg.name = args[i].name; + context_arg.value.constant_value = args[i].constant_value; + context_arg.value.type = args[i].type; + + switch (args[i].kind) { + case XlaCompiler::Argument::kVariable: + variables.push_back(i); + context_arg.is_variable = true; + context_arg.value.is_constant = false; + context_arg.tensor_array_size = args[i].tensor_array_size; + break; + case XlaCompiler::Argument::kParameter: + parameters.push_back(i); + context_arg.value.is_constant = false; + break; + case XlaCompiler::Argument::kUninitializedVariable: + context_arg.is_variable = true; + context_arg.value.is_constant = true; + context_arg.tensor_array_size = args[i].tensor_array_size; + break; + case XlaCompiler::Argument::kConstant: + context_arg.value.is_constant = true; + break; + case XlaCompiler::Argument::kInvalid: + return errors::Internal("Unreachable case in BuildArguments()"); + } + } + + // Append parameters containing variable values after the other runtime + // parameters. + parameters.insert(parameters.end(), variables.begin(), variables.end()); + if (parameters.empty()) { + return Status::OK(); + } + + input_shapes->resize(parameters.size()); + input_mapping->resize(parameters.size()); + for (std::vector::size_type i = 0; i < input_shapes->size(); ++i) { + const XlaCompiler::Argument& arg = args[parameters[i]]; + // Computes the shapes of non-constant arguments. + xla::PrimitiveType type; + TF_RETURN_IF_ERROR(DataTypeToPrimitiveType(arg.type, &type)); + xla::ShapeUtil::PopulateShape(type, arg.shape.dim_sizes(), + &(*input_shapes)[i]); + (*input_mapping)[i] = parameters[i]; + } + + if (use_tuple_arg) { + xla::Shape tuple_shape = xla::ShapeUtil::MakeTupleShape(*input_shapes); + xla::ComputationDataHandle tuple = + builder->Parameter(0, tuple_shape, "arg_tuple"); + for (std::vector::size_type i = 0; i < input_shapes->size(); ++i) { + (*context_args)[parameters[i]].value.handle = + builder->GetTupleElement(tuple, i); + } + } else { + for (std::vector::size_type i = 0; i < input_shapes->size(); ++i) { + (*context_args)[parameters[i]].value.handle = + builder->Parameter(i, (*input_shapes)[i], strings::StrCat("arg", i)); + } + } + return Status::OK(); +} + +// Builds the XLA computation. +// +// `retvals` is the list of retvals produced by _Retval operators, in index +// order. 
`variables` is the list of XlaVariable +// states generated by the symbolic evaluation. +// If `has_side_effects` is true, the computation has side effects and should be +// built even if it has no outputs. +// If `return_updated_values_for_all_variables` is true, all variables will be +// included in `variable_updates`, regardless of whether their value changed. +// Sets `*num_nonconst_outputs` to the number of outputs of the `computation`. +// Sets `*variable_updates` to a description of variables whose values are +// written by the computation; the variable writes are the last +// `variable_updates.size()` return values from the computation. Each entry in +// `variable_updates` is a (input_index, type) pair, where `input_index` is the +// index of a resource variable argument to the computation, and `type` is the +// type of the final output. +Status BuildComputation( + const std::vector<XlaContext::HandleOrConstant>& retvals, + const std::vector<std::unique_ptr<XlaVariable>>& variables, + bool has_side_effects, bool return_updated_values_for_all_variables, + xla::ComputationBuilder* builder, xla::Computation* computation, + int* num_nonconst_outputs, + std::vector<XlaCompiler::VariableUpdate>* variable_updates) { + std::vector<xla::ComputationDataHandle> elems; + elems.reserve(retvals.size()); + for (const XlaContext::HandleOrConstant& retval : retvals) { + if (!retval.is_constant) { + elems.push_back(retval.handle); + } + } + *num_nonconst_outputs = elems.size(); + + // Add return values for variables whose values have changed. + std::vector<XlaVariable*> arg_vars; + arg_vars.reserve(variables.size()); + for (const auto& var : variables) { + if (var->arg_num >= 0) { + arg_vars.push_back(var.get()); + } + } + std::sort(arg_vars.begin(), arg_vars.end(), + [](const XlaVariable* a, const XlaVariable* b) { + return a->arg_num < b->arg_num; + }); + + for (const XlaVariable* var : arg_vars) { + bool modified = var->value.handle() != var->initial_value.handle(); + if (return_updated_values_for_all_variables || modified) { + variable_updates->emplace_back(); + XlaCompiler::VariableUpdate& update = variable_updates->back(); + update.input_index = var->arg_num; + update.type = var->type; + update.modified = modified; + elems.push_back(var->value); + } + } + + if (!elems.empty() || has_side_effects) { + // Builds an empty tuple return value for computations that have side + // effects but have no return values. + xla::ComputationDataHandle handle = builder->Tuple(elems); + + // TODO(b/31775371): to work around a bug, we must build a no-op computation + // that is guaranteed to be constructed after all of the formal parameters + // to the computation. Once the bug is fixed, we could avoid tupling here. + if (elems.size() == 1) { + handle = builder->GetTupleElement(handle, 0); + } + + // Builds the XLA computation. + xla::StatusOr<xla::Computation> computation_status = builder->Build(); + if (!computation_status.ok()) { + return computation_status.status(); + } + *computation = computation_status.ConsumeValueOrDie(); + } + return Status::OK(); +} + } // namespace -Status XlaCompiler::CompileGraph(string const& name, +Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options, + string const& name, std::unique_ptr<Graph> graph, - FunctionLibraryRuntime* flib, const std::vector<Argument>& args, - bool use_tuple_arg, CompilationResult* result) { VLOG(1) << "Executing graph symbolically to populate ComputationBuilder."; - // Converts the input shapes into xla::Shape instances. 
- result->xla_input_shapes.reserve(args.size()); - for (int i = 0; i < args.size(); ++i) { - if (args[i].parameter < 0) { - continue; - } - result->xla_input_shapes.push_back(std::make_pair(i, xla::Shape())); - TF_RETURN_IF_ERROR(TensorShapeToXLAShape( - args[i].type, args[i].shape, &result->xla_input_shapes.back().second)); - } + // Report the error here if initialization failed. + TF_RETURN_IF_ERROR(initialization_status_); - XlaContext* xla_context = - new XlaContext(this, client(), name, allow_cpu_custom_calls_, - resolve_compile_time_constants_); - core::ScopedUnref xla_context_unref(xla_context); + xla::ComputationBuilder builder(client(), name); + XlaContext* context = + new XlaContext(this, &builder, options_.allow_cpu_custom_calls, + options_.resolve_compile_time_constants); + core::ScopedUnref context_unref(context); - TF_RETURN_IF_ERROR(xla_context->BuildArguments(args, use_tuple_arg)); + result->tuple_arg = options.use_tuple_arg; - TF_RETURN_IF_ERROR( - ExecuteGraph(xla_context, std::move(graph), device_, flib, NextStepId())); + std::vector context_args; + TF_RETURN_IF_ERROR(BuildArguments(args, options.use_tuple_arg, &builder, + &context_args, &result->input_mapping, + &result->xla_input_shapes)); + context->set_args(std::move(context_args)); + + TF_RETURN_IF_ERROR(ExecuteGraph(context, std::move(graph), device_, + flib_runtime_.get(), NextStepId())); - std::vector compile_time_constants; int num_nonconst_outputs; - TF_RETURN_IF_ERROR(xla_context->CollectResults( - &result->computation, &result->requires_runtime_context, - &compile_time_constants, &num_nonconst_outputs)); + result->computation = std::make_shared(); + TF_RETURN_IF_ERROR(BuildComputation( + context->retvals(), context->variables(), context->has_side_effects(), + options.return_updated_values_for_all_variables, &builder, + result->computation.get(), &num_nonconst_outputs, + &result->variable_updates)); - VLOG(2) << "Outputs: constant: " << compile_time_constants.size() + result->requires_runtime_context = context->has_context_parameter(); + + // Tuple arguments and runtime context parameters are incompatible. + CHECK(!(options.use_tuple_arg && result->requires_runtime_context)); + + VLOG(2) << "Outputs: total: " << context->retvals().size() << " nonconstant: " << num_nonconst_outputs; - result->outputs.resize(compile_time_constants.size() + num_nonconst_outputs); - for (const auto& c : compile_time_constants) { - if (!c.status.ok()) { - Status constant_status = c.status; - errors::AppendToMessage(&constant_status, - "Failed evaluating constant XLA return " - "value ", - c.index); - return constant_status; + result->outputs.resize(context->retvals().size()); + for (std::vector::size_type i = 0; + i < context->retvals().size(); ++i) { + const XlaContext::HandleOrConstant& retval = context->retvals()[i]; + if (retval.is_constant) { + OutputDescription& output = result->outputs[i]; + output.shape = retval.constant_value.shape(); + output.is_constant = true; + output.constant_value = retval.constant_value; } - if (c.index >= result->outputs.size()) { - return errors::InvalidArgument("Invalid argument index ", c.index); - } - OutputDescription& output = result->outputs[c.index]; - output.shape = c.value.shape(); - output.is_constant = true; - output.constant_value = c.value; } - if (result->computation.IsNull()) { + if (result->computation->IsNull()) { return Status::OK(); } // Compute the output shapes, if there is a computation with non-constant // outputs. 
- auto computation_shape = client()->GetComputationShape(result->computation); + auto computation_shape = client()->GetComputationShape(*result->computation); if (!computation_shape.ok()) { return computation_shape.status(); } result->xla_output_shape.Swap( computation_shape.ValueOrDie()->mutable_result()); + VLOG(2) << "XLA output shape: " + << xla::ShapeUtil::HumanString(result->xla_output_shape); - auto num_non_constant_outputs = + auto num_computation_outputs = (xla::ShapeUtil::IsTuple(result->xla_output_shape)) ? xla::ShapeUtil::TupleElementCount(result->xla_output_shape) : 1; // Tensorflow expects a major-to-minor order of results. - if (1 == num_non_constant_outputs) { + if (1 == num_computation_outputs) { xla::Shape& s = result->xla_output_shape; auto& minor_to_major = *s.mutable_layout()->mutable_minor_to_major(); minor_to_major.Resize(xla::ShapeUtil::Rank(s), 0); @@ -375,20 +493,37 @@ Status XlaCompiler::CompileGraph(string const& name, // Converts the output shapes to TensorShapes. int computation_output = 0; - for (int i = 0; i < result->outputs.size(); ++i) { - if (!result->outputs[i].is_constant) { - CHECK_LT(computation_output, num_non_constant_outputs); - if (num_non_constant_outputs > 1) { - result->outputs[i].shape = + for (std::vector::size_type i = 0; + i < context->retvals().size(); ++i) { + const XlaContext::HandleOrConstant& retval = context->retvals()[i]; + if (!retval.is_constant) { + CHECK_LT(computation_output, num_computation_outputs); + OutputDescription& output = result->outputs[i]; + output.is_constant = false; + if (num_computation_outputs > 1) { + output.shape = XLAShapeToTensorShape(xla::ShapeUtil::GetTupleElementShape( result->xla_output_shape, computation_output)); } else { - result->outputs[i].shape = - XLAShapeToTensorShape(result->xla_output_shape); + output.shape = XLAShapeToTensorShape(result->xla_output_shape); } ++computation_output; } } + + for (std::vector::size_type i = 0; + i < result->variable_updates.size(); ++i) { + if (num_computation_outputs > 1) { + result->variable_updates[i].shape = + XLAShapeToTensorShape(xla::ShapeUtil::GetTupleElementShape( + result->xla_output_shape, computation_output)); + } else { + CHECK_EQ(0, computation_output); + result->variable_updates[i].shape = + XLAShapeToTensorShape(result->xla_output_shape); + } + ++computation_output; + } return Status::OK(); } @@ -397,7 +532,7 @@ Status XlaCompiler::GetChannelHandle(const string& key, mutex_lock lock(mu_); auto result = channels_.emplace(key, xla::ChannelHandle()); if (result.second) { - TF_ASSIGN_OR_RETURN(result.first->second, client_->CreateChannelHandle()); + TF_ASSIGN_OR_RETURN(result.first->second, client()->CreateChannelHandle()); } *channel = result.first->second; VLOG(1) << "Channel: " << key << " " << channel->DebugString(); diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h index df40af3bbd4..13143055325 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.h +++ b/tensorflow/compiler/tf2xla/xla_compiler.h @@ -21,6 +21,7 @@ limitations under the License. 
#include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/device_mgr.h" #include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/framework/function.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/notification.h" @@ -34,36 +35,95 @@ namespace tensorflow { // It does a symbolic execution of the graph starting from specific input // shapes, using a JIT device to convert operators into XLA computations. // -// It is typically invoked from an `_XlaLaunch` operator once the shapes -// of all input parameters to the computation are known. This is +// XlaCompiler is typically invoked from an `_XlaLaunch` operator once the +// shapes of all input parameters to the computation are known. This is // because the symbolic execution requires known shapes for all operations. +// +// XlaCompiler compiles Tensorflow graphs that received inputs via _Arg nodes, +// and return outputs via _Retval nodes. +// +// The XlaCompiler requires one Argument struct for each _Arg index, that +// describes each argument. Arguments can be compile-time constants +// (kind kConstant), run-time parameters (kind kParameter), or resource +// variables (kinds kVariable and kUninitializedVariable). +// +// Only kParameter and kVariable arguments become runtime parameters to the +// generated XLA computation. The XLA computation will have run-time parameters +// in the following order: +// +---------------------+-----------------------------------------+ +// | kParameter values | Initial values of kVariable arguments | +// +---------------------+-----------------------------------------+ +// Within each block, the arguments are arranged by the _Arg index from which +// they were derived. +// If `Options::requires_runtime_context` is true, then an additional runtime +// context argument is passed as a final argument. +// +// The run-time outputs of the XLA computation are arranged in the following +// order: +// +------------------+-----------------------------------------+ +// | _Retval values | Updated values of kVariable arguments | +// +------------------+-----------------------------------------+ +// _Retval values are ordered by _Retval index, whereas kVariable values are +// ordered by the original _Arg position of the variable. +// +// In both inputs and outputs, kVariable values are placed the end. When +// emitting While loop bodies, we must ensure that the loop body has +// identical input and output signatures. By moving variable values +// to the end of the argument list and using the +// `return_updated_values_for_all_variables` option, we can ensure that the +// input and output values of variables appear at the same positions. + class XlaCompiler { public: // Describes how to derive the value of each _Arg node in the graph/function - // being compiled. Each argument must be either a parameter of the generated - // XLA computation (parameter >= 0), or a compile time constant - // (parameter < 0). + // being compiled. There must be one Argument for each _Arg index. struct Argument { - // The type of the argument. + enum Kind { + // Default value; not a valid kind. + kInvalid, + + // Argument is a compile-time constant. No associated runtime parameter. + kConstant, + + // Argument is a variable that has not been initialized yet. No associated + // runtime parameter. + kUninitializedVariable, + + // Argument is a variable that already has a value set. 
+
 class XlaCompiler {
  public:
   // Describes how to derive the value of each _Arg node in the graph/function
-  // being compiled. Each argument must be either a parameter of the generated
-  // XLA computation (parameter >= 0), or a compile time constant
-  // (parameter < 0).
+  // being compiled. There must be one Argument for each _Arg index.
   struct Argument {
-    // The type of the argument.
+    enum Kind {
+      // Default value; not a valid kind.
+      kInvalid,
+
+      // Argument is a compile-time constant. No associated runtime parameter.
+      kConstant,
+
+      // Argument is a variable that has not been initialized yet. No
+      // associated runtime parameter.
+      kUninitializedVariable,
+
+      // Argument is a variable that already has a value set. Expects a
+      // runtime parameter containing the current value.
+      kVariable,
+
+      // Argument is a run-time parameter.
+      kParameter,
+    };
+
+    Kind kind = kInvalid;
+
+    // The type of the argument. If the argument is a resource variable, this
+    // is the type of the variable's value, not DT_RESOURCE.
     DataType type;
 
-    // The shape of the argument.
+    // The shape of the argument. If the argument is a resource variable, this
+    // is the shape of the variable's value.
     TensorShape shape;
 
-    // The parameter number of this argument to the XLA computation. < 0
-    // means this is a compile-time constant argument.
-    int parameter;
-
     // The value of the argument, if it is a compile-time constant. Must be a
     // host-memory tensor.
     Tensor constant_value;
 
     // The name of this argument, used for debugging.
     string name;
+
+    // For a kVariable or kUninitializedVariable corresponding to a
+    // TensorArray, what is the tensor array's declared size?
+    int64 tensor_array_size = -1;
+
+    bool operator==(const Argument& other) const;
   };
 
   struct OutputDescription {
-    // Shape of the output.
+    // Type and shape of the output.
+    DataType type;
     TensorShape shape;
 
     // Constant output value, if known to be constant at JIT compilation time.
@@ -72,37 +132,69 @@ class XlaCompiler {
     Tensor constant_value;
   };
 
+  // Describes a variable write side effect of the computation.
+  struct VariableUpdate {
+    // Index of the input that contains the variable resource to write to.
+    int input_index;
+
+    // Type and shape of the tensor to be written back.
+    DataType type;
+    TensorShape shape;
+
+    // Was the value of the variable modified by the computation?
+    // (Always true, unless `return_updated_values_for_all_variables` is true.)
+    bool modified;
+  };
+
   struct CompilationResult {
-    // Vector of (Tensorflow input number, XLA shape) pairs that describe
-    // the arguments of the compiled XLA computation. (Because of constant
-    // inputs, the arguments to the XLA computation are a subset of the
-    // inputs passed to the JIT.)
-    std::vector<std::pair<int, xla::Shape>> xla_input_shapes;
+    // Vector that maps from the parameters of the XLA computation to their
+    // original argument positions. To handle compile-time constant inputs and
+    // variables, the parameters to the XLA computation may be a subset of the
+    // original arguments, and are not necessarily in the same order.
+    std::vector<int> input_mapping;
 
     // Does the computation require the local runtime context to be passed as
     // the last argument?
     bool requires_runtime_context = false;
 
-    // Output shape in XLA format. This is a tuple if and only if
-    // there are multiple non-constant outputs.
+    // Input shapes of the computation.
+    std::vector<xla::Shape> xla_input_shapes;
+
+    // Should the arguments be packed into a single tuple?
+    bool tuple_arg;
+
+    // Output shape in XLA format. The output shape is a tuple if and only if
+    // the number of non-constant outputs is not equal to 1.
     xla::Shape xla_output_shape;
 
     // TensorFlow shapes of outputs, together with the values of any
     // constant arguments. Vector indexed by Tensorflow _Retval number,
-    // containing both constant and non-constant arguments.
+    // containing both constant and non-constant results.
     std::vector<OutputDescription> outputs;
 
+    // Variables whose values were updated by the computation, ordered
+    // by return value position. Variable updates follow the non-constant
+    // results in the outputs of the XLA computation.
+    std::vector<VariableUpdate> variable_updates;
+
     // The XLA computation built from the tensorflow subgraph. May be null
     // if the output consists solely of compile-time constants.
-    xla::Computation computation;
+    std::shared_ptr<xla::Computation> computation;
   };
 
   struct Options {
-    // Name of the compilation device to use.
-    DeviceType device_type = DeviceType("");
+    // Name of the compilation device to use. Needs to be live only during
+    // XlaCompiler's constructor.
+    const DeviceType* device_type = nullptr;
 
     xla::Client* client = nullptr;
 
+    // Function library in which to find function definitions. Must be
+    // non-null.
+    const FunctionLibraryDefinition* flib_def = nullptr;
+
+    // The graph def version to be compiled.
+    int graph_def_version = TF_GRAPH_DEF_VERSION;
+
     // If 'allow_cpu_custom_calls' is true, kernels may make use of CustomCall()
     // for CPU; additionally, an optional XlaLocalRuntimeContext* may be passed
     // to the computation.
@@ -119,22 +211,43 @@ class XlaCompiler {
     // as Tensors at compile-time, rather than as run-time outputs of the
     // computation.
     bool resolve_compile_time_constants = true;
+
+    // If not nullptr, populate_resource_manager is called with the
+    // compilation device's resource manager when the compilation
+    // device is created, and can be used to create metadata objects
+    // that can be accessed by XLA op kernels.
+    std::function<Status(ResourceMgr*)>* populate_resource_manager = nullptr;
   };
 
-  explicit XlaCompiler(const Options& options);
+  explicit XlaCompiler(Options options);
 
   ~XlaCompiler();
 
+  // Options pertaining to an individual call to CompileGraph() or
+  // CompileFunction().
+  struct CompileOptions {
+    // If `use_tuple_arg` is true, a single tuple parameter will be used for
+    // all arguments; if false, each argument gets its own parameter.
+    bool use_tuple_arg = false;
+
+    // If 'return_updated_values_for_all_variables' is true, then updated
+    // values of all resource variable arguments will be included in the
+    // 'variable_updates' of the computation, even if the variable was not
+    // modified by the computation. Used when compiling loop bodies to ensure
+    // the input and output signatures match.
+    bool return_updated_values_for_all_variables = false;
+  };
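As a sketch of the intended use of `CompileOptions` (not part of this change): compiling a While loop body so that its output signature mirrors its input signature. The helper name `CompileWhileBody` is hypothetical, and `compiler`, `body_fn`, and `args` are assumed to be supplied by the caller:

```c++
#include <vector>

#include "tensorflow/compiler/tf2xla/xla_compiler.h"

namespace tensorflow {

// Hypothetical helper: compile a While-loop body with matching signatures.
Status CompileWhileBody(XlaCompiler* compiler, const NameAttrList& body_fn,
                        const std::vector<XlaCompiler::Argument>& args,
                        XlaCompiler::CompilationResult* body) {
  XlaCompiler::CompileOptions options;
  // Pack the loop state into a single tuple parameter.
  options.use_tuple_arg = true;
  // Emit updated values for every variable, modified or not, so inputs and
  // outputs line up position-for-position.
  options.return_updated_values_for_all_variables = true;
  return compiler->CompileFunction(options, body_fn, args, body);
}

}  // namespace tensorflow
```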
+
   // Compiles a Tensorflow function `fn_name_attrs` into an XLA computation.
   // `args` describes the arguments to the function, each of which must either
-  // be a parameter to the XLA computation or a compile-time constant.
-  // Writes the compiled output to `result`.
+  // be a runtime parameter to the XLA computation, a compile-time constant, or
+  // a resource variable. Writes the compiled output to `result`.
   //
   // The generated XLA computation returns a tuple containing only the
   // non-constant outputs as a function of the input arguments. Constant
   // arguments are returned as host memory tensors in the output list and are
   // not included in the XLA computation's outputs. The XLA computation is
-  // null if there are no data-dependent outputs.
-  Status CompileFunction(FunctionLibraryRuntime* flr,
+  // null if there are no data-dependent outputs and no side effects.
+  Status CompileFunction(const CompileOptions& options,
                          const NameAttrList& fn_name_attrs,
                          const std::vector<Argument>& args,
                          CompilationResult* result);
@@ -142,43 +255,21 @@ class XlaCompiler {
   // Compiles a tensorflow::Graph into an xla::Computation.
   // Similar to CompileFunction, but takes a Graph as input rather than a
   // function.
-  // If `use_tuple_arg` is true, the compilation takes all of its arguments as
-  // a single tuple.
-  Status CompileGraph(string const& name, std::unique_ptr<Graph> graph,
-                      FunctionLibraryRuntime* flr,
-                      const std::vector<Argument>& args, bool use_tuple_arg,
+  Status CompileGraph(const CompileOptions& options, string const& name,
+                      std::unique_ptr<Graph> graph,
+                      const std::vector<Argument>& args,
                       CompilationResult* result);
 
-  // Helper function that compiles a function to an XLA computation suitable
-  // for use as a subroutine in other Computations, e.g., the body of a
-  // While loop.
-  //
-  // The emitted Computation takes a single input parameter with
-  // input_shape. If this is a tuple then the tuple element shapes
-  // must match the types of the function's _Arg nodes. If input_shape
-  // is not a tuple then the function must have a single _Arg node
-  // with the same type as input_shape. The shapes of the _Arg values
-  // will be compiled to match input_shape.
-  //
-  // The emitted Computation also returns a single value. If output_shape is a
-  // tuple the tuple elements' types and shapes must match the compiled
-  // function's _Retval nodes. If output_shape is not a tuple the
-  // function must have a single _Retval node with the correct type
-  // (and shape after compilation).
-  Status CompileSubComputation(FunctionLibraryRuntime* flr,
-                               const NameAttrList& fn_name_attrs,
-                               const xla::Shape& input_shape,
-                               const xla::Shape& output_shape,
-                               xla::Computation* computation);
-
-  // Takes <*result>, which has been compiled from a Tensorflow subgraph to a
+  // Takes `result`, which has been compiled from a Tensorflow subgraph to an
   // XLA computation already, and generates an XLA LocalExecutable
   // `executable`.
   Status BuildExecutable(const CompilationResult& result,
                          std::unique_ptr<xla::LocalExecutable>* executable);
 
-  xla::Client* client() const { return client_; }
+  const Options& options() const { return options_; }
+  xla::Client* client() const { return options_.client; }
   XlaCompilationDevice* device() const { return device_; }
   const DeviceMgr* device_mgr() const { return &device_mgr_; }
+  FunctionLibraryRuntime* flib_runtime() const { return flib_runtime_.get(); }
 
   // Retrieves the channel handle associated with `key`. Allocates
   // a new channel handle if none exists.
@@ -187,17 +278,10 @@ class XlaCompiler {
   Status GetChannelHandle(const string& key, xla::ChannelHandle* channel);
 
  private:
-  // Does the real work of Compile() and CompileToComputation().
-  Status CompileFunctionBody(FunctionLibraryRuntime* flr,
-                             const FunctionBody& function_body,
-                             const string& name,
-                             const std::vector<Argument>& args,
-                             bool use_tuple_arg, CompilationResult* result);
+  Options options_;
 
-  xla::Client* client_;  // Not owned.
-  const bool allow_cpu_custom_calls_;
-  const bool local_executable_has_hybrid_result_;
-  const bool resolve_compile_time_constants_;
+  // Status set to non-OK in the constructor if initialization fails.
+  Status initialization_status_;
 
   // Returns the next step sequence number.
   int64 NextStepId();
@@ -210,6 +294,17 @@ class XlaCompiler {
   XlaCompilationDevice* device_;  // Owned by device_mgr_
   DeviceMgr device_mgr_;
 
+  std::unique_ptr<FunctionLibraryRuntime> flib_runtime_;
+
+  struct SignatureHash {
+    uint64 operator()(
+        const std::pair<string, std::vector<Argument>>& signature) const;
+  };
+
+  std::unordered_map<std::pair<string, std::vector<Argument>>,
+                     CompilationResult, SignatureHash>
+      cache_;
+
   std::unordered_map<string, xla::ChannelHandle> channels_ GUARDED_BY(mu_);
 
   TF_DISALLOW_COPY_AND_ASSIGN(XlaCompiler);
diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
index ca7c0b17b8c..58d74057d10 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
@@ -17,12 +17,14 @@ limitations under the License.
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/ops/function_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
-#include "tensorflow/compiler/tf2xla/xla_compilation_device.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_constructor.h"
@@ -33,12 +35,73 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
+// Helper class to test the ability to pass resources through to XLA
+// compiled kernels.
+class DummyResourceForTest : public ResourceBase {
+ public:
+  string DebugString() override { return "dummy"; }
+  void Increment() { ++value_; }
+  int Get() { return value_; }
+
+ private:
+  int value_ = 0;
+};
+
+class DummyReadResourceOp : public XlaOpKernel {
+ public:
+  explicit DummyReadResourceOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
+  void Compile(XlaOpKernelContext* ctx) override {
+    ResourceMgr* rm = ctx->op_kernel_context()->resource_manager();
+    OP_REQUIRES(ctx, rm, errors::Internal("No resource manager."));
+    DummyResourceForTest* dummy;
+    OP_REQUIRES_OK(ctx, rm->Lookup<DummyResourceForTest>(
+                            rm->default_container(), "dummy", &dummy));
+    dummy->Increment();
+    dummy->Unref();
+
+    ctx->SetOutput(0, ctx->Input(0));
+  }
+};
+
+class DummyReadResourceCC {
+ public:
+  DummyReadResourceCC(const Scope& scope, const Input& value) {
+    if (!scope.ok()) return;
+    auto _value = ops::AsNodeOut(scope, value);
+    if (!scope.ok()) return;
+    Node* ret;
+    const auto unique_name = scope.GetUniqueNameForOp("DummyReadResource");
+    auto builder = NodeBuilder(unique_name, "DummyReadResource").Input(_value);
+    scope.UpdateBuilder(&builder);
+    scope.UpdateStatus(builder.Finalize(scope.graph(), &ret));
+    if (!scope.ok()) return;
+    this->output_ = Output(ret, 0);
+  }
+  Node* node() const { return output_.node(); }
+
+  Output output_;
+};
+
+REGISTER_OP("DummyReadResource")
+    .Input("input: int32")
+    .Output("output: int32")
+    .Doc(R"doc(
+A dummy Op.
+
+input: dummy input.
+output: dummy output.
+)doc");
+
+REGISTER_XLA_OP(Name("DummyReadResource"), DummyReadResourceOp);
+
 class XlaCompilerTest : public ::testing::Test {
  protected:
+  XlaCompilerTest() : cpu_device_type_(DEVICE_CPU_XLA_JIT) {}
+
   void SetUp() override {
     client_ = xla::ClientLibrary::LocalClientOrDie();
 
-    XlaOpRegistry::RegisterJitKernels();
+    XlaOpRegistry::RegisterCompilationKernels();
 
     FunctionDefLibrary flib;
     flib_def_.reset(new FunctionLibraryDefinition(OpRegistry::Global(), flib));
@@ -46,19 +109,13 @@ class XlaCompilerTest : public ::testing::Test {
 
   XlaCompiler::Options DefaultOptions() {
     XlaCompiler::Options options;
-    options.device_type = DeviceType(DEVICE_CPU_XLA_JIT);
+    options.device_type = &cpu_device_type_;
     options.client = client_;
+    options.flib_def = flib_def_.get();
     return options;
   }
 
-  std::unique_ptr<FunctionLibraryRuntime> BuildFunctionLibraryRuntime(
-      const XlaCompiler& compiler) {
-    return std::unique_ptr<FunctionLibraryRuntime>(NewFunctionLibraryRuntime(
-        compiler.device_mgr(), /*env=*/nullptr, compiler.device(),
-        TF_GRAPH_DEF_VERSION, flib_def_.get(), OptimizerOptions(),
-        /*custom_kernel_creator=*/nullptr));
-  }
-
+  DeviceType cpu_device_type_;
   xla::Client* client_;
   std::unique_ptr<FunctionLibraryDefinition> flib_def_;
 };
@@ -66,16 +123,15 @@ class XlaCompilerTest : public ::testing::Test {
 // Tests compilation of an empty graph.
 TEST_F(XlaCompilerTest, EmptyReturnValues) {
   XlaCompiler compiler(DefaultOptions());
-  auto flr = BuildFunctionLibraryRuntime(compiler);
 
   std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
   XlaCompiler::CompilationResult result;
-  TF_ASSERT_OK(compiler.CompileGraph("add", std::move(graph), flr.get(),
-                                     /*args=*/{}, /*use_tuple_arg=*/false,
-                                     &result));
+  TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "add",
+                                     std::move(graph),
+                                     /*args=*/{}, &result));
 
   // No computation should be generated.
-  EXPECT_EQ(0, result.computation.handle().handle());
+  EXPECT_EQ(0, result.computation->handle().handle());
 }
 
 // Tests compilation and execution of a graph that adds two tensors.
@@ -91,20 +147,19 @@ TEST_F(XlaCompilerTest, Simple) {
 
   // Builds a description of the arguments.
   std::vector<XlaCompiler::Argument> args(2);
+  args[0].kind = XlaCompiler::Argument::kParameter;
   args[0].type = DT_INT32;
   args[0].shape = TensorShape({2});
-  args[0].parameter = 0;
+  args[1].kind = XlaCompiler::Argument::kParameter;
   args[1].type = DT_INT32;
   args[1].shape = TensorShape({2});
-  args[1].parameter = 1;
 
   // Compiles the graph.
   XlaCompiler compiler(DefaultOptions());
-  auto flr = BuildFunctionLibraryRuntime(compiler);
 
   XlaCompiler::CompilationResult result;
-  TF_ASSERT_OK(compiler.CompileGraph("add", std::move(graph), flr.get(), args,
-                                     /*use_tuple_arg=*/false, &result));
+  TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "add",
+                                     std::move(graph), args, &result));
 
   // Tests that the generated computation works.
   std::unique_ptr<xla::Literal> param0_literal =
@@ -118,7 +173,7 @@ TEST_F(XlaCompilerTest, Simple) {
 
   std::unique_ptr<xla::GlobalData> actual =
       client_
-          ->Execute(result.computation, {param0_data.get(), param1_data.get()})
+          ->Execute(*result.computation, {param0_data.get(), param1_data.get()})
           .ConsumeValueOrDie();
   std::unique_ptr<xla::Literal> actual_literal =
       client_->Transfer(*actual).ConsumeValueOrDie();
@@ -144,23 +199,22 @@ TEST_F(XlaCompilerTest, ConstantOutputs) {
 
   // Builds a description of the arguments.
   std::vector<XlaCompiler::Argument> args(1);
+  args[0].kind = XlaCompiler::Argument::kParameter;
   args[0].type = DT_INT32;
   args[0].shape = TensorShape({2});
-  args[0].parameter = 0;
 
   {
     // Compiles the graph, with resolve_compile_time_constants enabled.
     XlaCompiler::Options options = DefaultOptions();
     options.resolve_compile_time_constants = true;
    XlaCompiler compiler(options);
-    auto flr = BuildFunctionLibraryRuntime(compiler);
 
     std::unique_ptr<Graph> graph_copy(new Graph(OpRegistry::Global()));
     CopyGraph(*graph, graph_copy.get());
 
     XlaCompiler::CompilationResult result;
-    TF_ASSERT_OK(compiler.CompileGraph("constants", std::move(graph_copy),
-                                       flr.get(), args, /*use_tuple_arg=*/false,
+    TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(),
+                                       "constants", std::move(graph_copy), args,
                                        &result));
 
     ASSERT_EQ(2, result.outputs.size());
@@ -176,7 +230,7 @@ TEST_F(XlaCompilerTest, ConstantOutputs) {
         client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
     std::unique_ptr<xla::GlobalData> actual =
-        client_->Execute(result.computation, {param0_data.get()})
+        client_->Execute(*result.computation, {param0_data.get()})
             .ConsumeValueOrDie();
     std::unique_ptr<xla::Literal> actual_literal =
         client_->Transfer(*actual).ConsumeValueOrDie();
@@ -191,14 +245,13 @@ TEST_F(XlaCompilerTest, ConstantOutputs) {
     XlaCompiler::Options options = DefaultOptions();
     options.resolve_compile_time_constants = false;
     XlaCompiler compiler(options);
-    auto flr = BuildFunctionLibraryRuntime(compiler);
 
     std::unique_ptr<Graph> graph_copy(new Graph(OpRegistry::Global()));
     CopyGraph(*graph, graph_copy.get());
 
     XlaCompiler::CompilationResult result;
-    TF_ASSERT_OK(compiler.CompileGraph("constants", std::move(graph_copy),
-                                       flr.get(), args, /*use_tuple_arg=*/false,
+    TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(),
+                                       "constants", std::move(graph_copy), args,
                                        &result));
 
     ASSERT_EQ(2, result.outputs.size());
@@ -212,7 +265,7 @@ TEST_F(XlaCompilerTest, ConstantOutputs) {
         client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
     std::unique_ptr<xla::GlobalData> actual =
-        client_->Execute(result.computation, {param0_data.get()})
+        client_->Execute(*result.computation, {param0_data.get()})
            .ConsumeValueOrDie();
    std::unique_ptr<xla::Literal> actual_literal =
        client_->Transfer(*actual).ConsumeValueOrDie();
@@ -227,5 +280,44 @@ TEST_F(XlaCompilerTest, ConstantOutputs) {
   }
 }
 
+// Tests that a resource manager can be passed to XLA op kernels during
+// compilation via Options::populate_resource_manager.
+TEST_F(XlaCompilerTest, ResourceManager) {
+  // Builds a graph that calls the dummy resource Op.
+  Scope scope = Scope::NewRootScope().ExitOnError();
+  auto a = ops::_Arg(scope.WithOpName("A"), DT_INT32, 0);
+  auto b = DummyReadResourceCC(scope.WithOpName("B"), a);
+  auto c = ops::_Retval(scope.WithOpName("C"), b.output_, 0);
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  TF_ASSERT_OK(scope.ToGraph(graph.get()));
+
+  // Builds a description of the argument.
+  std::vector<XlaCompiler::Argument> args(1);
+  args[0].kind = XlaCompiler::Argument::kParameter;
+  args[0].type = DT_INT32;
+  args[0].shape = TensorShape({2});
+
+  DummyResourceForTest* resource = new DummyResourceForTest();
+
+  // Compiles the graph.
+  auto options = DefaultOptions();
+  std::function<Status(ResourceMgr*)> populate_function =
+      [resource](ResourceMgr* rm) {
+        resource->Ref();
+        return rm->Create(rm->default_container(), "dummy", resource);
+      };
+  options.populate_resource_manager = &populate_function;
+  XlaCompiler compiler(options);
+
+  EXPECT_EQ(0, resource->Get());
+
+  XlaCompiler::CompilationResult result;
+  TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "dummy",
+                                     std::move(graph), args, &result));
+
+  EXPECT_EQ(1, resource->Get());
+
+  resource->Unref();
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/xla_context.cc b/tensorflow/compiler/tf2xla/xla_context.cc
index a770271628c..4440b530696 100644
--- a/tensorflow/compiler/tf2xla/xla_context.cc
+++ b/tensorflow/compiler/tf2xla/xla_context.cc
@@ -22,6 +22,8 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/literal_util.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/layout_util.h"
@@ -31,25 +33,9 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/platform/thread_annotations.h"
 
 namespace tensorflow {
 
-XlaExpression::XlaExpression() : has_constant_value_(false) {}
-
-void XlaExpression::set_handle(const xla::ComputationDataHandle& h) {
-  handle_ = h;
-}
-const xla::ComputationDataHandle& XlaExpression::handle() const {
-  return handle_;
-}
-
-void XlaExpression::set_constant_value(Tensor value) {
-  has_constant_value_ = true;
-  constant_value_ = std::move(value);
-}
-
 const char XlaContext::kXlaContextResourceName[] = "_xla_context";
 
 // Looks up the context associated with the current step. It is stored
@@ -68,145 +54,37 @@ const char XlaContext::kXlaContextResourceName[] = "_xla_context";
   return *context;
 }
 
-Status XlaContext::BuildArguments(std::vector<XlaCompiler::Argument> args,
-                                  bool use_tuple_arg) {
+/* static */ XlaContext& XlaContext::Get(const XlaOpKernelContext* ctx) {
+  return Get(ctx->op_kernel_context());
+}
+
+void XlaContext::set_args(std::vector<XlaContext::Argument> args) {
   args_ = std::move(args);
-  use_tuple_arg_ = use_tuple_arg;
-
-  // Compute the number of parameters, verify that they are sequential starting
-  // from 0
-  num_parameters_ = 0;
-  for (const XlaCompiler::Argument& arg : args_) {
-    if (arg.parameter < 0) continue;
-    if (num_parameters_ != arg.parameter) {
-      return errors::InvalidArgument(
-          "Parameter numbers to JIT compilation are not consecutive starting "
-          "from 0");
-    }
-    ++num_parameters_;
-
-    if (arg.shape.num_elements() == 0) {
-      return errors::InvalidArgument(
-          "Non-constant argument must have a non-zero number of elements.");
-    }
-  }
-  if (num_parameters_ == 0) return Status::OK();
-
-  parameters_.resize(num_parameters_);
-
-  std::vector<xla::Shape> parameter_shapes(num_parameters_);
-  for (int i = 0; i < args_.size(); ++i) {
-    const XlaCompiler::Argument& arg = args_[i];
-    if (arg.parameter < 0) continue;
-    // Computes the shapes of non-constant arguments.
-    xla::PrimitiveType type;
-    TF_RETURN_IF_ERROR(DataTypeToPrimitiveType(arg.type, &type));
-    xla::ShapeUtil::PopulateShape(type, arg.shape.dim_sizes(),
-                                  &parameter_shapes[arg.parameter]);
-  }
-
-  if (use_tuple_arg_ && num_parameters_ > 0) {
-    xla::Shape tuple_shape = xla::ShapeUtil::MakeTupleShape(parameter_shapes);
-    xla::ComputationDataHandle tuple =
-        builder().Parameter(0, tuple_shape, "arg_tuple");
-    for (int i = 0; i < args_.size(); ++i) {
-      const XlaCompiler::Argument& arg = args_[i];
-      if (arg.parameter < 0) continue;
-      parameters_[arg.parameter] =
-          builder().GetTupleElement(tuple, arg.parameter);
-    }
-  } else {
-    for (int i = 0; i < args_.size(); ++i) {
-      const XlaCompiler::Argument& arg = args_[i];
-      if (arg.parameter < 0) continue;
-      parameters_[arg.parameter] =
-          builder().Parameter(arg.parameter, parameter_shapes[arg.parameter],
-                              strings::StrCat("arg", i));
-    }
-  }
-  return Status::OK();
 }
 
-Status XlaContext::CollectResults(
-    xla::Computation* computation, bool* requires_runtime_context,
-    std::vector<ConstRetVal>* compile_time_constants,
-    int* num_nonconst_outputs) {
-  mutex_lock l(mu_);
-
-  xla::ComputationDataHandle handle;
-  if (retval_.empty() && has_side_effects_) {
-    // Build a empty tuple return value for computations that have side effects
-    // but have no return values.
-    handle = builder().Tuple({});
-  } else if (retval_.size() == 1) {
-    handle = retval_[0].second;
-
-    // TODO(b/31775371): to workaround bug, add a no-op computation that is
-    // guaranteed to be constructed after all of the formal parameters to the
-    // computation.
-    handle = builder().GetTupleElement(builder().Tuple({handle}), 0);
-
-    // Ensure that the retval is returned even if another computation
-    // was mistakenly placed on the ComputationBuilder.
-    TF_CHECK_OK(builder().SetReturnValue(handle));
-  } else if (retval_.size() > 1) {
-    // There is at least one data-dependent expression: combine them
-    // into a Tuple in index order before compiling.
-    VLOG(1) << "Making the retval tuple.";
-    std::sort(retval_.begin(), retval_.end(),
-              [](const std::pair<int, xla::ComputationDataHandle>& a,
-                 const std::pair<int, xla::ComputationDataHandle>& b) {
-                return a.first < b.first;
-              });
-    std::vector<xla::ComputationDataHandle> elems;
-    elems.reserve(retval_.size());
-    for (const std::pair<int, xla::ComputationDataHandle>& r : retval_) {
-      elems.push_back(r.second);
-    }
-    // Make a tuple from the vector of handles.
-    handle = builder().Tuple(elems);
-  }
-
-  if (handle.handle() > 0) {
-    // Builds the XLA computation.
-    xla::StatusOr<xla::Computation> computation_status = builder().Build();
-    if (!computation_status.ok()) {
-      return computation_status.status();
-    }
-    *computation = computation_status.ConsumeValueOrDie();
-  }
-
-  // Make sure the compile time constants are in RetVal index order.
-  std::sort(compile_time_constant_.begin(), compile_time_constant_.end(),
-            [](const ConstRetVal& a, const ConstRetVal& b) {
-              return a.index < b.index;
-            });
-
-  // Fill in the result details and return.
-  *compile_time_constants = std::move(compile_time_constant_);
-  *requires_runtime_context = has_context_parameter_;
-  *num_nonconst_outputs = retval_.size();
-  return Status::OK();
-}
-
-XlaContext::XlaContext(XlaCompiler* compiler, xla::Client* client,
-                       const string& computation_name,
+XlaContext::XlaContext(XlaCompiler* compiler, xla::ComputationBuilder* builder,
                        bool allow_cpu_custom_calls,
                        bool resolve_compile_time_constants)
     : compiler_(compiler),
-      xla_builder_(client, computation_name),
+      builder_(builder),
       allow_cpu_custom_calls_(allow_cpu_custom_calls),
       resolve_compile_time_constants_(resolve_compile_time_constants) {}
 
 const xla::ComputationDataHandle&
 XlaContext::GetOrCreateRuntimeContextParameter() {
-  mutex_lock lock(mu_);
   CHECK(allow_cpu_custom_calls_);
-  CHECK(!use_tuple_arg_);
   if (has_context_parameter_) return context_parameter_;
   has_context_parameter_ = true;
-  context_parameter_ = xla_builder_.Parameter(
-      num_parameters_, xla::ShapeUtil::MakeOpaqueShape(), "tf_context");
+
+  // Allocate the next available parameter for the context parameter.
+  int num_parameters = 0;
+  for (const Argument& arg : args_) {
+    if (!arg.value.is_constant) {
+      ++num_parameters;
+    }
+  }
+  context_parameter_ = builder_->Parameter(
+      num_parameters, xla::ShapeUtil::MakeOpaqueShape(), "tf_context");
   return context_parameter_;
 }
 
@@ -214,72 +92,61 @@ string XlaContext::DebugString() { return "XLA JIT context"; }
 
 // This is called by the Retval Op to associate a computed value
 // with a specific return value of the subgraph.
-void XlaContext::AddRetval(int retval_index,
+void XlaContext::AddRetval(int retval_index, DataType type,
                            const xla::ComputationDataHandle& handle) {
   VLOG(1) << "Added retval index " << retval_index << " to XLA computation";
-  // Add the return value to the list being built up. The executor
-  // is multi-threaded so this has to happen under the
-  // lock.
-  mutex_lock l(mu_);
-  retval_.emplace_back(retval_index, handle);
+  // Add the return value to the list being built up.
+  if (retvals_.size() <= retval_index) {
+    retvals_.resize(retval_index + 1);
+  }
+  retvals_[retval_index].is_constant = false;
+  retvals_[retval_index].type = type;
+  retvals_[retval_index].handle = handle;
 }
 
 Status XlaContext::AddConstRetval(int retval_index, DataType dtype,
                                   const xla::Literal& literal) {
   VLOG(1) << "Adding retval index " << retval_index
           << " with non-data-dependent tensor to XLA computation";
+  if (retvals_.size() <= retval_index) {
+    retvals_.resize(retval_index + 1);
+  }
+  retvals_[retval_index].type = dtype;
   if (resolve_compile_time_constants_) {
-    ConstRetVal value;
-    value.index = retval_index;
-    TF_RETURN_IF_ERROR(LiteralToHostTensor(literal, dtype, &value.value));
-    mutex_lock l(mu_);
-    compile_time_constant_.push_back(std::move(value));
+    retvals_[retval_index].is_constant = true;
+    TF_RETURN_IF_ERROR(LiteralToHostTensor(
+        literal, dtype, &retvals_[retval_index].constant_value));
   } else {
-    mutex_lock l(mu_);
-    retval_.emplace_back(retval_index, xla_builder_.ConstantLiteral(literal));
+    retvals_[retval_index].is_constant = false;
+    retvals_[retval_index].handle = builder_->ConstantLiteral(literal);
   }
   return Status::OK();
 }
 
 void XlaContext::AddSideEffects() {
-  mutex_lock lock(mu_);
   has_side_effects_ = true;
 }
 
-/* static */ const XlaExpression* XlaContext::CastExpressionFromTensor(
-    const Tensor& tensor) {
-  const XlaExpression* expression =
-      reinterpret_cast<const XlaExpression*>(tensor.tensor_data().data());
-  CHECK_NE(expression->handle().handle(), 0);
-  VLOG(1) << "Fetched T" << expression->handle().handle();
-  return expression;
-}
+xla::ComputationBuilder* XlaContext::builder() { return builder_; }
 
-/* static */ XlaExpression* XlaContext::CastExpressionFromUninitializedTensor(
-    Tensor* tensor) {
-  const XlaExpression* expression =
-      reinterpret_cast<const XlaExpression*>(tensor->tensor_data().data());
-  CHECK_EQ(expression->handle().handle(), 0);
-  return const_cast<XlaExpression*>(expression);
+Status XlaContext::CreateVariable(int arg_num, string name, DataType type,
+                                  const xla::ComputationDataHandle& handle,
+                                  XlaVariable** variable) {
+  variables_.emplace_back(new XlaVariable);
+  *variable = variables_.back().get();
+  XlaVariable& var = **variable;
+  var.arg_num = arg_num;
+  var.name = std::move(name);
+  var.type = type;
+  var.initial_value = var.value = handle;
+  return Status::OK();
 }
 
-/* static */ const XlaExpression* XlaContext::GetExpressionFromTensor(
-    const Tensor& tensor) {
-  return CastExpressionFromTensor(tensor);
-}
-
-/* static */ const xla::ComputationDataHandle&
-XlaContext::GetComputationFromTensor(const Tensor& tensor) {
-  return CastExpressionFromTensor(tensor)->handle();
-}
-
-xla::ComputationBuilder& XlaContext::builder() { return xla_builder_; }
-
 const xla::Computation* XlaContext::GetOrCreateMax(const DataType type) {
   return LookupOrCreate(type, &max_func_, [this, type] {
     const string type_string = DataTypeString(type);
     VLOG(1) << "Building Max() for " << type_string;
-    xla::ComputationBuilder b(builder().client(), "max<" + type_string + ">");
+    xla::ComputationBuilder b(builder()->client(), "max<" + type_string + ">");
     xla::PrimitiveType xla_type;
     TF_CHECK_OK(DataTypeToPrimitiveType(type, &xla_type));
     auto x = b.Parameter(0, xla::ShapeUtil::MakeShape(xla_type, {}), "x");
@@ -293,7 +160,7 @@ const xla::Computation* XlaContext::GetOrCreateAdd(const DataType type) {
   return LookupOrCreate(type, &add_func_, [this, type] {
     const string type_string = DataTypeString(type);
     VLOG(1) << "Building Add() for " << type_string;
-    xla::ComputationBuilder b(builder().client(), "add<" + type_string + ">");
+    xla::ComputationBuilder b(builder()->client(), "add<" + type_string + ">");
     xla::PrimitiveType xla_type;
     TF_CHECK_OK(DataTypeToPrimitiveType(type, &xla_type));
     auto x = b.Parameter(0, xla::ShapeUtil::MakeShape(xla_type, {}), "x");
@@ -307,14 +174,19 @@ const xla::Computation* XlaContext::GetOrCreateSigmoid(const DataType type) {
   return LookupOrCreate(type, &sigmoid_func_, [this, type] {
     const string type_string = DataTypeString(type);
     VLOG(1) << "Building Sigmoid() for " << type_string;
-    xla::ComputationBuilder b(builder().client(),
+    xla::ComputationBuilder b(builder()->client(),
                               "sigmoid<" + type_string + ">");
     xla::PrimitiveType xla_type;
     TF_CHECK_OK(DataTypeToPrimitiveType(type, &xla_type));
     auto x = b.Parameter(0, xla::ShapeUtil::MakeShape(xla_type, {}), "x");
-    auto one = b.ConstantLiteral(xla::LiteralUtil::One(xla_type));
-    auto minus_one = b.Neg(one);
-    b.Div(one, b.Add(b.Exp(b.Mul(x, minus_one)), one));
+    // Clamp the inputs to the range [-18, 18] since anything outside
+    // this range is 0.0f or 1.0f in single-precision. We must clamp the range
+    // of x to avoid incorrect outputs due to fast-math optimizations for large
+    // negative x.
+    x = b.Clamp(XlaHelpers::IntegerLiteral(&b, type, -18), x,
+                XlaHelpers::IntegerLiteral(&b, type, 18));
+    auto one = XlaHelpers::One(&b, type);
+    b.Div(one, b.Add(b.Exp(b.Neg(x)), one));
     return b.Build().ConsumeValueOrDie();
   });
 }
@@ -323,7 +195,6 @@ const xla::Computation* XlaContext::LookupOrCreate(
     DataType type, ComputationMap* out,
     const std::function<xla::Computation()>& create) {
   {
-    mutex_lock l(mu_);
     const auto& entry = (*out)[type];
     if (!entry.IsNull()) {
       return &entry;
@@ -331,7 +202,6 @@ const xla::Computation* XlaContext::LookupOrCreate(
   }
   auto new_entry = create();
   {
-    mutex_lock l(mu_);
     // Somebody else might have made one concurrently.
     auto& entry = (*out)[type];
     if (entry.IsNull()) {
@@ -341,4 +211,4 @@ const xla::Computation* XlaContext::LookupOrCreate(
   }
 }
 
-}  // end namespace tensorflow
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/xla_context.h b/tensorflow/compiler/tf2xla/xla_context.h
index 8ece3d37984..3978baaf637 100644
--- a/tensorflow/compiler/tf2xla/xla_context.h
+++ b/tensorflow/compiler/tf2xla/xla_context.h
@@ -13,178 +13,109 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// This file defines the contexts used to represent XLA JIT computatations.
+// This file defines the contexts used during XLA compilation.
 
 #ifndef TENSORFLOW_COMPILER_TF2XLA_XLA_CONTEXT_H_
 #define TENSORFLOW_COMPILER_TF2XLA_XLA_CONTEXT_H_
 
 #include <vector>
 
-#include "tensorflow/compiler/tf2xla/xla_compilation_device.h"
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
-#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
-#include "tensorflow/compiler/xla/client/client.h"
+#include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/resource_mgr.h"
-#include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/platform/macros.h"
-#include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/platform/thread_annotations.h"
 
 namespace tensorflow {
 
-// A XlaExpression wraps an XLA computation. Each Tensor sent
-// along an edge during XLA JIT compilation represents a
-// XlaExpression, and the shape of the Tensor matches the shape of
-// the subcomputation in the ComputationDataHandle. Each
-// expression is either a constant, an unbound parameter, or a
-// function of previously-compiled expressions.
-class XlaExpression {
- public:
-  XlaExpression();
+class XlaOpKernelContext;
 
-  // handle() stores the XLA handle of the computation that the
-  // expression represents.
-  void set_handle(const xla::ComputationDataHandle& h);
-  const xla::ComputationDataHandle& handle() const;
-
-  void set_constant_value(Tensor value);
-  bool has_constant_value() const { return has_constant_value_; }
-  const Tensor& constant_value() const { return constant_value_; }
-
- private:
-  friend class XlaContext;
-
-  // The XLA handle of the expression's computation.
-  xla::ComputationDataHandle handle_;
-
-  // If this expression is a constant with a known value, 'constant_value' is a
-  // host-memory Tensor containing the value. Used to avoid invoking XLA for
-  // expressions that are trivially constant.
-  bool has_constant_value_;
-  Tensor constant_value_;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(XlaExpression);
-};
-
-// The XlaContext is the data structure accessible from
-// OpKernelContexts when evaluating a subgraph of Ops for JIT
-// compilation by XLA. When an Op is executed during JIT
-// compilation the input Tensors to the Op store handles to
-// subcomputations compiled by earlier Ops in the subgraph. The Op can
-// retrieve these subcomputations by calling either
-// GetExpressionFromTensor, which returns the XlaExpression holding
-// the subcomputation; or EvaluateAsConstant which returns an XLA
-// literal of the result of the subcomputation or an error status if
-// the subcomputation depends on unbound parameters. The Op may then
-// use the ComputationBuilder available from XlaContext::builder()
-// to compile one or more functions of the inputs into
-// ComputationDataHandles. The handles can be stored as new
-// expressions corresponding to the outputs of the Op by calling
-// CreateOutputTensorFromComputation or
-// CreateConstantOutputTensor. The *only* correct way to allocate an
-// output tensor is using one of the preceding two methods, since they
-// ensure there is a valid XlaExpression backing the output
-// tensor. No Op should ever call allocate_output or allocate_temp
-// directly on the OpKernelContext. It is permissible to pass a tensor
-// from an Op input to an output (e.g. call ctx->set_output with a
-// tensor passed as an input). As an example, the softmax Op produces
-// output from input as follows:
-//
-//    XlaContext& tc = XlaContext::Get(context);
-//    xla::ComputationBuilder& b = tc.builder();
-//    xla::ComputationDataHandle logits =
-//        tc.GetComputationFromTensor(logits_in));
-//    ... The softmax computation uses the builder b to compute a
-//        xla::ComputationDataHandle softmax holding the desired output.
-//    ...
-//    OP_REQUIRES_OK(context, tc.CreateOutputTensorFromComputation(
-//                                context, 0, logits_in.shape().dim_sizes(),
-//                                softmax));
-//
+// The XlaContext is the data structure that holds the state of an XLA
+// compilation; it is accessible from OpKernelContexts when compiling a
+// subgraph of Ops using XLA.
 class XlaContext : public ResourceBase {
  public:
-  // If a retval can be evaluated at JIT time it is returned as a
-  // Literal in a ConstRetVal struct as part of the ComputationResult.
-  // TODO(misard) reconcile this with the duplicate data structure in
-  // the XlaCompilationCache class.
-  struct ConstRetVal {
-    // The index of the RetVal corresponding to this constant literal.
-    int index;
-    // If status is not OK, value's data is undefined.
-    Status status;
-    // The value of the RetVal evaluated at JIT compilation
-    // time. value.shape() always gives the correct shape of the
-    // RetVal. If !status.ok() then value's data is undefined, otherwise the
-    // Tensor buffer is allocated in CPU memory.
-    Tensor value;
+  // A struct that represents either a compile-time constant, or an XLA
+  // computation handle. Used to represent arguments and return values.
+  struct HandleOrConstant {
+    // Is this a compile-time constant? If so, what is its value?
+    bool is_constant;
+    Tensor constant_value;  // Must be in host memory.
+
+    // If this is not a constant, a computation handle. Since the mapping from
+    // Tensorflow types to XLA types is not necessarily injective (one-to-one),
+    // we also require the Tensorflow type.
+    DataType type;
+    xla::ComputationDataHandle handle;
   };
 
+  struct Argument {
+    // Descriptive name for the variable, for use in error messages.
+    string name;
+
+    // Is this a variable?
+    bool is_variable = false;
+
+    HandleOrConstant value;
+
+    int64 tensor_array_size = -1;
+  };
+
+  // Retrieves the XlaContext of the current compilation.
+  static XlaContext& Get(const OpKernelContext* ctx);
+  static XlaContext& Get(const XlaOpKernelContext* ctx);
+
+  // Creates a new XlaContext.
+  XlaContext(XlaCompiler* compiler, xla::ComputationBuilder* builder,
+             bool allow_cpu_custom_calls, bool resolve_compile_time_constants);
+
   // Virtual method defined by ResourceBase.
   string DebugString() override;
 
-  // Retrieve the XlaContext corresponding to a step's JIT compilation.
-  static XlaContext& Get(const OpKernelContext* ctx);
-  static XlaContext& Get(const XlaOpKernelContext* ctx) {
-    return Get(ctx->op_kernel_context());
-  }
-
-  // Create a new XlaContext.
-  XlaContext(XlaCompiler* compiler, xla::Client* client,
-             const string& computation_name, bool allow_cpu_custom_calls,
-             bool resolve_compile_time_constants);
-
-  // Builds XLA computations for each of the arguments.
-  // Should only be called once to initialize the arguments. Not thread-safe.
-  Status BuildArguments(std::vector<XlaCompiler::Argument> arguments,
-                        bool use_tuple_arg) TF_MUST_USE_RESULT;
-
-  // Returns the results of the symbolic computation that have accumulated in
-  // the XlaContext. After CollectResults() is called, the context is left in
-  // an invalid state and must not be reused.
-  // Sets `requires_runtime_context` if the emitted computation requires a
-  // runtime context argument. `compile_time_constants` describes any non
-  // data-dependent results of the computation. `num_nonconst_ouputs` is set to
-  // the number of outputs of the `computation`.
-  Status CollectResults(xla::Computation* computation,
-                        bool* requires_runtime_context,
-                        std::vector<ConstRetVal>* compile_time_constants,
-                        int* num_nonconst_outputs);
-
-  // This is called by the Retval Op to associate a computed value
-  // with a specific return value of the subgraph.
-  void AddRetval(int retval_index, const xla::ComputationDataHandle& handle);
-
-  // As for Retval, but for return values that are compile-time constants.
-  Status AddConstRetval(int retval_index, DataType dtype,
-                        const xla::Literal& literal);
-
-  // Mark the computation as having side effects (i.e., Send operators).
-  void AddSideEffects();
-
-  // Retrieves the ComputationDataHandle from an input Tensor to an Op. This
-  // computation was constructed by an Op that executed previously and
-  // created the output Tensor using CreateOutputTensorFromComputation
-  // or CreateConstantOutputTensor.
-  static const xla::ComputationDataHandle& GetComputationFromTensor(
-      const Tensor& tensor);
-
   XlaCompiler* compiler() const { return compiler_; }
 
   // Returns the ComputationBuilder that Ops use for compiling new
   // expressions.
-  xla::ComputationBuilder& builder();
+  xla::ComputationBuilder* builder();
 
-  const std::vector<XlaCompiler::Argument>& args() const { return args_; }
-  xla::ComputationDataHandle parameter(int num) { return parameters_[num]; }
+  bool allow_cpu_custom_calls() const { return allow_cpu_custom_calls_; }
+  bool has_context_parameter() const { return has_context_parameter_; }
+
+  const std::vector<Argument>& args() const { return args_; }
+  void set_args(std::vector<Argument> args);
 
   // Get the runtime context parameter, adding one if it does not already
   // exist. Dies if not compiling a local executable.
   const xla::ComputationDataHandle& GetOrCreateRuntimeContextParameter();
 
-  bool allow_cpu_custom_calls() const { return allow_cpu_custom_calls_; }
+  const std::vector<HandleOrConstant>& retvals() { return retvals_; }
+
+  // This is called by the Retval Op to associate a computed value
+  // with a specific return value of the subgraph.
+  void AddRetval(int retval_index, DataType type,
+                 const xla::ComputationDataHandle& handle);
+
+  // As for Retval, but for return values that are compile-time constants.
+  Status AddConstRetval(int retval_index, DataType dtype,
+                        const xla::Literal& literal);
+
+  // Mark the computation as having side effects (e.g., Send operators).
+  void AddSideEffects();
+
+  bool has_side_effects() const { return has_side_effects_; }
+
+  // Creates a variable with argument number `arg_num` and initial type `type`
+  // and value `handle`. `name` is a descriptive name for use in error
+  // messages. Fails if the variable already exists.
+  Status CreateVariable(int arg_num, string name, DataType type,
+                        const xla::ComputationDataHandle& handle,
+                        XlaVariable** variable);
+
+  const std::vector<std::unique_ptr<XlaVariable>>& variables() {
+    return variables_;
+  }
 
   // Get an XLA lambda to compute Max. This is cached in the
   // XlaContext since it may be used by multiple Ops. There is a
@@ -205,41 +136,11 @@ class XlaContext : public ResourceBase {
   static const char kXlaContextResourceName[];
 
  private:
-  friend class XlaOpKernelContext;
-
-  // This method is used to retrieve an expression that was allocated by
-  // a previous Op.
-  static const XlaExpression* CastExpressionFromTensor(const Tensor& tensor);
-
-  // This method is used to retrieve an uninitialized expression from a
-  // newly-allocated tensor.
-  static XlaExpression* CastExpressionFromUninitializedTensor(Tensor* tensor);
-
-  // Retrieves the expression from an input Tensor to an Op. This
-  // expression was constructed by an Op that executed previously and
-  // created the output Tensor using CreateOutputTensorFromComputation
-  // or CreateConstantOutputTensor.
-  static const XlaExpression* GetExpressionFromTensor(const Tensor& tensor);
-
   XlaCompiler* const compiler_;
 
-  mutable mutex mu_;
-
   // The ComputationBuilder used to construct the subgraph's compiled
   // representation.
-  xla::ComputationBuilder xla_builder_ GUARDED_BY(mu_);
-
-  // Number of XLA Parameters, not counting the context parameter, if any.
-  int num_parameters_;
-
-  // Arguments to the JIT compilation, both compile-time constant arguments and
-  // runtime parameters.
-  std::vector<XlaCompiler::Argument> args_;
-  bool use_tuple_arg_ = false;
-
-  // Runtime parameters to the XLA computation. Does not include
-  // compile-time constant arguments.
-  std::vector<xla::ComputationDataHandle> parameters_;
+  xla::ComputationBuilder* builder_;
 
   // Allow ops to emit CustomCall operations for CPU.
   const bool allow_cpu_custom_calls_;
@@ -252,18 +153,21 @@ class XlaContext : public ResourceBase {
   // for an additional final parameter to the computation, through which will be
   // passed a XlaLocalRuntimeContext* at runtime. Created on demand by
   // GetOrCreateRuntimeContextParameter().
-  bool has_context_parameter_ GUARDED_BY(mu_) = false;
-  xla::ComputationDataHandle context_parameter_ GUARDED_BY(mu_);
+  bool has_context_parameter_ = false;
+  xla::ComputationDataHandle context_parameter_;
 
-  // The data-dependent return values of the computation.
-  std::vector<std::pair<int, xla::ComputationDataHandle>> retval_
-      GUARDED_BY(mu_);
+  // Arguments to the Tensorflow graph, indexed by _Arg index.
+  // Includes both compile-time constant arguments and runtime parameters.
+  std::vector<Argument> args_;
 
-  // The non-data-dependent return values of the computation.
-  std::vector<ConstRetVal> compile_time_constant_ GUARDED_BY(mu_);
+  // Return values of the Tensorflow graph, indexed by _Retval index.
+  std::vector<HandleOrConstant> retvals_;
 
   // Does the computation have side effects, i.e., Send() calls?
-  bool has_side_effects_ GUARDED_BY(mu_) = false;
+  bool has_side_effects_ = false;
+
+  // Holds ownership of variables. The variables are not ordered.
+  std::vector<std::unique_ptr<XlaVariable>> variables_;
 
   // Cache of prebuilt computations indexed by their type.
   using ComputationMap = std::map<DataType, xla::Computation>;
@@ -273,16 +177,16 @@ class XlaContext : public ResourceBase {
   // map. The returned value != nullptr and is owned by the map.
   const xla::Computation* LookupOrCreate(
       DataType type, ComputationMap* out,
-      const std::function<xla::Computation()>& create) LOCKS_EXCLUDED(mu_);
+      const std::function<xla::Computation()>& create);
 
   // Cached computation to compute Max of two elements, specialized by type.
-  ComputationMap max_func_ GUARDED_BY(mu_);
+  ComputationMap max_func_;
 
   // Cached computation to compute Sum of two elements, specialized by type.
-  ComputationMap add_func_ GUARDED_BY(mu_);
+  ComputationMap add_func_;
 
   // Cached computation to compute Sigmoid of an element, specialized by type.
-  ComputationMap sigmoid_func_ GUARDED_BY(mu_);
+  ComputationMap sigmoid_func_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(XlaContext);
 };
diff --git a/tensorflow/compiler/tf2xla/xla_helpers.cc b/tensorflow/compiler/tf2xla/xla_helpers.cc
index efb0facf7b8..f060f8f2f17 100644
--- a/tensorflow/compiler/tf2xla/xla_helpers.cc
+++ b/tensorflow/compiler/tf2xla/xla_helpers.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_context.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 
@@ -89,7 +90,9 @@ xla::ComputationDataHandle XlaHelpers::IntegerLiteral(
     case xla::U16:
       LOG(FATAL) << "u16/s16 literals not yet implemented";
     case xla::F16:
-      LOG(FATAL) << "f16 literals not yet implemented";
+      literal = *xla::LiteralUtil::CreateR0<xla::half>(
+          static_cast<xla::half>(value));
+      break;
     case xla::TUPLE:
       LOG(FATAL) << "tuple element type is not integral";
     case xla::OPAQUE:
@@ -107,6 +110,9 @@ xla::ComputationDataHandle XlaHelpers::FloatLiteral(xla::ComputationBuilder* b,
   xla::PrimitiveType type;
   TF_CHECK_OK(DataTypeToPrimitiveType(data_type, &type));
   switch (type) {
+    case xla::F16:
+      return b->ConstantR0<xla::half>(static_cast<xla::half>(value));
+      break;
     case xla::F32:
       return b->ConstantR0<float>(static_cast<float>(value));
       break;
@@ -139,4 +145,64 @@ xla::ComputationDataHandle XlaHelpers::FloatLiteral(xla::ComputationBuilder* b,
   return Status::OK();
 }
 
+template <typename T>
+static Tensor MakeLinspaceTensor(const TensorShape& shape, int64 depth) {
+  Tensor linspace(DataTypeToEnum<T>::v(), shape);
+  auto linspace_flat = linspace.flat<T>();
+  for (int64 i = 0; i < depth; ++i) {
+    linspace_flat(i) = i;
+  }
+  return linspace;
+}
+
+Status XlaHelpers::OneHot(xla::ComputationBuilder* builder, int64 depth,
+                          int axis, DataType index_type,
+                          const TensorShape& indices_shape,
+                          const xla::ComputationDataHandle& indices,
+                          const xla::ComputationDataHandle& on_value,
+                          const xla::ComputationDataHandle& off_value,
+                          xla::ComputationDataHandle* one_hot) {
+  const int indices_dims = indices_shape.dims();
+  const int output_dims = indices_dims + 1;
+
+  TensorShape output_shape = indices_shape;
+  output_shape.InsertDim(axis, depth);
+
+  // Build a Tensor populated with values 0, 1, 2, ... depth.
+  std::vector<int64> linspace_dims(output_dims, 1);
+  linspace_dims[axis] = depth;
+  TensorShape linspace_shape(linspace_dims);
+  Tensor linspace;
+  switch (index_type) {
+    case DT_UINT8:
+      linspace = MakeLinspaceTensor<uint8>(linspace_shape, depth);
+      break;
+    case DT_INT32:
+      linspace = MakeLinspaceTensor<int32>(linspace_shape, depth);
+      break;
+    case DT_INT64:
+      linspace = MakeLinspaceTensor<int64>(linspace_shape, depth);
+      break;
+    default:
+      return errors::InvalidArgument("Invalid argument type ",
+                                     DataTypeString(index_type));
+  }
+  xla::Literal linspace_literal;
+  TF_RETURN_IF_ERROR(HostTensorToLiteral(linspace, &linspace_literal));
+
+  // Broadcast the linspace constant across the indices along the new axis,
+  // and test equality at each position.
+  std::vector<int64> broadcast_dims(indices_shape.dims());
+  std::iota(broadcast_dims.begin(), broadcast_dims.begin() + axis, 0);
+  std::iota(broadcast_dims.begin() + axis, broadcast_dims.end(), axis + 1);
+  xla::ComputationDataHandle one_hot_bool = builder->Eq(
+      indices, builder->ConstantLiteral(linspace_literal), broadcast_dims);
+
+  // Selects the user-provided off_value and on_value values.
+  *one_hot = builder->Select(
+      one_hot_bool, builder->Broadcast(on_value, output_shape.dim_sizes()),
+      builder->Broadcast(off_value, output_shape.dim_sizes()));
+  return Status::OK();
+}
+
 }  // end namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/xla_helpers.h b/tensorflow/compiler/tf2xla/xla_helpers.h
index 353ed02edda..a141ee05c13 100644
--- a/tensorflow/compiler/tf2xla/xla_helpers.h
+++ b/tensorflow/compiler/tf2xla/xla_helpers.h
@@ -66,6 +66,17 @@ class XlaHelpers {
   static Status ReshapeLiteral(const xla::Literal& input,
                                gtl::ArraySlice<int64> shape,
                                xla::Literal* output);
+
+  // Converts `indices` into a one-hot representation. `depth` is the size
+  // of the new axis to add. `axis` is the position at which to add the new
+  // axis. `indices_shape` is the shape of `indices`. `on_value` and
+  // `off_value` represent the values to use for the on and off positions,
+  // respectively.
+  static Status OneHot(xla::ComputationBuilder* builder, int64 depth, int axis,
+                       DataType index_type, const TensorShape& indices_shape,
+                       const xla::ComputationDataHandle& indices,
+                       const xla::ComputationDataHandle& on_value,
+                       const xla::ComputationDataHandle& off_value,
+                       xla::ComputationDataHandle* one_hot);
 };
 
 }  // end namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/xla_local_runtime_context.h b/tensorflow/compiler/tf2xla/xla_local_runtime_context.h
index cd773d64ed4..dca420d6ee3 100644
--- a/tensorflow/compiler/tf2xla/xla_local_runtime_context.h
+++ b/tensorflow/compiler/tf2xla/xla_local_runtime_context.h
@@ -23,7 +23,7 @@ limitations under the License.
 // actually used. E.g. some ahead-of-time compiled computations don't need a
 // thread pool.
 namespace Eigen {
-class ThreadPoolDevice;
+struct ThreadPoolDevice;
 }
 
 namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.cc b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
index 00cf1adc119..3272b1efa15 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.cc
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
@@ -31,11 +31,38 @@ bool XlaOpKernelContext::ValidateInputsAreSameShape(OpKernel* op) {
 }
 
 xla::ComputationBuilder* XlaOpKernelContext::builder() const {
-  return &XlaContext::Get(this).builder();
+  return XlaContext::Get(this).builder();
+}
+
+// Retrieves an XlaExpression that was allocated by a previous Op.
+static const XlaExpression* CastExpressionFromTensor(const Tensor& tensor) {
+  const XlaExpression* expression =
+      reinterpret_cast<const XlaExpression*>(tensor.tensor_data().data());
+  CHECK(expression->handle().handle() != 0 ||
+        expression->variable() != nullptr);
+  VLOG(1) << "Fetched T" << expression->handle().handle();
+  return expression;
+}
+
+// Retrieves an uninitialized XlaExpression from a newly-allocated tensor.
+static XlaExpression* CastExpressionFromUninitializedTensor(Tensor* tensor) {
+  const XlaExpression* expression =
+      reinterpret_cast<const XlaExpression*>(tensor->tensor_data().data());
+  CHECK_EQ(expression->handle().handle(), 0);
+  return const_cast<XlaExpression*>(expression);
+}
+
+// Retrieves the ComputationDataHandle from an input Tensor to an Op. This
+// computation was constructed by an Op that executed previously and
+// created the output Tensor using CreateOutputTensorFromComputation
+// or CreateConstantOutputTensor.
+static const xla::ComputationDataHandle& GetComputationFromTensor(
+    const Tensor& tensor) {
+  return CastExpressionFromTensor(tensor)->handle();
 }
 
 const xla::ComputationDataHandle& XlaOpKernelContext::Input(int index) {
-  return XlaContext::GetComputationFromTensor(context_->input(index));
+  return GetComputationFromTensor(context_->input(index));
 }
 
 TensorShape XlaOpKernelContext::InputShape(int index) {
@@ -60,8 +87,7 @@ Status XlaOpKernelContext::ConstantInputReshaped(
         " but was asked to be reshaped to incompatible shape ",
         new_shape.DebugString());
   }
-  const XlaExpression* expression =
-      XlaContext::CastExpressionFromTensor(tensor);
+  const XlaExpression* expression = CastExpressionFromTensor(tensor);
 
   // If the tensor has a known constant value, there is no need to invoke XLA.
   if (expression->has_constant_value()) {
@@ -112,6 +138,27 @@ Status XlaOpKernelContext::ConstantInputReshaped(
   return Status::OK();
 }
 
+// Converts an int32 or int64 scalar literal to an int64.
+static Status LiteralToInt64Scalar(const xla::Literal& literal, int64* out) {
+  if (xla::ShapeUtil::Rank(literal.shape()) != 0) {
+    return errors::InvalidArgument("value is not a scalar");
+  }
+  if (literal.shape().element_type() == xla::S32) {
+    *out = xla::LiteralUtil::Get<int32>(literal, {});
+  } else if (literal.shape().element_type() == xla::S64) {
+    *out = xla::LiteralUtil::Get<int64>(literal, {});
+  } else {
+    return errors::InvalidArgument("value must be either int32 or int64");
+  }
+  return Status::OK();
+}
+
+Status XlaOpKernelContext::ConstantInputAsIntScalar(int index, int64* out) {
+  xla::Literal literal;
+  TF_RETURN_IF_ERROR(ConstantInput(index, &literal));
+  return LiteralToInt64Scalar(literal, out);
+}
+
 // Converts an int32 or int64 1D literal to an int64 vector.
 static Status LiteralToInt64Vector(const xla::Literal& literal,
                                    std::vector<int64>* out) {
@@ -140,6 +187,31 @@ Status XlaOpKernelContext::ConstantInputAsIntVector(int index,
   return LiteralToInt64Vector(literal, out);
 }
 
+Status XlaOpKernelContext::ConstantInputAsInt64Literal(int index,
+                                                       xla::Literal* out) {
+  xla::Literal literal;
+  TF_RETURN_IF_ERROR(ConstantInput(index, &literal));
+  switch (literal.shape().element_type()) {
+    case xla::S32:
+      out->Clear();
+      *out->mutable_shape() = literal.shape();
+      out->mutable_shape()->set_element_type(xla::S64);
+      for (int32 x : literal.s32s()) {
+        out->add_s64s(x);
+      }
+      return Status::OK();
+
+    case xla::S64:
+      out->Swap(&literal);
+      return Status::OK();
+
+    default:
+      return errors::InvalidArgument(
+          "Invalid argument to ConstantInputAsInt64Literal: ",
+          xla::ShapeUtil::HumanString(literal.shape()));
+  }
+}
+
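A sketch of how an op kernel might use the constant-input helpers above (illustrative only; `ExampleConstantInputsOp` and its input layout are hypothetical):

```c++
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"

namespace tensorflow {

// Hypothetical kernel fragment: inputs 1 and 2 must be resolvable to
// compile-time constants (e.g., produced by Const nodes).
class ExampleConstantInputsOp : public XlaOpKernel {
 public:
  explicit ExampleConstantInputsOp(OpKernelConstruction* ctx)
      : XlaOpKernel(ctx) {}
  void Compile(XlaOpKernelContext* ctx) override {
    // A 1D int32/int64 input interpreted as a shape.
    TensorShape shape;
    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(1, &shape));

    // A scalar int32/int64 input, via the helper added above.
    int64 axis;
    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(2, &axis));

    // ... use `shape` and `axis` to build the computation ...
    ctx->SetOutput(0, ctx->Input(0));
  }
};

}  // namespace tensorflow
```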
Status XlaOpKernelContext::ConstantInputAsShape(int index, TensorShape* shape) { @@ -159,7 +231,7 @@ Status XlaOpKernelContext::InputList( handles->clear(); shapes->clear(); for (const Tensor& input : inputs) { - handles->push_back(XlaContext::GetComputationFromTensor(input)); + handles->push_back(GetComputationFromTensor(input)); shapes->push_back(input.shape()); } return Status::OK(); @@ -176,6 +248,49 @@ Status XlaOpKernelContext::ConstantInputList( return Status::OK(); } +Status XlaOpKernelContext::ReadVariableInput( + int index, xla::ComputationDataHandle* value) { + const Tensor& tensor = context_->input(index); + const XlaExpression* expression = CastExpressionFromTensor(tensor); + XlaVariable* variable = expression->variable(); + TF_RET_CHECK(variable != nullptr); + if (variable->value.handle() == 0) { + return errors::InvalidArgument("Read of uninitialized variable ", + variable->name); + } + *value = variable->value; + return Status::OK(); +} + +string XlaOpKernelContext::VariableDebugString(int index) { + const Tensor& tensor = context_->input(index); + const XlaExpression* expression = CastExpressionFromTensor(tensor); + XlaVariable* variable = expression->variable(); + if (!variable) { + return ""; + } + return variable->name; +} + +Status XlaOpKernelContext::GetVariableTypeAndShape(int index, DataType* type, + TensorShape* shape) const { + const Tensor& tensor = context_->input(index); + const XlaExpression* expression = CastExpressionFromTensor(tensor); + XlaVariable* variable = expression->variable(); + TF_RET_CHECK(variable != nullptr); + if (variable->value.handle() == 0) { + return errors::InvalidArgument("Read of uninitialized variable ", + variable->name); + } + *type = variable->type; + auto shape_or_status = builder()->GetShape(variable->value); + if (!shape_or_status.ok()) { + return shape_or_status.status(); + } + *shape = XLAShapeToTensorShape(*shape_or_status.ValueOrDie()); + return Status::OK(); +} + void XlaOpKernelContext::SetOutput(int index, const xla::ComputationDataHandle& handle) { // Makes the host Tensor that will refer to the expression. @@ -196,8 +311,7 @@ void XlaOpKernelContext::SetOutput(int index, // The expression is stored in the tensor's data buffer. Fill in the // fields now. - XlaExpression* expression = - XlaContext::CastExpressionFromUninitializedTensor(output); + XlaExpression* expression = CastExpressionFromUninitializedTensor(output); expression->set_handle(handle); } @@ -207,6 +321,7 @@ void XlaOpKernelContext::SetConstantOutput(int index, const Tensor& constant) { xla::Literal literal; OP_REQUIRES_OK(context_, HostTensorToLiteral(constant, &literal)); xla::ComputationDataHandle handle = builder()->ConstantLiteral(literal); + CHECK_NE(handle.handle(), 0); // Make the Tensor that will refer to the expression. Tensor* output = nullptr; @@ -217,16 +332,57 @@ void XlaOpKernelContext::SetConstantOutput(int index, const Tensor& constant) { // The expression is stored in the tensor's data buffer. Fill in the // fields now. - XlaExpression* expression = - XlaContext::CastExpressionFromUninitializedTensor(output); + XlaExpression* expression = CastExpressionFromUninitializedTensor(output); expression->set_handle(handle); expression->set_constant_value(constant); } +void XlaOpKernelContext::SetVariableOutput(int index, XlaVariable* variable) { + Tensor* output = nullptr; + // The shape of the output tensor is the shape of the variable resource + // (i.e., a scalar), not the shape of the variable's value. 
+  OP_REQUIRES_OK(context_,
+                 context_->allocate_output(index, TensorShape(), &output));
+  XlaExpression* expression = CastExpressionFromUninitializedTensor(output);
+  expression->set_variable(variable);
+}
+
+Status XlaOpKernelContext::GetVariableInput(int index, XlaVariable** variable) {
+  const XlaExpression* expression =
+      CastExpressionFromTensor(context_->input(index));
+  TF_RET_CHECK(expression->variable() != nullptr);
+  *variable = expression->variable();
+  return Status::OK();
+}
+
+Status XlaOpKernelContext::AssignVariable(
+    int index, DataType type, const xla::ComputationDataHandle& handle) {
+  TF_RET_CHECK(handle.handle() != 0);
+  SetOpHasSideEffects();
+
+  const XlaExpression* expression =
+      CastExpressionFromTensor(context_->input(index));
+  XlaVariable* variable = expression->variable();
+  TF_RET_CHECK(variable != nullptr);
+  if (!((variable->type == DT_INVALID && type != DT_INVALID) ||
+        (variable->type == type))) {
+    return errors::InvalidArgument(
+        "Types of variables cannot change after initialization: old type was ",
+        DataTypeString(variable->type), ", new type is ", DataTypeString(type));
+  }
+  variable->type = type;
+  variable->value = handle;
+  return Status::OK();
+}
+
 void XlaOpKernelContext::SetOpHasSideEffects() {
   XlaContext::Get(context_).AddSideEffects();
 }
 
+XlaCompiler* XlaOpKernelContext::compiler() const {
+  return XlaContext::Get(context_).compiler();
+}
+
 void XlaOpKernelContext::CtxFailure(Status s) { context_->CtxFailure(s); }
 
 void XlaOpKernelContext::CtxFailureWithWarning(Status s) {
   context_->CtxFailureWithWarning(s);
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.h b/tensorflow/compiler/tf2xla/xla_op_kernel.h
index 5fbc0cb6ac3..a25774c3a6a 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.h
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_TF2XLA_XLA_OP_KERNEL_H_
 #define TENSORFLOW_COMPILER_TF2XLA_XLA_OP_KERNEL_H_
 
+#include "tensorflow/compiler/tf2xla/xla_compiler.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/platform/macros.h"
@@ -45,9 +46,14 @@ class XlaOpKernel : public OpKernel {
 // XlaOpKernelContext is a variant of the standard OpKernel class, tailored for
 // implementing operators that perform symbolic execution as part of the XLA
 // compiler. The key difference is that XlaOpKernelContext produces and consumes
-// data as XLA computations, rather than as standard Tensors. (Under the hood,
-// symbolic execution communicates using special Tensors, but that is an
-// implementation detail that this class hides.)
+// data as XLA computations, rather than as standard Tensors.
+//
+// Under the hood, symbolic execution communicates using special Tensors that
+// wrap XlaExpression objects; however, this is an implementation detail that
+// this class hides. The *only* correct way to allocate a Tensor during
+// compilation is using the XlaOpKernelContext methods, since they ensure there
+// is a valid XlaExpression backing the tensor. No Op should ever call
+// allocate_output or allocate_temp directly on the underlying OpKernelContext.
 class XlaOpKernelContext {
  public:
   explicit XlaOpKernelContext(OpKernelContext* context);
@@ -98,9 +104,15 @@ class XlaOpKernelContext {
   Status ConstantInputReshaped(int index, gtl::ArraySlice<int64> new_shape,
                                xla::Literal* constant_literal);
 
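Pausing between the implementation above and the header declarations that follow: a sketch of the read-modify-write pattern the new variable methods enable. The op class `VariableAddOp` and its `"T"` attribute wiring are hypothetical; `ReadVariableInput`, `AssignVariable`, and `builder()` are from this change.

```c++
// Hypothetical kernel: var += delta, entirely symbolic.
class VariableAddOp : public XlaOpKernel {
 public:
  explicit VariableAddOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
    OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_));
  }

  void Compile(XlaOpKernelContext* ctx) override {
    // Input 0 is the DT_RESOURCE handle. ReadVariableInput fails if the
    // variable has never been assigned (its value handle is still 0).
    xla::ComputationDataHandle value;
    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &value));

    // Build value + delta in the XLA computation being constructed.
    xla::ComputationDataHandle new_value =
        ctx->builder()->Add(value, ctx->Input(1));

    // AssignVariable marks the op as side-effecting and rejects any attempt
    // to change the variable's dtype after initialization.
    OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, dtype_, new_value));
  }

 private:
  DataType dtype_;
};
```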
+  // Converts a constant scalar int32 or int64 tensor into an int64.
+  Status ConstantInputAsIntScalar(int index, int64* out);
+
   // Converts a constant 1D int32 or int64 tensor into a vector of int64s.
   Status ConstantInputAsIntVector(int index, std::vector<int64>* out);
 
+  // Converts a constant int32 or int64 Tensor into an xla int64 Literal.
+  Status ConstantInputAsInt64Literal(int index, xla::Literal* out);
+
   // Converts a constant 1D int32 or int64 tensor into a TensorShape.
   Status ConstantInputAsShape(int index, TensorShape* shape);
 
@@ -134,6 +146,32 @@ class XlaOpKernelContext {
   // Mark the op as having side effects (i.e., via Send).
   void SetOpHasSideEffects();
 
+  // Variables
+
+  // Sets `*type` and `*shape` to the current type and shape of a variable's
+  // value.
+  Status GetVariableTypeAndShape(int index, DataType* type,
+                                 TensorShape* shape) const;
+
+  // Reads the current value of the resource variable referred to by input
+  // 'index'.
+  Status ReadVariableInput(int index, xla::ComputationDataHandle* value);
+
+  // Assigns the value `handle` to the variable referenced by input
+  // `variable_index`. Marks the operator as having side effects.
+  Status AssignVariable(int variable_index, DataType type,
+                        const xla::ComputationDataHandle& handle);
+
+  // Sets '*variable' to the variable associated with input `index`.
+  Status GetVariableInput(int index, XlaVariable** variable);
+
+  // Sets output 'index' to be a reference to variable 'variable'. Used
+  // to propagate resource variables through the compilation.
+  void SetVariableOutput(int index, XlaVariable* variable);
+
+  // Returns a human-readable debug string describing 'variable_index'.
+  string VariableDebugString(int variable_index);
+
   // Helper routines for the OP_REQUIRES macros
   void CtxFailure(Status s);
   void CtxFailureWithWarning(Status s);
@@ -151,6 +189,10 @@ class XlaOpKernelContext {
   // Returns the underlying OpKernelContext. Use rarely.
   OpKernelContext* op_kernel_context() const { return context_; }
 
+  // Returns the XlaCompiler that is performing the compilation. Used for,
+  // e.g., While to compile nested computations.
+  XlaCompiler* compiler() const;
+
   // TODO(phawkins): find a better home for these helpers.
 
   // Get an XLA lambda to compute Max. This is cached in the
diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.cc b/tensorflow/compiler/tf2xla/xla_op_registry.cc
new file mode 100644
index 00000000000..1bb0d852899
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/xla_op_registry.cc
@@ -0,0 +1,311 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" + +#include +#include + +#include "tensorflow/compiler/tf2xla/type_util.h" +#include "tensorflow/compiler/tf2xla/xla_context.h" +#include "tensorflow/compiler/xla/client/client_library.h" +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/common_runtime/local_device.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/platform/mem.h" +#include "tensorflow/core/platform/stream_executor_no_cuda.h" + +namespace tensorflow { + +const char* const DEVICE_CPU_XLA_JIT = "XLA_CPU_JIT"; +const char* const DEVICE_GPU_XLA_JIT = "XLA_GPU_JIT"; +const char* const DEVICE_XLA_CPU = "XLA_CPU"; +const char* const DEVICE_XLA_GPU = "XLA_GPU"; + +// Is platform 'id' supported by XLA? +static bool IsPlatformSupported(perftools::gputools::Platform::Id id) { + auto platform = perftools::gputools::MultiPlatformManager::PlatformWithId(id); + if (!platform.ok()) return false; + return xla::ClientLibrary::GetOrCreateLocalClient(platform.ValueOrDie()).ok(); +} + +XlaOpRegistry::XlaOpRegistry() = default; +XlaOpRegistry::~XlaOpRegistry() = default; + +/* static */ void XlaOpRegistry::RegisterCompilationDevice( + const string& device_name, const DeviceRegistration& registration) { + XlaOpRegistry& registry = Instance(); + mutex_lock lock(registry.mutex_); + auto result = + registry.compilation_devices_.emplace(device_name, registration); + CHECK(result.second || result.first->second.compilation_device_name == + registration.compilation_device_name); +} + +/* static */ void XlaOpRegistry::RegisterBackend( + const string& compilation_device_name, + gtl::ArraySlice supported_types, BackendOpFilter op_filter) { + XlaOpRegistry& registry = Instance(); + mutex_lock lock(registry.mutex_); + auto result = registry.backends_.emplace(compilation_device_name, Backend()); + CHECK(result.second) << "Duplicate XLA backend registration " + << compilation_device_name; + result.first->second.supported_types.insert(supported_types.begin(), + supported_types.end()); + result.first->second.op_filter = op_filter; +} + +/* static */ bool XlaOpRegistry::GetCompilationDevice( + const string& device_name, const DeviceRegistration** registration) { + XlaOpRegistry& registry = Instance(); + + // Lazily register the CPU and GPU JIT devices the first time + // GetCompilationDevice is called. 
+  static void* registration_init = [&registry]() {
+    mutex_lock lock(registry.mutex_);
+    if (IsPlatformSupported(perftools::gputools::host::kHostPlatformId)) {
+      DeviceRegistration& registration =
+          registry.compilation_devices_[DEVICE_CPU];
+      registration.compilation_device_name = DEVICE_CPU_XLA_JIT;
+      registration.requires_compilation = false;
+      registration.enable_jit_by_default = false;
+      registration.compile_resource_ops = false;
+    }
+    if (IsPlatformSupported(perftools::gputools::cuda::kCudaPlatformId)) {
+      DeviceRegistration& registration =
+          registry.compilation_devices_[DEVICE_GPU];
+      registration.compilation_device_name = DEVICE_GPU_XLA_JIT;
+      registration.requires_compilation = false;
+      registration.enable_jit_by_default = true;
+      registration.compile_resource_ops = false;
+    }
+    return nullptr;
+  }();
+  (void)registration_init;
+
+  mutex_lock lock(registry.mutex_);
+  auto it = registry.compilation_devices_.find(device_name);
+  if (it == registry.compilation_devices_.end()) return false;
+  *registration = &it->second;
+  return true;
+}
+
+void XlaOpRegistry::RegisterCompilationKernels() {
+  XlaOpRegistry& registry = Instance();
+  mutex_lock lock(registry.mutex_);
+
+  if (registry.jit_kernels_registered_) return;
+  registry.jit_kernels_registered_ = true;
+
+  OpRegistryInterface* op_registry = OpRegistry::Global();
+  for (const auto& op : registry.ops_) {
+    const OpDef* op_def;
+    TF_CHECK_OK(op_registry->LookUpOpDef(op.first, &op_def));
+
+    std::unordered_set<string> type_attrs;
+    for (const OpDef::AttrDef& attr_def : op_def->attr()) {
+      if (attr_def.type() == "type" || attr_def.type() == "list(type)") {
+        type_attrs.insert(attr_def.name());
+      }
+    }
+
+    // Checks there are no type constraints referring to unknown attributes.
+    for (const auto& constraint : op.second->type_constraints) {
+      if (type_attrs.find(constraint.first) == type_attrs.end()) {
+        LOG(FATAL) << "Unknown type attribute " << constraint.first
+                   << " in XLA op registration for " << op.first;
+      }
+    }
+
+    for (auto& backend : registry.backends_) {
+      // If the operator has a device whitelist, only register on whitelisted
+      // devices.
+      if (op.second->has_device_whitelist &&
+          op.second->device_whitelist.find(backend.first) ==
+              op.second->device_whitelist.end()) {
+        continue;
+      }
+
+      std::unique_ptr<KernelDef> kdef(new KernelDef);
+      kdef->set_op(op.second->name);
+      kdef->set_device_type(backend.first);
+
+      // Constrain each type attribute to the intersection of:
+      // a) the types supported by the backend, and
+      // b) the attribute's type constraints.
+      // TODO(phawkins): it may be necessary to also take the intersection with
+      // the set of types supported by the OpDef.
+ for (const string& type_attr : type_attrs) { + KernelDef::AttrConstraint* attr_constraint = kdef->add_constraint(); + attr_constraint->set_name(type_attr); + auto* allowed_values = + attr_constraint->mutable_allowed_values()->mutable_list(); + + auto it = op.second->type_constraints.find(type_attr); + for (DataType dtype : backend.second.supported_types) { + if (it == op.second->type_constraints.end() || + (it != op.second->type_constraints.end() && + it->second.find(dtype) != it->second.end())) { + allowed_values->add_type(dtype); + } + } + if (op.second->allow_resource_types) { + allowed_values->add_type(DT_RESOURCE); + } + } + if (backend.second.op_filter != nullptr && + !backend.second.op_filter(kdef.get())) { + continue; + } + VLOG(2) << "XLA op registration: device: " << backend.first + << " op: " << op.first; + registry.kernel_registrars_.emplace_back( + new kernel_factory::OpKernelRegistrar( + new KernelDef(*kdef), "XlaJitOp", op.second->factory)); + backend.second.kernel_defs.push_back(std::move(kdef)); + } + } +} + +std::vector XlaOpRegistry::DeviceKernels( + const string& compilation_device_name) { + std::vector kernels; + XlaOpRegistry& registry = Instance(); + mutex_lock lock(registry.mutex_); + auto it = registry.backends_.find(compilation_device_name); + CHECK(it != registry.backends_.end()) + << "Unknown backend " << compilation_device_name; + for (const std::unique_ptr& k : it->second.kernel_defs) { + if (!registry.ops_.at(k->op())->compilation_only) { + kernels.push_back(k.get()); + } + } + return kernels; +} + +XlaOpRegistry& XlaOpRegistry::Instance() { + static XlaOpRegistry* r = new XlaOpRegistry; + return *r; +} + +XlaOpRegistrationBuilder::XlaOpRegistrationBuilder(StringPiece name) { + registration_.reset(new XlaOpRegistry::OpRegistration); + registration_->name = name.ToString(); +} + +XlaOpRegistrationBuilder XlaOpRegistrationBuilder::Name(StringPiece name) { + XlaOpRegistrationBuilder registration(name); + return registration; +} + +XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::Device( + gtl::ArraySlice devices) { + registration_->has_device_whitelist = true; + for (StringPiece device : devices) { + registration_->device_whitelist.insert(device.ToString()); + } + return *this; +} + +XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::Device(StringPiece device) { + registration_->has_device_whitelist = true; + registration_->device_whitelist.insert(device.ToString()); + return *this; +} + +XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::CompilationOnly() { + registration_->compilation_only = true; + return *this; +} + +XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::AllowResourceTypes() { + registration_->allow_resource_types = true; + return *this; +} + +XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::TypeConstraint( + StringPiece attr_name, DataType allowed) { + std::set& types = + registration_->type_constraints[attr_name.ToString()]; + types.insert(allowed); + return *this; +} + +XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::TypeConstraint( + StringPiece attr_name, gtl::ArraySlice allowed) { + std::set& types = + registration_->type_constraints[attr_name.ToString()]; + for (DataType t : allowed) { + types.insert(t); + } + return *this; +} + +std::unique_ptr XlaOpRegistrationBuilder::Build( + XlaOpRegistry::Factory factory) { + registration_->factory = factory; + return std::move(registration_); +} + +XlaOpRegistrar::XlaOpRegistrar( + std::unique_ptr registration) { + XlaOpRegistry& registry = XlaOpRegistry::Instance(); + mutex_lock 
lock(registry.mutex_);
+  auto result = registry.ops_.emplace(registration->name, nullptr);
+  if (!result.second) {
+    LOG(FATAL) << "Duplicate XLA op registration " << registration->name;
+  }
+  result.first->second = std::move(registration);
+}
+
+XlaBackendRegistrar::XlaBackendRegistrar(
+    StringPiece name, gtl::ArraySlice<DataType> types,
+    XlaOpRegistry::BackendOpFilter op_filter) {
+  XlaOpRegistry& registry = XlaOpRegistry::Instance();
+  registry.RegisterBackend(name.ToString(), types, op_filter);
+}
+
+bool CpuOpFilter(KernelDef* kdef) {
+  // TODO(b/34339814): implement inverse erf for double types and remove this
+  // workaround.
+  if (kdef->op() == "RandomStandardNormal") {
+    kdef->clear_constraint();
+    // Change the type constraint to permit only DT_FLOAT.
+    KernelDef::AttrConstraint* attr_constraint = kdef->add_constraint();
+    attr_constraint->set_name("dtype");
+    attr_constraint->mutable_allowed_values()->mutable_list()->add_type(
+        DT_FLOAT);
+    return true;
+  }
+  return true;
+}
+
+REGISTER_XLA_BACKEND(DEVICE_CPU_XLA_JIT, kCpuAllTypes, CpuOpFilter);
+
+bool GpuOpFilter(KernelDef* kdef) {
+  // TODO(b/31361304): The GPU backend does not parallelize PRNG ops, leading to
+  // slow code.
+  // TODO(b/34969189) The implementation of TruncatedNormal generates illegal
+  // code on GPU.
+  if (kdef->op() == "RandomStandardNormal" || kdef->op() == "RandomUniform" ||
+      kdef->op() == "RandomUniformInt" || kdef->op() == "TruncatedNormal") {
+    return false;
+  }
+  return true;
+}
+
+REGISTER_XLA_BACKEND(DEVICE_GPU_XLA_JIT, kGpuAllTypes, GpuOpFilter);
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.h b/tensorflow/compiler/tf2xla/xla_op_registry.h
new file mode 100644
index 00000000000..9a39cc96754
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/xla_op_registry.h
@@ -0,0 +1,270 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2XLA_XLA_OP_REGISTRY_H_
+#define TENSORFLOW_COMPILER_TF2XLA_XLA_OP_REGISTRY_H_
+
+#include <memory>
+#include <set>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/local_device.h"
+#include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/mem.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/public/session_options.h"
+
+namespace tensorflow {
+
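For orientation before the declarations: this header gives op authors a small registration surface. A hedged usage sketch follows, built from the macros and builder methods declared later in this file; the op name "MyOp" and the kernel class MyOpKernel are placeholders, not part of this change.

```c++
// Register a hypothetical kernel against the CPU JIT backend only, with a
// type constraint drawn from the kFloatTypes array defined below.
REGISTER_XLA_OP(Name("MyOp")
                    .Device(DEVICE_CPU_XLA_JIT)
                    .TypeConstraint("T", kFloatTypes),
                MyOpKernel);
```

A backend, by contrast, registers itself once with its supported types and an optional per-op filter, exactly as the CpuOpFilter/GpuOpFilter registrations in the .cc above do.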
+// Names of the XLA compilation devices. These are not user-visible, and are
+// used internally by the TensorFlow/XLA bridge to perform symbolic execution
+// of a TensorFlow graph.
+
+extern const char* const DEVICE_CPU_XLA_JIT;  // "XLA_CPU_JIT"
+extern const char* const DEVICE_GPU_XLA_JIT;  // "XLA_GPU_JIT"
+
+extern const char* const DEVICE_XLA_CPU;
+extern const char* const DEVICE_XLA_GPU;
+
+constexpr std::array<DataType, 2> kIntTypes = {{DT_INT32, DT_INT64}};
+constexpr std::array<DataType, 2> kFloatTypes = {{DT_FLOAT, DT_DOUBLE}};
+constexpr std::array<DataType, 4> kNumericTypes = {
+    {DT_INT32, DT_INT64, DT_FLOAT, DT_DOUBLE}};
+
+constexpr std::array<DataType, 5> kCpuAllTypes = {
+    {DT_INT32, DT_INT64, DT_FLOAT, DT_DOUBLE, DT_BOOL}};
+
+constexpr std::array<DataType, 5> kGpuAllTypes = {
+    {DT_INT32, DT_INT64, DT_FLOAT, DT_DOUBLE, DT_BOOL}};
+
+// Class that manages registrations of operators and devices for the XLA JIT.
+// Not thread-safe.
+class XlaOpRegistry {
+ public:
+  typedef OpKernel* (*Factory)(OpKernelConstruction*);
+
+  // Describes how to compile operators assigned to a device.
+  struct DeviceRegistration {
+    // The name of the XLA compilation device to use to compile code.
+    string compilation_device_name;
+
+    // Do operators assigned to this device require compilation?
+    bool requires_compilation;
+
+    // If !requires_compilation, should we try to JIT operators on this device
+    // when XLA JIT compilation is enabled globally via the SessionOptions?
+    // (It is still possible to explicitly mark operators to JIT compile, even
+    // if enable_jit_by_default is false.)
+    bool enable_jit_by_default;
+
+    // Enable compilation of operators that use DT_RESOURCE types?
+    bool compile_resource_ops = false;
+  };
+
+  // Registers an XLA backend. `compilation_device_name` is the name of the
+  // device used for symbolic execution during compilation. `supported_types`
+  // is the list of non-resource types supported by the device. Each operator
+  // will be registered for the intersection of the operator's supported types
+  // and the device's supported types. `op_filter` is a function used to
+  // exclude or modify operator registrations on the device; it may be
+  // nullptr, in which case all ops are included.
+  // `op_filter` should return true if the op should be registered on
+  // the device; it may optionally modify the KernelDef.
+  typedef bool (*BackendOpFilter)(KernelDef* kdef);
+  static void RegisterBackend(const string& compilation_device_name,
+                              gtl::ArraySlice<DataType> supported_types,
+                              BackendOpFilter op_filter);
+
+  // Registers `device_name` for XLA compilation, using information from
+  // `registration`.
+  static void RegisterCompilationDevice(const string& device_name,
+                                        const DeviceRegistration& registration);
+
+  // Returns via `*registration` the registration information for
+  // `device_name`, if it has been registered for XLA compilation. Returns
+  // false and leaves `*registration` unchanged if no matching device is
+  // registered.
+  // `registration->enable_jit_by_default` is true if we should try to JIT
+  // operators on this device when the JIT is enabled via the Session
+  // OptimizerOptions.
+  static bool GetCompilationDevice(const string& device_name,
+                                   const DeviceRegistration** registration);
+
+  // Registers all JIT kernels on JIT devices, if not already registered.
+  // Does nothing otherwise.
+  static void RegisterCompilationKernels();
+
+  // Returns KernelDefs for compilation ops registered on
+  // 'compilation_device_name'.
+  // Does not include kernels registered as CompilationOnly.
+ static std::vector DeviceKernels( + const string& compilation_device_name); + + private: + friend class XlaBackendRegistrar; + friend class XlaOpRegistrar; + friend class XlaOpRegistrationBuilder; + + static XlaOpRegistry& Instance(); + + XlaOpRegistry(); + ~XlaOpRegistry(); + + mutex mutex_; + + // Describes an XLA backend. + struct Backend { + // Which types are supported by this device? + std::set supported_types; + + // The per-backend operator filter function. See the comment on + // RegisterBackend() for details. + BackendOpFilter op_filter; + + // KernelDefs built by RegisterCompilationKernels() for each op supported + // by the device. + std::vector> kernel_defs; + }; + + // Map from compilation device names to a description of the backend. + std::unordered_map backends_ GUARDED_BY(mutex_); + + // Map from Tensorflow device names to the corresponding JIT device metadata. + std::unordered_map compilation_devices_ + GUARDED_BY(mutex_); + + // A description of a Tensorflow operator that can be compiled to XLA. + struct OpRegistration { + string name; + + // Should this operator be registered only on compilation devices, without a + // dummy kernel registered on the corresponding XLA device? + bool compilation_only = false; + + // Should we allow resource types for type attributes? Used by _Arg to + // allow DT_RESOURCE. + bool allow_resource_types = false; + + // Mapping from attribute name to a list of supported types. + std::unordered_map> type_constraints; + + // An optional whitelist of devices. If there is no whitelist, all devices + // are permitted. + bool has_device_whitelist = false; + std::unordered_set device_whitelist; + + // Factory used to build OpKernels that perform symbolic execution. + Factory factory; + }; + + // Map from operator name to OpRegistrations, populated by REGISTER_XLA_OP. + std::unordered_map> ops_ + GUARDED_BY(mutex_); + + // Have we already registered the JIT kernels on the JIT devices? + bool jit_kernels_registered_ = false; + + // Holds ownership of OpKernelRegistrars that represent the Tensorflow kernel + // registrations created by RegisterCompilationKernels() and + // RegisterDeviceKernels(). + std::vector> + kernel_registrars_ GUARDED_BY(mutex_); +}; + +// REGISTER_XLA_OP() registers an XLA OpKernel by name, for example: +// REGISTER_XLA_OP(Name("Add"), AddOp); +// where 'AddOp' is the name of a JIT OpKernel class that implements "Add". +// +// We don't use a variadic macro here because we don't expect JIT operators to +// be templated. + +#define REGISTER_XLA_OP(NAME, OP) \ + REGISTER_XLA_OP_UNIQ_HELPER(__COUNTER__, NAME, OP) + +class XlaOpRegistrationBuilder { + public: + // Starts an operator registration chain. + static XlaOpRegistrationBuilder Name(StringPiece name); + + // Specifies a whitelist of devices on which the operator may run. + XlaOpRegistrationBuilder& Device(StringPiece devices); + XlaOpRegistrationBuilder& Device(gtl::ArraySlice devices); + + // Specifies a type constraint for a type variable attribute. Each constraint + // specifies the set of types that the type variable may assume. + XlaOpRegistrationBuilder& TypeConstraint(StringPiece attr_name, + DataType allowed); + + XlaOpRegistrationBuilder& TypeConstraint(StringPiece attr_name, + gtl::ArraySlice allowed); + + // Specifies that a dummy copy of this operator should not be registered on + // XLA_* devices, but may be used during compilation. + XlaOpRegistrationBuilder& CompilationOnly(); + + // Allow DT_RESOURCE types for type parameters. 
+ XlaOpRegistrationBuilder& AllowResourceTypes(); + + std::unique_ptr Build( + XlaOpRegistry::Factory factory); + + private: + XlaOpRegistrationBuilder(StringPiece name); + + std::unique_ptr registration_; +}; + +// REGISTER_XLA_BACKEND() registers an XLA backend. Example usage: +// REGISTER_XLA_BACKEND(DEVICE_GPU_XLA_JIT, kGpuAllTypes, GpuOpFilter); +#define REGISTER_XLA_BACKEND(NAME, ...) \ + REGISTER_XLA_BACKEND_UNIQ_HELPER(__COUNTER__, NAME, __VA_ARGS__) + +// Implementation details. + +class XlaOpRegistrar { + public: + XlaOpRegistrar(std::unique_ptr registration); +}; + +#define REGISTER_XLA_OP_UNIQ_HELPER(COUNTER, BUILDER, OP) \ + REGISTER_XLA_OP_UNIQ(COUNTER, BUILDER, OP) + +#define REGISTER_XLA_OP_UNIQ(CTR, BUILDER, OP) \ + static ::tensorflow::XlaOpRegistrar xla_op_registrar__body__##CTR##__object( \ + XlaOpRegistrationBuilder::BUILDER.Build( \ + [](::tensorflow::OpKernelConstruction* context) \ + -> ::tensorflow::OpKernel* { return new OP(context); })); + +class XlaBackendRegistrar { + public: + XlaBackendRegistrar(StringPiece name, gtl::ArraySlice types, + XlaOpRegistry::BackendOpFilter op_filter = nullptr); +}; + +#define REGISTER_XLA_BACKEND_UNIQ_HELPER(COUNTER, NAME, ...) \ + REGISTER_XLA_BACKEND_UNIQ(COUNTER, NAME, __VA_ARGS__) + +#define REGISTER_XLA_BACKEND_UNIQ(CTR, NAME, ...) \ + static ::tensorflow::XlaBackendRegistrar \ + xla_backend_registrar__body__##CTR##__object(NAME, __VA_ARGS__); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_XLA_OP_REGISTRY_H_ diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD index 0f2a46c11d3..2491cc3f7a2 100644 --- a/tensorflow/compiler/xla/BUILD +++ b/tensorflow/compiler/xla/BUILD @@ -6,6 +6,8 @@ package_group( name = "friends", packages = [ "//tensorflow/compiler/...", + "//tensorflow/contrib/tpu/...", + "//tensorflow/contrib/xla_tf_graph/...", ], ) @@ -16,6 +18,7 @@ package_group( ], ) +load("//tensorflow:tensorflow.bzl", "cc_header_only_library") load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library") # Filegroup used to collect source files for dependency checking. @@ -43,11 +46,43 @@ xla_proto_library( ], ) +# This is a headers target that extra XLA devices can use to prevent +# circular dependencies. Devices that are compiled as separate shared +# objects can also use it to prevent linking of library code. 
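Before the BUILD changes continue: to make the macro plumbing above concrete, here is roughly what one registration expands to. The names "MyOp" and MyOpKernel are illustrative, and `__COUNTER__` is shown fixed at 0.

```c++
// REGISTER_XLA_OP(Name("MyOp"), MyOpKernel); expands to approximately:
static ::tensorflow::XlaOpRegistrar xla_op_registrar__body__0__object(
    XlaOpRegistrationBuilder::Name("MyOp").Build(
        [](::tensorflow::OpKernelConstruction* context)
            -> ::tensorflow::OpKernel* { return new MyOpKernel(context); }));
```

The captureless lambda decays to the `XlaOpRegistry::Factory` function pointer, and the static registrar object runs at program start, inserting the op into the registry's `ops_` map.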
+cc_header_only_library( + name = "xla_headers_lib", + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla:xla_proto", + "//tensorflow/compiler/xla/client:client_library", + "//tensorflow/compiler/xla/legacy_flags:layout_util_flags", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_evaluator", + "//tensorflow/core:framework_headers_lib", + "//tensorflow/core:stream_executor_headers_lib", + ], +) + +cc_library( + name = "test", + testonly = 1, + hdrs = ["test.h"], + visibility = [":friends"], + deps = [ + "//tensorflow/core:lib_internal", + "//tensorflow/core:test", + ], +) + cc_library( name = "types", hdrs = ["types.h"], visibility = [":friends"], - deps = ["//tensorflow/core:lib"], + deps = [ + "//tensorflow/core:lib", + "//third_party/eigen3", + ], ) cc_library( @@ -80,9 +115,9 @@ cc_test( deps = [ ":status_macros", ":statusor", + ":test", ":test_helpers", "//tensorflow/core:lib", - "//tensorflow/core:test", "//tensorflow/core:test_main", ], ) @@ -115,6 +150,7 @@ cc_test( srcs = ["statusor_test.cc"], deps = [ ":statusor", + ":test", ":types", "//tensorflow/core:lib", "//tensorflow/core:test", @@ -148,18 +184,22 @@ cc_library( ], visibility = ["//visibility:public"], deps = [ + ":status_macros", + ":statusor", ":types", + ":util", "//tensorflow/core:lib", ], ) cc_test( name = "util_test", + size = "small", srcs = ["util_test.cc"], deps = [ + ":test", ":types", ":util", - "//tensorflow/core:test", "//tensorflow/core:test_main", ], ) @@ -195,37 +235,40 @@ cc_library( cc_test( name = "shape_util_test", + size = "small", srcs = ["shape_util_test.cc"], deps = [ ":shape_util", + ":test", ":test_helpers", ":types", ":util", - "//tensorflow/core:test", + ":xla_data_proto", "//tensorflow/core:test_main", ], ) cc_test( name = "layout_util_test", + size = "small", srcs = ["layout_util_test.cc"], deps = [ ":shape_util", + ":test", ":test_helpers", "//tensorflow/compiler/xla/legacy_flags:layout_util_flags", - "//tensorflow/core:test", "//tensorflow/core:test_main", ], ) cc_test( name = "index_util_test", + size = "small", srcs = ["index_util_test.cc"], deps = [ ":shape_util", - ":test_helpers", + ":test", ":xla_data_proto", - "//tensorflow/core:test", "//tensorflow/core:test_main", ], ) @@ -240,6 +283,7 @@ cc_library( ":array3d", ":array4d", ":shape_util", + ":status_macros", ":types", ":util", ":xla_data_proto", @@ -249,13 +293,14 @@ cc_library( cc_test( name = "literal_util_test", + size = "small", srcs = ["literal_util_test.cc"], deps = [ ":array3d", ":array4d", ":literal_util", ":shape_util", - ":test_helpers", + ":test", ":types", "//tensorflow/core:lib", "//tensorflow/core:test", @@ -270,7 +315,6 @@ cc_library( visibility = ["//visibility:public"], deps = [ ":util", - ":xla_data_proto", "//tensorflow/core:lib", ], ) @@ -300,10 +344,11 @@ cc_library( cc_test( name = "array2d_test", + size = "small", srcs = ["array2d_test.cc"], deps = [ ":array2d", - "//tensorflow/core:test", + ":test", "//tensorflow/core:test_main", ], ) @@ -320,11 +365,12 @@ cc_library( cc_test( name = "array3d_test", + size = "small", srcs = ["array3d_test.cc"], deps = [ ":array3d", + ":test", ":types", - "//tensorflow/core:test", "//tensorflow/core:test_main", ], ) @@ -342,11 +388,12 @@ cc_library( cc_test( name = "array4d_test", + size = "small", srcs = ["array4d_test.cc"], deps = [ ":array4d", + ":test", "//tensorflow/core:lib", - "//tensorflow/core:test", "//tensorflow/core:test_main", ], ) @@ -358,25 +405,6 @@ 
cc_library( visibility = ["//visibility:public"], ) -cc_library( - name = "differential_set", - hdrs = ["differential_set.h"], - visibility = [":internal"], - deps = [ - "//tensorflow/core:lib", - ], -) - -cc_test( - name = "differential_set_test", - srcs = ["differential_set_test.cc"], - deps = [ - ":differential_set", - "//tensorflow/core:test", - "//tensorflow/core:test_main", - ], -) - cc_library( name = "packed_literal_reader", srcs = ["packed_literal_reader.cc"], @@ -397,7 +425,6 @@ cc_library( cc_library( name = "test_helpers", testonly = 1, - srcs = ["test_helpers.cc"], hdrs = ["test_helpers.h"], visibility = [":internal"], deps = [ @@ -429,15 +456,16 @@ cc_library( cc_test( name = "text_literal_reader_test", + size = "small", srcs = ["text_literal_reader_test.cc"], deps = [ ":literal_util", ":shape_util", + ":test", ":text_literal_reader", ":types", ":xla_data_proto", "//tensorflow/core:lib", - "//tensorflow/core:test", "//tensorflow/core:test_main", ], ) @@ -459,14 +487,15 @@ cc_library( cc_test( name = "text_literal_writer_test", + size = "small", srcs = ["text_literal_writer_test.cc"], deps = [ ":literal_util", + ":test", ":test_helpers", ":text_literal_writer", ":types", "//tensorflow/core:lib", - "//tensorflow/core:test", "//tensorflow/core:test_main", ], ) @@ -486,12 +515,13 @@ cc_library( cc_test( name = "shape_tree_test", + size = "small", srcs = ["shape_tree_test.cc"], deps = [ ":shape_tree", ":shape_util", + ":test", ":xla_data_proto", - "//tensorflow/core:test", "//tensorflow/core:test_main", ], ) @@ -543,17 +573,18 @@ cc_library( cc_test( name = "reference_util_test", + size = "small", srcs = ["reference_util_test.cc"], deps = [ ":array2d", ":array4d", ":literal_util", ":reference_util", + ":test", ":util", ":xla_data_proto", "//tensorflow/compiler/xla/client:padding", "//tensorflow/compiler/xla/tests:literal_test_util", - "//tensorflow/core:test", "//tensorflow/core:test_main", ], ) diff --git a/tensorflow/compiler/xla/array2d.h b/tensorflow/compiler/xla/array2d.h index ceed573f1f0..593084a0c11 100644 --- a/tensorflow/compiler/xla/array2d.h +++ b/tensorflow/compiler/xla/array2d.h @@ -44,12 +44,14 @@ class Array2D { Array2D() : n1_(0), n2_(0) {} // Creates an array of dimensions n1 x n2, uninitialized values. - Array2D(const int64 n1, const int64 n2) : n1_(n1), n2_(n2) { - values_.resize(n1 * n2); + Array2D(const int64 n1, const int64 n2) + : n1_(n1), n2_(n2), values_(new T[n1 * n2]()) { + Fill(T()); } // Creates an array of dimensions n1 x n2, initialized to value. 
- Array2D(const int64 n1, const int64 n2, const T value) : Array2D(n1, n2) { + Array2D(const int64 n1, const int64 n2, const T value) + : n1_(n1), n2_(n2), values_(new T[n1 * n2]()) { Fill(value); } @@ -67,16 +69,30 @@ class Array2D { } } - T& operator()(const int64 n1, const int64 n2) { - CHECK_LT(n1, n1_); - CHECK_LT(n2, n2_); - return values_[n1 * n2_ + n2]; + Array2D(const Array2D& other) : Array2D(other.n1(), other.n2()) { + std::copy(&other.values_[0], &other.values_[0] + num_elements(), + &values_[0]); } - const T& operator()(const int64 n1, const int64 n2) const { - CHECK_LT(n1, n1_); - CHECK_LT(n2, n2_); - return values_[n1 * n2_ + n2]; + Array2D& operator=(const Array2D& other) { + n1_ = other.n1(); + n2_ = other.n2(); + values_.reset(new T[num_elements()]); + std::copy(&other.values_[0], &other.values_[0] + num_elements(), + &values_[0]); + return *this; + } + + T& operator()(const int64 i1, const int64 i2) { + CHECK_LT(i1, n1_); + CHECK_LT(i2, n2_); + return values_[i1 * n2_ + i2]; + } + + const T& operator()(const int64 i1, const int64 i2) const { + CHECK_LT(i1, n1_); + CHECK_LT(i2, n2_); + return values_[i1 * n2_ + i2]; } // Access to the array's dimensions. height() and width() provide the @@ -86,15 +102,15 @@ class Array2D { int64 n2() const { return n2_; } int64 height() const { return n1_; } int64 width() const { return n2_; } - int64 num_elements() const { return values_.size(); } + int64 num_elements() const { return n1_ * n2_; } // Low-level accessor for stuff like memcmp, handle with care. Returns pointer // to the underlying storage of the array (similarly to std::vector::data()). - T* data() const { return const_cast(this)->values_.data(); } + T* data() const { return const_cast(this)->values_.get(); } // Fills the array with the given value. void Fill(const T& value) { - std::fill(values_.begin(), values_.end(), value); + std::fill(&values_[0], &values_[0] + num_elements(), value); } // Applies f to all cells in this array, in row-major order. @@ -126,8 +142,8 @@ class Array2D { std::mt19937 g(seed); std::normal_distribution distribution(mean, static_cast(value)); - for (auto& v : values_) { - v = static_cast(distribution(g)); + for (int64 i = 0; i < num_elements(); ++i) { + values_[i] = static_cast(distribution(g)); } } @@ -152,7 +168,7 @@ class Array2D { private: int64 n1_; int64 n2_; - std::vector values_; + std::unique_ptr values_; }; // Returns a linspace-populated Array2D in the range [from, to] (inclusive) diff --git a/tensorflow/compiler/xla/array2d_test.cc b/tensorflow/compiler/xla/array2d_test.cc index ac107b1c0d4..795d50ca5b5 100644 --- a/tensorflow/compiler/xla/array2d_test.cc +++ b/tensorflow/compiler/xla/array2d_test.cc @@ -17,7 +17,7 @@ limitations under the License. 
#include -#include "tensorflow/core/platform/test.h" +#include "tensorflow/compiler/xla/test.h" namespace xla { namespace { @@ -84,6 +84,17 @@ TEST(Array2dTest, IndexingReadWrite) { EXPECT_EQ(arr(1, 2), 61); } +TEST(Array2dTest, IndexingReadWriteBool) { + Array2D arr = {{false, true, false}, {true, true, false}}; + + EXPECT_EQ(arr(1, 1), true); + EXPECT_EQ(arr(1, 2), false); + arr(1, 1) = false; + arr(1, 2) = true; + EXPECT_EQ(arr(1, 1), false); + EXPECT_EQ(arr(1, 2), true); +} + TEST(Array2dTest, Fill) { Array2D fullof7(2, 3, 7); for (int64 n1 = 0; n1 < fullof7.n1(); ++n1) { diff --git a/tensorflow/compiler/xla/array3d.h b/tensorflow/compiler/xla/array3d.h index 46bc1a63921..124ccd1975b 100644 --- a/tensorflow/compiler/xla/array3d.h +++ b/tensorflow/compiler/xla/array3d.h @@ -20,9 +20,9 @@ limitations under the License. #include #include #include +#include #include #include -#include #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/platform/logging.h" @@ -39,13 +39,13 @@ class Array3D { public: // Creates an array of dimensions n1 x n2 x n3, uninitialized values. Array3D(const int64 n1, const int64 n2, const int64 n3) - : n1_(n1), n2_(n2), n3_(n3) { - values_.resize(n1 * n2 * n3); + : n1_(n1), n2_(n2), n3_(n3), values_(new T[n1 * n2 * n3]) { + Fill(T()); } // Creates an array of dimensions n1 x n2 x n3, initialized to value. Array3D(const int64 n1, const int64 n2, const int64 n3, const T value) - : Array3D(n1, n2, n3) { + : n1_(n1), n2_(n2), n3_(n3), values_(new T[n1 * n2 * n3]) { Fill(value); } @@ -73,34 +73,50 @@ class Array3D { } } - T& operator()(const int64 n1, const int64 n2, const int64 n3) { - CHECK_LT(n1, n1_); - CHECK_LT(n2, n2_); - CHECK_LT(n3, n3_); - return values_[n1 * n2_ * n3_ + n2 * n3_ + n3]; + Array3D(const Array3D& other) + : Array3D(other.n1(), other.n2(), other.n3()) { + std::copy(&other.values_[0], &other.values_[0] + num_elements(), + &values_[0]); } - const T& operator()(const int64 n1, const int64 n2, const int64 n3) const { - CHECK_LT(n1, n1_); - CHECK_LT(n2, n2_); - CHECK_LT(n3, n3_); - return values_[n1 * n2_ * n3_ + n2 * n3_ + n3]; + Array3D& operator=(const Array3D& other) { + n1_ = other.n1(); + n2_ = other.n2(); + n3_ = other.n3(); + values_.reset(new T[num_elements()]); + std::copy(&other.values_[0], &other.values_[0] + num_elements(), + &values_[0]); + return *this; + } + + T& operator()(const int64 i1, const int64 i2, const int64 i3) { + CHECK_LT(i1, n1_); + CHECK_LT(i2, n2_); + CHECK_LT(i3, n3_); + return values_[i1 * n2_ * n3_ + i2 * n3_ + i3]; + } + + const T& operator()(const int64 i1, const int64 i2, const int64 i3) const { + CHECK_LT(i1, n1_); + CHECK_LT(i2, n2_); + CHECK_LT(i3, n3_); + return values_[i1 * n2_ * n3_ + i2 * n3_ + i3]; } // Access to the array's dimensions. int64 n1() const { return n1_; } int64 n2() const { return n2_; } int64 n3() const { return n3_; } - int64 num_elements() const { return values_.size(); } + int64 num_elements() const { return n1_ * n2_ * n3_; } // Fills the array with the given value. void Fill(const T& value) { - std::fill(values_.begin(), values_.end(), value); + std::fill(&values_[0], &values_[0] + num_elements(), value); } // Fills the array with sequentially increasing values. 
void FillIota(const T& value) { - std::iota(values_.begin(), values_.end(), value); + std::iota(&values_[0], &values_[0] + num_elements(), value); } // Fills the array with random normal values with a mean of 0 and standard @@ -110,8 +126,8 @@ class Array3D { std::mt19937 g(seed); std::normal_distribution distribution(mean, static_cast(value)); - for (auto& v : values_) { - v = static_cast(distribution(g)); + for (int64 i = 0; i < num_elements(); ++i) { + values_[i] = static_cast(distribution(g)); } } @@ -119,7 +135,7 @@ class Array3D { int64 n1_; int64 n2_; int64 n3_; - std::vector values_; + std::unique_ptr values_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/array3d_test.cc b/tensorflow/compiler/xla/array3d_test.cc index fa4435dfc48..6b5f4b343b2 100644 --- a/tensorflow/compiler/xla/array3d_test.cc +++ b/tensorflow/compiler/xla/array3d_test.cc @@ -17,8 +17,8 @@ limitations under the License. #include +#include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/platform/test.h" namespace xla { namespace { diff --git a/tensorflow/compiler/xla/array4d.h b/tensorflow/compiler/xla/array4d.h index db51a57cf26..d93f968f4d7 100644 --- a/tensorflow/compiler/xla/array4d.h +++ b/tensorflow/compiler/xla/array4d.h @@ -20,6 +20,7 @@ limitations under the License. #include #include #include +#include #include #include #include @@ -54,13 +55,17 @@ namespace xla { template class Array4D { public: - // Creates a 4D array, unitialized values. + // Creates a 4D array, uninitialized values. Array4D(int64 planes, int64 depth, int64 height, int64 width) - : planes_(planes), depth_(depth), height_(height), width_(width) { - values_.resize(planes * depth * height * width); + : planes_(planes), + depth_(depth), + height_(height), + width_(width), + values_(new T[planes * depth * height * width]) { + Fill(T()); } - // Creates a 4D array, initalized to value. + // Creates a 4D array, initialized to value. Array4D(int64 planes, int64 depth, int64 height, int64 width, T value) : Array4D(planes, depth, height, width) { Fill(value); @@ -107,6 +112,23 @@ class Array4D { } } + Array4D(const Array4D& other) + : Array4D(other.planes(), other.depth(), other.height(), other.width()) { + std::copy(&other.values_[0], &other.values_[0] + num_elements(), + &values_[0]); + } + + Array4D& operator=(const Array4D& other) { + planes_ = other.planes(); + depth_ = other.depth(); + height_ = other.height(); + width_ = other.width(); + values_.reset(new T[num_elements()]); + std::copy(&other.values_[0], &other.values_[0] + num_elements(), + &values_[0]); + return *this; + } + T& operator()(int64 plane, int64 depth, int64 height, int64 width) { CHECK_LT(plane, planes_); CHECK_LT(depth, depth_); @@ -131,24 +153,24 @@ class Array4D { int64 n3() const { return height_; } int64 n2() const { return depth_; } int64 n1() const { return planes_; } - int64 num_elements() const { return values_.size(); } + int64 num_elements() const { return width_ * height_ * depth_ * planes_; } // Sets all the values in the array to values. template > void SetValues(const Container& container) { CHECK_EQ(std::distance(std::begin(container), std::end(container)), num_elements()); - values_.assign(std::begin(container), std::end(container)); + std::copy(std::begin(container), std::end(container), &values_[0]); } // Fills the array with the given value. 
void Fill(const T& value) { - std::fill(values_.begin(), values_.end(), value); + std::fill(&values_[0], &values_[0] + num_elements(), value); } // Fills the array with iota. void FillIota(const T& value) { - std::iota(values_.begin(), values_.end(), value); + std::iota(&values_[0], &values_[0] + num_elements(), value); } // Fills the array with random variable with a deviation of value and a mean @@ -158,8 +180,8 @@ class Array4D { std::mt19937 g(seed); std::normal_distribution distribution(mean, static_cast(value)); - for (auto& v : values_) { - v = static_cast(distribution(g)); + for (int64 i = 0; i < num_elements(); ++i) { + values_[i] = static_cast(distribution(g)); } } @@ -264,7 +286,7 @@ class Array4D { int64 depth_; int64 height_; int64 width_; - std::vector values_; + std::unique_ptr values_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/array4d_test.cc b/tensorflow/compiler/xla/array4d_test.cc index 72ada467e51..3bc8148c911 100644 --- a/tensorflow/compiler/xla/array4d_test.cc +++ b/tensorflow/compiler/xla/array4d_test.cc @@ -18,8 +18,8 @@ limitations under the License. #include #include +#include "tensorflow/compiler/xla/test.h" #include "tensorflow/core/lib/gtl/array_slice.h" -#include "tensorflow/core/platform/test.h" namespace xla { namespace { diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD index 3e9dfe2a922..63c6d9ddaca 100644 --- a/tensorflow/compiler/xla/client/BUILD +++ b/tensorflow/compiler/xla/client/BUILD @@ -46,6 +46,7 @@ cc_library( cc_test( name = "padding_test", + size = "small", srcs = ["padding_test.cc"], deps = [ ":padding", @@ -99,6 +100,26 @@ cc_library( ], ) +cc_library( + name = "compile_only_client", + srcs = ["compile_only_client.cc"], + hdrs = ["compile_only_client.h"], + deps = [ + ":client", + ":computation", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/service:compile_only_service", + "//tensorflow/compiler/xla/service:compiler", + "//tensorflow/compiler/xla/service/llvm_ir:llvm_util", + "//tensorflow/core:lib", + "//tensorflow/core:stream_executor_no_cuda", + "@llvm//:support", + ], +) + # This target is used to instantiate the XLA service in-process and create # a client for it. 
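Returning briefly to the Array2D/Array3D/Array4D changes above: switching the storage from std::vector<T> to std::unique_ptr<T[]> makes the newly added copy constructors and copy assignments load-bearing. A hypothetical test (not part of this diff) that checks the deep-copy behavior, in the style of array2d_test.cc:

```c++
TEST(Array2dTest, CopyIsDeep) {
  xla::Array2D<float> a(2, 3, 1.0f);
  xla::Array2D<float> b = a;  // exercises the new copy constructor
  b(0, 0) = 42.0f;
  EXPECT_EQ(a(0, 0), 1.0f);   // the copy owns separate storage
  EXPECT_EQ(b(0, 0), 42.0f);
  EXPECT_EQ(b.num_elements(), 6);
}
```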
cc_library( @@ -106,12 +127,14 @@ cc_library( srcs = ["client_library.cc"], hdrs = ["client_library.h"], deps = [ + ":compile_only_client", ":local_client", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/service:backend", + "//tensorflow/compiler/xla/service:compile_only_service", "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/compiler/xla/service:local_service", "//tensorflow/compiler/xla/service:platform_util", diff --git a/tensorflow/compiler/xla/client/client.cc b/tensorflow/compiler/xla/client/client.cc index 341c02f1a1f..454d0fbd965 100644 --- a/tensorflow/compiler/xla/client/client.cc +++ b/tensorflow/compiler/xla/client/client.cc @@ -58,34 +58,13 @@ StatusOr> Client::Transfer( "server provided response without a literal in " "TransferToClient request"); } - - return WrapUnique(response.release_literal()); -} - -Status Client::TransferInProcess(const GlobalData& data, void* destination) { - TransferToClientInProcessRequest request; - *request.mutable_data() = data.handle(); - request.set_buffer(reinterpret_cast(destination)); - TransferToClientInProcessResponse response; - - VLOG(1) << "making transfer in-process request"; - VLOG(3) << "TransferToClientInProcessRequest: {" << request.DebugString() - << "}"; - Status s = stub_->TransferToClientInProcess(&request, &response); - VLOG(1) << "done with request"; - - if (!s.ok()) { - return s; - } - VLOG(3) << "TransferToClientInProcessResponse: {" << response.DebugString() - << "}"; - return Status::OK(); + return MakeUnique(response.literal()); } StatusOr> Client::TransferToServer( const Literal& literal, const DeviceHandle* device_handle) { TransferToServerRequest request; - *request.mutable_literal() = literal; + *request.mutable_literal() = literal.ToProto(); if (device_handle) { *request.mutable_device_handle() = *device_handle; } @@ -113,7 +92,7 @@ StatusOr> Client::TransferToServer( Status Client::TransferToInfeed(const Literal& literal, int64 replica_id, const DeviceHandle* device_handle) { TransferToInfeedRequest request; - *request.mutable_literal() = literal; + *request.mutable_literal() = literal.ToProto(); if (device_handle) { *request.mutable_device_handle() = *device_handle; } @@ -132,6 +111,39 @@ Status Client::TransferToInfeed(const Literal& literal, int64 replica_id, return Status::OK(); } +StatusOr> Client::TransferFromOutfeed( + const Shape* shape_with_layout, int64 replica_id, + const DeviceHandle* device_handle) { + TransferFromOutfeedRequest request; + if (device_handle) { + *request.mutable_device_handle() = *device_handle; + } + request.set_replica_id(replica_id); + if (shape_with_layout != nullptr) { + *request.mutable_shape_with_layout() = *shape_with_layout; + } + TransferFromOutfeedResponse response; + + VLOG(1) << "making transfer from outfeed request"; + VLOG(3) << "TransferFromOutfeedRequest: {" << request.DebugString() << "}"; + Status s = stub_->TransferFromOutfeed(&request, &response); + VLOG(1) << "done with request"; + + if (!s.ok()) { + return s; + } + VLOG(3) << "TransferFromOutfeedResponse: {" << response.DebugString() << "}"; + + if (!response.has_literal()) { + return FailedPrecondition( + "server provided response without a literal in " + "TransferToClient request"); + } + + Literal literal(response.literal()); + return MakeUnique(literal); +} + Status Client::ResetDevice() { ResetDeviceRequest request; ResetDeviceResponse response; @@ 
-164,34 +176,6 @@ StatusOr> Client::ExecuteAndTransfer( return Transfer(*data, shape_with_output_layout); } -StatusOr> Client::TransferToServerInProcess( - const Shape& shape, const void* buffer) { - TransferToServerInProcessRequest request; - request.set_buffer(reinterpret_cast(buffer)); - *request.mutable_shape() = shape; - TransferToServerInProcessResponse response; - - VLOG(1) << "making transfer to server in-process request"; - VLOG(3) << "TransferToServerInProcessRequest: {" << request.DebugString() - << "}"; - Status s = stub_->TransferToServerInProcess(&request, &response); - VLOG(1) << "done with request"; - - if (!s.ok()) { - return s; - } - VLOG(3) << "TransferToServerInProcessResponse: {" << response.DebugString() - << "}"; - - if (!response.has_data()) { - return FailedPrecondition( - "server provided response without a data handle in " - "TransferToServerInProcess request"); - } - - return MakeUnique(stub_, response.data()); -} - StatusOr Client::LoadSnapshot(const SessionModule& module) { LoadComputationSnapshotRequest request; *request.mutable_module() = module; @@ -269,7 +253,7 @@ StatusOr>> Client::ExecuteParallel( } std::vector> outputs; - for (int64 i = 0; i < computations.size(); ++i) { + for (size_t i = 0; i < computations.size(); ++i) { outputs.push_back( MakeUnique(stub_, response.responses(i).output())); if (computations[i].execution_profile != nullptr) { diff --git a/tensorflow/compiler/xla/client/client.h b/tensorflow/compiler/xla/client/client.h index f261de9d0d1..797835160fa 100644 --- a/tensorflow/compiler/xla/client/client.h +++ b/tensorflow/compiler/xla/client/client.h @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/computation.h" #include "tensorflow/compiler/xla/client/global_data.h" +#include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/session.pb.h" #include "tensorflow/compiler/xla/service_interface.h" #include "tensorflow/compiler/xla/statusor.h" @@ -119,6 +120,15 @@ class Client { Status TransferToInfeed(const Literal& literal, int64 replica_id = 0, const DeviceHandle* device_handle = nullptr); + // Transfers from the Outfeed of the device. + // + // device_handle and replica_id together specify a particular device; a device + // assigned for the given replica_id among the replicas that the given device + // handle belongs to. + StatusOr> TransferFromOutfeed( + const Shape* shape_with_layout, int64 replica_id = 0, + const DeviceHandle* device_handle = nullptr); + // Resets the device, clearing all existing state on the device. Status ResetDevice(); @@ -143,8 +153,7 @@ class Client { const Computation& computation) const; // Returns the Shape of the given array specified by 'data'. The shape - // includes the Layout of the array as it is stored on the service. The layout - // information is useful for calling TransferInProcess. + // includes the Layout of the array as it is stored on the service. StatusOr GetShape(const GlobalData& data); // As above, but returns the shape of the provided computation (parameter @@ -156,24 +165,6 @@ class Client { // two computations via a pair of Send and Recv instructions. StatusOr CreateChannelHandle(); - // If the service is running in the same process as the client then the - // following "InProcess" transfer methods may be used. These methods enable - // more efficient transfer of arrays to and from the service. - - // Transfer array from the service into the given buffer. The buffer must be - // large enough to hold the array. 
The array is copied verbatim (memcpy) from - // the service. The method GetShape should be called ahead of time - // to get the shape and layout of the array as it is stored in the - // service. The shape and layout can be used to determine how large the buffer - // needs to be. - Status TransferInProcess(const GlobalData& data, void* destination); - - // Transfer array to the service from the given buffer with the given shape - // and layout. The service creates an internal copy of the data so the client - // can free the buffer when this method returns. - StatusOr> TransferToServerInProcess( - const Shape& shape, const void* buffer); - StatusOr LoadSnapshot(const SessionModule& module); ServiceInterface* stub() { return stub_; } diff --git a/tensorflow/compiler/xla/client/client_library.cc b/tensorflow/compiler/xla/client/client_library.cc index 93437023bc8..8238261e1c9 100644 --- a/tensorflow/compiler/xla/client/client_library.cc +++ b/tensorflow/compiler/xla/client/client_library.cc @@ -43,6 +43,16 @@ int LocalClientOptions::number_of_replicas() const { return number_of_replicas_; } +LocalClientOptions& LocalClientOptions::set_intra_op_parallelism_threads( + int num_threads) { + intra_op_parallelism_threads_ = num_threads; + return *this; +} + +int LocalClientOptions::intra_op_parallelism_threads() const { + return intra_op_parallelism_threads_; +} + /* static */ ClientLibrary& ClientLibrary::Singleton() { static ClientLibrary* c = new ClientLibrary; return *c; @@ -69,22 +79,24 @@ ClientLibrary::~ClientLibrary() = default; TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform()); } - auto it = client_library.instances_.find(platform->id()); - if (it != client_library.instances_.end()) { + auto it = client_library.local_instances_.find(platform->id()); + if (it != client_library.local_instances_.end()) { return it->second->client.get(); } ServiceOptions service_options; service_options.set_platform(platform); service_options.set_number_of_replicas(replica_count); + service_options.set_intra_op_parallelism_threads( + options.intra_op_parallelism_threads()); - std::unique_ptr instance = MakeUnique(); + auto instance = MakeUnique(); TF_ASSIGN_OR_RETURN(instance->service, LocalService::NewService(service_options)); instance->client = MakeUnique(instance->service.get()); LocalClient* cl = instance->client.get(); - client_library.instances_.insert( + client_library.local_instances_.insert( std::make_pair(platform->id(), std::move(instance))); return cl; } @@ -99,9 +111,35 @@ ClientLibrary::~ClientLibrary() = default; perftools::gputools::Platform* platform) { ClientLibrary& client_library = Singleton(); tensorflow::mutex_lock lock(client_library.service_mutex_); - auto it = client_library.instances_.find(platform->id()); - CHECK(it != client_library.instances_.end()); + auto it = client_library.local_instances_.find(platform->id()); + CHECK(it != client_library.local_instances_.end()); return it->second->service.get(); } +/* static */ StatusOr +ClientLibrary::GetOrCreateCompileOnlyClient( + perftools::gputools::Platform* platform) { + ClientLibrary& client_library = Singleton(); + tensorflow::mutex_lock lock(client_library.service_mutex_); + + if (platform == nullptr) { + TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform()); + } + + auto it = client_library.compile_only_instances_.find(platform->id()); + if (it != client_library.compile_only_instances_.end()) { + return it->second->client.get(); + } + + auto instance = MakeUnique(); + 
TF_ASSIGN_OR_RETURN(instance->service, + CompileOnlyService::NewService(platform)); + instance->client = MakeUnique(instance->service.get()); + CompileOnlyClient* cl = instance->client.get(); + + client_library.compile_only_instances_.insert( + std::make_pair(platform->id(), std::move(instance))); + return cl; +} + } // namespace xla diff --git a/tensorflow/compiler/xla/client/client_library.h b/tensorflow/compiler/xla/client/client_library.h index 2bc319f9333..3ddd235d0ef 100644 --- a/tensorflow/compiler/xla/client/client_library.h +++ b/tensorflow/compiler/xla/client/client_library.h @@ -26,7 +26,9 @@ limitations under the License. #include #include +#include "tensorflow/compiler/xla/client/compile_only_client.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/service/compile_only_service.h" #include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/local_service.h" #include "tensorflow/compiler/xla/statusor.h" @@ -51,9 +53,14 @@ class LocalClientOptions { LocalClientOptions& set_number_of_replicas(int number_of_replicas); int number_of_replicas() const; + // Sets the thread pool size for parallel execution of an individual operator. + LocalClientOptions& set_intra_op_parallelism_threads(int num_threads); + int intra_op_parallelism_threads() const; + private: perftools::gputools::Platform* platform_ = nullptr; int number_of_replicas_ = -1; + int intra_op_parallelism_threads_ = -1; }; class ClientLibrary { @@ -76,6 +83,13 @@ class ClientLibrary { // access user computations from client. static LocalService* GetXlaService(perftools::gputools::Platform* platform); + // Singleton constructor-or-accessor for compile-only clients. Arguments: + // + // platform : The platform the underlying XLA service should target. If + // null then default platform is used. + static StatusOr GetOrCreateCompileOnlyClient( + perftools::gputools::Platform* platform = nullptr); + private: // Returns the singleton instance of ClientLibrary. static ClientLibrary& Singleton(); @@ -90,10 +104,21 @@ class ClientLibrary { std::unique_ptr client; }; + struct CompileOnlyInstance { + // Service that is wrapped by the singleton client object. + std::unique_ptr service; + // Singleton client object. + std::unique_ptr client; + }; + tensorflow::mutex service_mutex_; // Guards the singleton creation state. std::unordered_map> - instances_ GUARDED_BY(service_mutex_); + local_instances_ GUARDED_BY(service_mutex_); + + std::unordered_map> + compile_only_instances_ GUARDED_BY(service_mutex_); TF_DISALLOW_COPY_AND_ASSIGN(ClientLibrary); }; diff --git a/tensorflow/compiler/xla/client/compile_only_client.cc b/tensorflow/compiler/xla/client/compile_only_client.cc new file mode 100644 index 00000000000..d9972ef77b9 --- /dev/null +++ b/tensorflow/compiler/xla/client/compile_only_client.cc @@ -0,0 +1,57 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/client/compile_only_client.h" + +#include "external/llvm/include/llvm/ADT/Triple.h" +#include "tensorflow/compiler/xla/ptr_util.h" +#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" +#include "tensorflow/compiler/xla/status_macros.h" + +namespace xla { + +StatusOr>> +CompileOnlyClient::CompileAheadOfTime( + const tensorflow::gtl::ArraySlice computations, + const AotCompilationOptions& options) { + std::vector service_instances; + service_instances.reserve(computations.size()); + for (const AotComputationInstance& instance : computations) { + service_instances.push_back({}); + CompileOnlyService::AotComputationInstance& service_instance = + service_instances.back(); + TF_RET_CHECK(instance.computation != nullptr); + service_instance.computation = instance.computation->handle(); + service_instance.argument_layouts = instance.argument_layouts; + service_instance.result_layout = instance.result_layout; + } + return compiler_service_->CompileAheadOfTime(service_instances, options); +} + +int64 CompileOnlyClient::PointerSizeForTriple( + tensorflow::StringPiece target_triple) { + llvm::Triple triple( + llvm::Triple::normalize(llvm_ir::AsStringRef(target_triple))); + if (triple.isArch64Bit()) { + return 8; + } else if (triple.isArch32Bit()) { + return 4; + } else { + CHECK(triple.isArch16Bit()); + return 2; + } +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/client/compile_only_client.h b/tensorflow/compiler/xla/client/compile_only_client.h new file mode 100644 index 00000000000..59000487113 --- /dev/null +++ b/tensorflow/compiler/xla/client/compile_only_client.h @@ -0,0 +1,66 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_COMPILE_ONLY_CLIENT_H_ +#define TENSORFLOW_COMPILER_XLA_CLIENT_COMPILE_ONLY_CLIENT_H_ + +#include "tensorflow/compiler/xla/client/client.h" +#include "tensorflow/compiler/xla/client/computation.h" +#include "tensorflow/compiler/xla/service/compile_only_service.h" +#include "tensorflow/compiler/xla/service/compiler.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/platform/stream_executor_no_cuda.h" + +namespace xla { + +// An XLA Client specialization for doing ahead-of-time compilation. This does +// not require (or attempt to instantiate) an execution-capable backend for the +// relevant platform. +class CompileOnlyClient : public Client { + public: + explicit CompileOnlyClient(CompileOnlyService* service) + : Client(service), compiler_service_(service) {} + + CompileOnlyClient(const CompileOnlyClient&) = delete; + void operator=(const CompileOnlyClient&) = delete; + + // A description of a computation to compile using CompileAheadOfTime. 
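For illustration, a minimal ahead-of-time compilation sketch against the CompileOnlyClient API added above. `CpuAotCompilationOptions`, its constructor arguments, and the example triple are assumptions drawn from the CPU backend rather than from this diff, and error handling is elided:

```c++
// Sketch only: compile a computation for a target triple without an
// execution-capable backend. CpuAotCompilationOptions is an assumption
// from the CPU backend; the triple and entry point name are placeholders.
#include "tensorflow/compiler/xla/client/client_library.h"
#include "tensorflow/compiler/xla/client/compile_only_client.h"
#include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h"
#include "tensorflow/compiler/xla/status_macros.h"

xla::StatusOr<std::unique_ptr<xla::AotCompilationResult>> CompileForTriple(
    const xla::Computation& computation, const xla::Shape& argument_shape,
    const xla::Shape& result_shape) {
  // Compile-only clients never instantiate an execution backend, so this
  // can run on a host that cannot execute code for the target platform.
  TF_ASSIGN_OR_RETURN(xla::CompileOnlyClient * client,
                      xla::ClientLibrary::GetOrCreateCompileOnlyClient());

  xla::CompileOnlyClient::AotComputationInstance instance;
  instance.computation = &computation;
  instance.argument_layouts = {&argument_shape};  // Expected argument layouts.
  instance.result_layout = &result_shape;         // Expected result layout.

  // The triple determines the emitted code, including pointer width:
  // PointerSizeForTriple("x86_64-pc-linux") == 8.
  xla::cpu::CpuAotCompilationOptions options(
      /*triple=*/"x86_64-pc-linux", /*cpu_name=*/"", /*features=*/"",
      /*entry_point_name=*/"entry",
      xla::cpu::CpuAotCompilationOptions::RelocationModel::Static);

  TF_ASSIGN_OR_RETURN(auto results,
                      client->CompileAheadOfTime({instance}, options));
  return std::move(results.front());
}
```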
+ struct AotComputationInstance { + const Computation* computation; + // Inform the compiler of the expected layout for arguments. + std::vector argument_layouts; + // Specifies the expected result layout. + const Shape* result_layout; + }; + + // Compiles a list of computations for ahead-of-time execution. This is + // intended for use in static compilation. The |options| parameter describes + // the target for which the compiler should emit code. + StatusOr>> + CompileAheadOfTime( + const tensorflow::gtl::ArraySlice computations, + const AotCompilationOptions& options); + + // Returns the size of a pointer in bytes for a given triple. + static int64 PointerSizeForTriple(tensorflow::StringPiece triple); + + private: + CompileOnlyService* compiler_service_; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_CLIENT_COMPILE_ONLY_CLIENT_H_ diff --git a/tensorflow/compiler/xla/client/computation.cc b/tensorflow/compiler/xla/client/computation.cc index cd7d8df58b8..4baea8df6e3 100644 --- a/tensorflow/compiler/xla/client/computation.cc +++ b/tensorflow/compiler/xla/client/computation.cc @@ -28,12 +28,12 @@ Computation::Computation(ServiceInterface* parent, : handle_(handle), parent_(parent) {} Computation::Computation(Computation&& computation) - : handle_(computation.handle_), parent_(computation.parent_) { + : handle_(std::move(computation.handle_)), parent_(computation.parent_) { computation.ResetWithoutFreeing(); } void Computation::Reset() { - // TODO(leary) deallocate any owned computation. + // TODO(b/34469253) deallocate any owned computation. ResetWithoutFreeing(); } diff --git a/tensorflow/compiler/xla/client/computation_builder.cc b/tensorflow/compiler/xla/client/computation_builder.cc index 73f450e1f2e..37bf697683b 100644 --- a/tensorflow/compiler/xla/client/computation_builder.cc +++ b/tensorflow/compiler/xla/client/computation_builder.cc @@ -106,7 +106,7 @@ bool ComputationBuilder::MakeWindow( tensorflow::gtl::ArraySlice> padding, tensorflow::gtl::ArraySlice lhs_dilation, tensorflow::gtl::ArraySlice rhs_dilation, Window* window) { - const auto verify_size = [&](const int64 x, const char* x_name) { + const auto verify_size = [&](const size_t x, const char* x_name) { if (x == 0 || x == window_dimensions.size()) { return true; } else { @@ -165,12 +165,14 @@ ComputationDataHandle ComputationBuilder::ConstantOp( } ConstantRequest request; - Literal* literal = request.mutable_literal(); - populate(literal); - VLOG(3) << "created constant: " << literal->ShortDebugString(); + Literal literal; + populate(&literal); + *request.mutable_literal() = literal.ToProto(); + VLOG(3) << "created constant: " << request.literal().ShortDebugString(); OpRequest op_request; *op_request.mutable_constant_request() = request; *op_request.mutable_computation() = computation_.handle(); + AddOpMetadata(&op_request); OpResponse response; VLOG(2) << "making constant request"; @@ -198,6 +200,7 @@ ComputationDataHandle ComputationBuilder::Parameter(int64 parameter_number, OpRequest op_request; *op_request.mutable_computation() = computation_.handle(); *op_request.mutable_parameter_request() = request; + AddOpMetadata(&op_request); OpResponse response; VLOG(2) << "making parameter request"; @@ -253,7 +256,8 @@ void ComputationBuilder::CheckSameShape(const ComputationDataHandle& lhs, ComputationDataHandle ComputationBuilder::Slice( const ComputationDataHandle& operand, tensorflow::gtl::ArraySlice start_indices, - tensorflow::gtl::ArraySlice limit_indices) { + tensorflow::gtl::ArraySlice limit_indices, + 
tensorflow::gtl::ArraySlice stride) { if (!first_error_.ok() || !PrepareComputation().ok()) { return ComputationDataHandle(); } @@ -266,9 +270,13 @@ ComputationDataHandle ComputationBuilder::Slice( for (int64 index : limit_indices) { request.add_limit_indices(index); } + for (int64 index : stride) { + request.add_stride(index); + } OpRequest op_request; *op_request.mutable_computation() = computation_.handle(); *op_request.mutable_slice_request() = request; + AddOpMetadata(&op_request); OpResponse response; VLOG(2) << "making slice request"; @@ -293,6 +301,7 @@ ComputationDataHandle ComputationBuilder::DynamicSlice( OpRequest op_request; *op_request.mutable_computation() = computation_.handle(); *op_request.mutable_dynamic_slice_request() = request; + AddOpMetadata(&op_request); OpResponse response; VLOG(2) << "making dynamic slice request"; @@ -314,6 +323,7 @@ ComputationDataHandle ComputationBuilder::DynamicUpdateSlice( OpRequest op_request; *op_request.mutable_computation() = computation_.handle(); *op_request.mutable_dynamic_update_slice_request() = request; + AddOpMetadata(&op_request); OpResponse response; VLOG(2) << "making dynamic update slice request"; @@ -336,6 +346,7 @@ ComputationDataHandle ComputationBuilder::ConcatInDim( OpRequest op_request; *op_request.mutable_computation() = computation_.handle(); *op_request.mutable_concatenate_request() = request; + AddOpMetadata(&op_request); OpResponse response; VLOG(2) << "making concatenate request"; @@ -358,6 +369,7 @@ ComputationDataHandle ComputationBuilder::Broadcast( OpRequest op_request; *op_request.mutable_computation() = computation_.handle(); *op_request.mutable_broadcast_request() = request; + AddOpMetadata(&op_request); OpResponse response; VLOG(2) << "making broadcast request"; @@ -380,6 +392,7 @@ ComputationDataHandle ComputationBuilder::Pad( OpRequest op_request; *op_request.mutable_computation() = computation_.handle(); *op_request.mutable_pad_request() = request; + AddOpMetadata(&op_request); OpResponse response; VLOG(2) << "making pad request"; @@ -406,6 +419,7 @@ ComputationDataHandle ComputationBuilder::Reshape( OpRequest op_request; *op_request.mutable_computation() = computation_.handle(); *op_request.mutable_reshape_request() = request; + AddOpMetadata(&op_request); OpResponse response; VLOG(2) << "making reshape request"; @@ -440,7 +454,8 @@ ComputationDataHandle ComputationBuilder::Collapse( // Don't support out-of-order collapse here. // Checks that the collapsed dimensions are in order and consecutive. 
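A usage sketch for the stride values now threaded through SliceRequest above, assuming the documented [start, limit) semantics with a per-dimension step:

```c++
// Strided slice of a rank-1 constant; a sketch of the extended Slice API.
#include "tensorflow/compiler/xla/client/computation_builder.h"

void BuildStridedSlice(xla::ComputationBuilder* b) {
  auto x = b->ConstantR1<float>({0, 1, 2, 3, 4, 5, 6, 7, 8, 9});
  // Selects elements 0, 2, 4 and 6 of the single dimension:
  // start = 0, limit = 8 (exclusive), stride = 2.
  auto strided = b->Slice(x, /*start_indices=*/{0}, /*limit_indices=*/{8},
                          /*stride=*/{2});
  (void)strided;
}
```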
- for (int i = 1; i < dims_to_collapse.size(); ++i) { + for (tensorflow::gtl::ArraySlice::size_type i = 1; + i < dims_to_collapse.size(); ++i) { if (dims_to_collapse[i] - 1 != dims_to_collapse[i - 1]) { NoteError(InvalidArgument( "Collapsed dimensions are not in order and consecutive.")); @@ -482,6 +497,7 @@ void ComputationBuilder::Trace(const string& tag, OpRequest op_request; *op_request.mutable_computation() = computation_.handle(); *op_request.mutable_trace_request() = request; + AddOpMetadata(&op_request); OpResponse response; VLOG(2) << "making trace request"; @@ -513,6 +529,7 @@ ComputationDataHandle ComputationBuilder::Tuple( OpRequest op_request; *op_request.mutable_computation() = computation_.handle(); *op_request.mutable_variadic_op_request() = request; + AddOpMetadata(&op_request); OpResponse response; VLOG(2) << "making variadic op request"; @@ -532,6 +549,7 @@ ComputationDataHandle ComputationBuilder::GetTupleElement( OpRequest op_request; *op_request.mutable_computation() = computation_.handle(); *op_request.mutable_get_tuple_element_request() = request; + AddOpMetadata(&op_request); OpResponse response; VLOG(2) << "making get tuple element op request"; @@ -681,14 +699,15 @@ ComputationDataHandle ComputationBuilder::ConvWithGeneralDimensions( std::vector base_area_dimensions( dimension_numbers.spatial_dimensions_size()); - for (int i = 0; i < base_area_dimensions.size(); ++i) { + for (std::vector::size_type i = 0; i < base_area_dimensions.size(); + ++i) { base_area_dimensions[i] = lhs_shape->dimensions(dimension_numbers.spatial_dimensions(i)); } std::vector window_dimensions( dimension_numbers.kernel_spatial_dimensions_size()); - for (int i = 0; i < window_dimensions.size(); ++i) { + for (std::vector::size_type i = 0; i < window_dimensions.size(); ++i) { window_dimensions[i] = rhs_shape->dimensions(dimension_numbers.kernel_spatial_dimensions(i)); } @@ -740,7 +759,7 @@ ComputationDataHandle ComputationBuilder::ConvGeneralDilated( std::vector window_dimensions( dimension_numbers.kernel_spatial_dimensions_size()); - for (int i = 0; i < window_dimensions.size(); ++i) { + for (std::vector::size_type i = 0; i < window_dimensions.size(); ++i) { window_dimensions[i] = rhs_shape->dimensions(dimension_numbers.kernel_spatial_dimensions(i)); } @@ -758,6 +777,7 @@ ComputationDataHandle ComputationBuilder::ConvGeneralDilated( OpRequest op_request; *op_request.mutable_computation() = computation_.handle(); *op_request.mutable_convolve_request() = request; + AddOpMetadata(&op_request); OpResponse response; VLOG(2) << "making convolve request"; @@ -777,6 +797,7 @@ ComputationDataHandle ComputationBuilder::Infeed(const Shape& shape, OpRequest op_request; *op_request.mutable_computation() = computation_.handle(); *op_request.mutable_infeed_request() = request; + AddOpMetadata(&op_request); OpResponse response; VLOG(2) << "making infeed op request"; @@ -786,6 +807,7 @@ ComputationDataHandle ComputationBuilder::Infeed(const Shape& shape, } void ComputationBuilder::Outfeed(const ComputationDataHandle& operand, + const Shape& shape, const string& outfeed_config) { if (!first_error_.ok() || !PrepareComputation().ok()) { return; @@ -794,9 +816,11 @@ void ComputationBuilder::Outfeed(const ComputationDataHandle& operand, OutfeedRequest request; request.set_outfeed_config(outfeed_config); *request.mutable_operand() = operand; + *request.mutable_shape() = shape; OpRequest op_request; *op_request.mutable_outfeed_request() = request; *op_request.mutable_computation() = computation_.handle(); + 
AddOpMetadata(&op_request); OpResponse response; VLOG(2) << "making outfeed op request"; @@ -823,6 +847,7 @@ ComputationDataHandle ComputationBuilder::Call( OpRequest op_request; *op_request.mutable_computation() = computation_.handle(); *op_request.mutable_call_request() = request; + AddOpMetadata(&op_request); OpResponse response; VLOG(2) << "making call op request"; @@ -848,6 +873,7 @@ ComputationDataHandle ComputationBuilder::CustomCall( OpRequest op_request; *op_request.mutable_computation() = computation_.handle(); *op_request.mutable_custom_call_request() = request; + AddOpMetadata(&op_request); OpResponse response; VLOG(2) << "making custom call op request"; @@ -950,22 +976,31 @@ ComputationDataHandle ComputationBuilder::Tanh( return UnaryOp(UNOP_TANH, operand); } +ComputationDataHandle ComputationBuilder::IsFinite( + const ComputationDataHandle& operand) { + return UnaryOp(UNOP_IS_FINITE, operand); +} + ComputationDataHandle ComputationBuilder::Transpose( const ComputationDataHandle& operand, tensorflow::gtl::ArraySlice permutation) { - if (!first_error_.ok()) { + if (!first_error_.ok() || !PrepareComputation().ok()) { return ComputationDataHandle(); } - StatusOr> shape = GetShape(operand); - if (!shape.ok()) { - // Just early return with the existing error status. - first_error_ = shape.status(); - return ComputationDataHandle(); + OpRequest op_request; + *op_request.mutable_computation() = computation_.handle(); + TransposeRequest* request = op_request.mutable_transpose_request(); + *request->mutable_operand() = operand; + for (int64 dimension : permutation) { + request->add_dimensions(dimension); } - return Reshape(operand, permutation, - Permute(InversePermutation(permutation), - AsInt64Slice(shape.ValueOrDie()->dimensions()))); + AddOpMetadata(&op_request); + OpResponse response; + + VLOG(2) << "making transpose request"; + Status s = client_->stub()->Op(&op_request, &response); + return ParseOpResponse(s, &response); } ComputationDataHandle ComputationBuilder::Rev( @@ -983,6 +1018,7 @@ ComputationDataHandle ComputationBuilder::Rev( OpRequest op_request; *op_request.mutable_computation() = computation_.handle(); *op_request.mutable_reverse_request() = request; + AddOpMetadata(&op_request); OpResponse response; VLOG(2) << "making reverse op request"; @@ -1003,8 +1039,9 @@ ComputationDataHandle ComputationBuilder::SqrtF32( } ComputationDataHandle ComputationBuilder::Pow( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs) { - return BinaryOp(BINOP_POW, lhs, rhs, /*broadcast_dimensions=*/{}); + const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, + tensorflow::gtl::ArraySlice broadcast_dimensions) { + return BinaryOp(BINOP_POW, lhs, rhs, broadcast_dimensions); } ComputationDataHandle ComputationBuilder::ConvertElementType( @@ -1027,6 +1064,7 @@ ComputationDataHandle ComputationBuilder::ConvertElementType( OpRequest op_request; *op_request.mutable_computation() = computation_.handle(); *op_request.mutable_convert_request() = request; + AddOpMetadata(&op_request); OpResponse response; VLOG(2) << "making convert request"; @@ -1070,6 +1108,7 @@ ComputationDataHandle ComputationBuilder::UnaryOp( OpRequest op_request; *op_request.mutable_computation() = computation_.handle(); *op_request.mutable_unary_op_request() = request; + AddOpMetadata(&op_request); OpResponse response; VLOG(2) << "making unop request"; @@ -1096,6 +1135,7 @@ ComputationDataHandle ComputationBuilder::BinaryOp( OpRequest op_request; *op_request.mutable_computation() = 
computation_.handle(); *op_request.mutable_binary_op_request() = request; + AddOpMetadata(&op_request); OpResponse response; VLOG(2) << "making binop request"; @@ -1121,6 +1161,7 @@ ComputationDataHandle ComputationBuilder::RngOp( OpRequest op_request; *op_request.mutable_computation() = computation_.handle(); *op_request.mutable_rng_request() = request; + AddOpMetadata(&op_request); OpResponse response; VLOG(2) << "making rngop request"; @@ -1144,6 +1185,7 @@ ComputationDataHandle ComputationBuilder::TernaryOp( OpRequest op_request; *op_request.mutable_computation() = computation_.handle(); *op_request.mutable_ternary_op_request() = request; + AddOpMetadata(&op_request); OpResponse response; VLOG(2) << "making triop request"; @@ -1192,8 +1234,7 @@ StatusOr ComputationBuilder::IsConstant( VLOG(2) << "done with request"; if (!s.ok()) { - NoteError(s); - return first_error_; + return s; } return response.is_constant(); } @@ -1218,8 +1259,7 @@ StatusOr> ComputationBuilder::ComputeConstant( VLOG(2) << "done with request"; if (!s.ok()) { - NoteError(s); - return first_error_; + return s; } TF_RET_CHECK(response.output().handle() != 0); @@ -1245,6 +1285,7 @@ ComputationDataHandle ComputationBuilder::Map( OpRequest op_request; *op_request.mutable_computation() = computation_.handle(); *op_request.mutable_map_request() = request; + AddOpMetadata(&op_request); OpResponse response; VLOG(2) << "making Map request"; @@ -1283,6 +1324,7 @@ ComputationDataHandle ComputationBuilder::While( OpRequest op_request; *op_request.mutable_computation() = computation_.handle(); *op_request.mutable_while_request() = request; + AddOpMetadata(&op_request); OpResponse response; VLOG(2) << "making while request"; @@ -1308,6 +1350,7 @@ ComputationDataHandle ComputationBuilder::Reduce( OpRequest op_request; *op_request.mutable_computation() = computation_.handle(); *op_request.mutable_reduce_request() = request; + AddOpMetadata(&op_request); OpResponse response; VLOG(2) << "making reduce request"; @@ -1360,6 +1403,7 @@ ComputationDataHandle ComputationBuilder::ReduceWindowWithGeneralPadding( OpRequest op_request; *op_request.mutable_computation() = computation_.handle(); *op_request.mutable_reduce_window_request() = request; + AddOpMetadata(&op_request); OpResponse response; VLOG(2) << "making reduce-window request"; @@ -1378,6 +1422,7 @@ ComputationDataHandle ComputationBuilder::CrossReplicaSum( OpRequest op_request; *op_request.mutable_cross_replica_sum_request() = request; *op_request.mutable_computation() = computation_.handle(); + AddOpMetadata(&op_request); OpResponse response; VLOG(2) << "making cross-replica-sum request"; @@ -1434,6 +1479,7 @@ ComputationDataHandle ComputationBuilder::SelectAndScatterWithGeneralPadding( OpRequest op_request; *op_request.mutable_computation() = computation_.handle(); *op_request.mutable_select_and_scatter_request() = request; + AddOpMetadata(&op_request); OpResponse response; VLOG(2) << "making select-and-scatter request"; @@ -1453,11 +1499,12 @@ void ComputationBuilder::Send(const ComputationDataHandle& operand, OpRequest op_request; *op_request.mutable_send_request() = request; *op_request.mutable_computation() = computation_.handle(); + AddOpMetadata(&op_request); OpResponse response; VLOG(2) << "making send request"; - tensorflow::Status s = client_->stub()->Op(&op_request, &response); - VLOG(2) << "done with request"; + Status s = client_->stub()->Op(&op_request, &response); + VLOG(2) << "done with op request"; if (!s.ok()) { NoteError(s); @@ -1477,12 +1524,11 @@ 
ComputationDataHandle ComputationBuilder::Recv(const Shape& shape, OpRequest op_request; *op_request.mutable_recv_request() = request; *op_request.mutable_computation() = computation_.handle(); + AddOpMetadata(&op_request); OpResponse response; VLOG(2) << "making recv request"; - tensorflow::Status s = client_->stub()->Op(&op_request, &response); - VLOG(2) << "done with request"; - + Status s = client_->stub()->Op(&op_request, &response); return ParseOpResponse(s, &response); } @@ -1512,6 +1558,11 @@ StatusOr<Computation> ComputationBuilder::Build() { return {std::move(computation_)}; } +void ComputationBuilder::AddOpMetadata(OpRequest* request) const { + tensorflow::mutex_lock lock(mutex_); + *request->mutable_metadata() = metadata_; +} + /* static */ ConvolutionDimensionNumbers ComputationBuilder::CreateDefaultConvDimensionNumbers(int num_spatial_dims) { ConvolutionDimensionNumbers dimension_numbers; diff --git a/tensorflow/compiler/xla/client/computation_builder.h b/tensorflow/compiler/xla/client/computation_builder.h index 67ca9c6cf74..5cc73c28d03 100644 --- a/tensorflow/compiler/xla/client/computation_builder.h +++ b/tensorflow/compiler/xla/client/computation_builder.h @@ -37,6 +37,7 @@ limitations under the License. #include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/stacktrace.h" #include "tensorflow/core/platform/types.h" @@ -61,6 +62,23 @@ class ComputationBuilder { // Returns the computation name. const string& name() { return name_; } + // Sets OpMetadata that will be added to all instructions until cleared. + // + // OpMetadata is often applied to a series of XLA HLO instructions. As a + // result, OpMetadata is set on the Computation Builder. All subsequent + // instructions generated via this Computation Builder will have the same + // OpMetadata attached until a call to ClearOpMetadata. + void SetOpMetadata(const OpMetadata& metadata) { + tensorflow::mutex_lock lock(mutex_); + metadata_ = metadata; + } + + // Clears the OpMetadata state. + void ClearOpMetadata() { + tensorflow::mutex_lock lock(mutex_); + metadata_.Clear(); + } + // Sets the builder to a mode where it will die immediately when an error is // encountered, rather than producing it in a deferred fashion when Build() is // called (which is the default). @@ -193,9 +211,11 @@ class ComputationBuilder { // // Note that "limit" means up-to-but-not-including; i.e. [start, limit) in 1D // range notation. + // The stride parameter determines the stride over the slice. ComputationDataHandle Slice(const ComputationDataHandle& operand, tensorflow::gtl::ArraySlice<int64> start_indices, - tensorflow::gtl::ArraySlice<int64> limit_indices); + tensorflow::gtl::ArraySlice<int64> limit_indices, + tensorflow::gtl::ArraySlice<int64> stride); // Enqueues a slice operation onto the computation that slices the 'operand' // from dynamic start indices which are passed in 'start_indices'. @@ -352,13 +372,13 @@ class ComputationBuilder { tensorflow::gtl::ArraySlice<int64> rhs_dilation, const ConvolutionDimensionNumbers& dimension_numbers); - // Enqueues an infeed instruction onto the computation, which reads data of - // the given shape from the infeed buffer of the device. + // Enqueues an infeed instruction onto the computation, which writes data of + // the given shape to the infeed buffer of the device.
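A sketch of the "modal" OpMetadata flow introduced above: metadata set on the builder is attached to every instruction enqueued until it is cleared. The `op_type`/`op_name` fields are assumptions based on the OpMetadata proto:

```c++
// Attach metadata to a run of instructions, then clear it; a usage sketch.
#include "tensorflow/compiler/xla/client/computation_builder.h"

void BuildWithMetadata(xla::ComputationBuilder* b,
                       const xla::ComputationDataHandle& x,
                       const xla::ComputationDataHandle& y) {
  xla::OpMetadata metadata;
  metadata.set_op_type("Add");         // e.g. the originating TF op type
  metadata.set_op_name("layer1/add");  // e.g. the originating TF node name
  b->SetOpMetadata(metadata);

  // Both instructions below carry the metadata set above.
  auto sum = b->Add(x, y);
  auto doubled = b->Add(sum, sum);

  // Instructions enqueued from here on carry no metadata.
  b->ClearOpMetadata();
  (void)doubled;
}
```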
ComputationDataHandle Infeed(const Shape& shape, const string& config = ""); // Enqueues an outfeed instruction onto the computation. This instruction // generates outgoing data transfers for the given data. - void Outfeed(const ComputationDataHandle& operand, + void Outfeed(const ComputationDataHandle& operand, const Shape& shape, const string& outfeed_config); // Enqueues a call instruction onto the computation. @@ -504,8 +524,15 @@ class ComputationBuilder { ComputationDataHandle SquareF32(const ComputationDataHandle& operand); // Enqueues a lhs^rhs computation onto the computation. - ComputationDataHandle Pow(const ComputationDataHandle& lhs, - const ComputationDataHandle& rhs); + ComputationDataHandle Pow( + const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, + tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {}); + + // Enqueues an operator that tests if the operand's values are finite, i.e., + // not Inf or NaN. Defined only for floating-point types. Returns an array of + // booleans with the same shape where entries are true iff the corresponding + // entry is finite. + ComputationDataHandle IsFinite(const ComputationDataHandle& operand); + // Enqueues a convert instruction onto the computation that changes the // element type of the operand array to primitive_type. @@ -516,8 +543,8 @@ class ComputationBuilder { // (float32 is specified as there is an implicit float32 -1.0f constant // exponent). // - // TODO(leary) axe F32 suffix, can be determined by reflecting on the shape of - // the operand. + // TODO(b/34468990) axe F32 suffix, can be determined by reflecting on the + // shape of the operand. ComputationDataHandle ReciprocalF32(const ComputationDataHandle& operand); // Enqueues a negate instruction onto the computation. @@ -586,6 +613,48 @@ class ComputationBuilder { // computation. StatusOr<bool> IsConstant(const ComputationDataHandle& operand); + // Normalizes operand across spatial and batch dimensions for each feature. + // + // Returns a tuple (normalized, batch_mean, batch_var) where `normalized` + // is the normalized result and batch_mean and batch_var are the mean and + // variance, respectively, across batch for the operand. + ComputationDataHandle BatchNormTraining(const ComputationDataHandle& operand, + const ComputationDataHandle& scale, + const ComputationDataHandle& offset, + float epsilon, int64 feature_index); + + // Normalizes operand across spatial and batch dimensions for each feature. + // + // `BatchNormInference` is equivalent to calling `BatchNormTraining` without + // computing `mean` and `variance` for each batch inside the operation. It + // instead uses the input `mean` and `variance` as estimated values. The + // purpose of this op is to reduce latency in inference, hence the name + // `BatchNormInference`. + // + // The output has the same shape as `operand`, and contains the normalized + // values for each batch. + ComputationDataHandle BatchNormInference( + const ComputationDataHandle& operand, const ComputationDataHandle& scale, + const ComputationDataHandle& offset, const ComputationDataHandle& mean, + const ComputationDataHandle& variance, float epsilon, + int64 feature_index); + + // Calculates the gradients of a batch norm op. + // + // The inputs `batch_mean` and `batch_var` represent the mean and variance + // across the batch.
+ // + // Returns a tuple of three elements: + // - grad_operand: Gradient with respect to input `operand` + // - grad_offset: Gradient with respect to input `offset` + // - grad_scale: Gradient with respect to input `scale` + ComputationDataHandle BatchNormGrad(const ComputationDataHandle& operand, + const ComputationDataHandle& scale, + const ComputationDataHandle& batch_mean, + const ComputationDataHandle& batch_var, + const ComputationDataHandle& grad_output, + float epsilon, int64 feature_index); + // Computes the value of a constant indicated by a // ComputationDataHandle. // @@ -643,6 +712,14 @@ class ComputationBuilder { // then Build() should be used instead. Computation BuildAndNoteError(); + // Returns the first error that was encountered while building the + // computation. When an error is encountered, by default we return a vacuous + // ComputationDataHandle and inform the user of the error that occurred while + // building the computation when they make a final call to Build(). + // + // See also set_die_immediately_on_error(). + Status first_error() const { return first_error_; } + private: using PopulateLiteral = std::function; @@ -710,6 +787,8 @@ class ComputationBuilder { // * dying if die_immediately_on_error_ is true void NoteError(const Status& error); + void AddOpMetadata(OpRequest* request) const; + string name_; // Name to use for the built computation. // The first error encountered while building the computation. @@ -728,6 +807,14 @@ class ComputationBuilder { // Mode bit that indicates whether to die when a first error is encountered. bool die_immediately_on_error_{false}; + // Mutex to guard against concurrent access to metadata_. + mutable tensorflow::mutex mutex_; + + // The metadata to attach to each op. This is structured as a "modal"-like + // operation, in order to simplify client code (and not sprinkle this metadata + // throughout the TensorFlow op kernel implementations). + OpMetadata metadata_ GUARDED_BY(mutex_); + TF_DISALLOW_COPY_AND_ASSIGN(ComputationBuilder); }; @@ -804,7 +891,7 @@ template ComputationDataHandle ComputationBuilder::ConstantR4FromArray4DWithLayout( const Array4D& values, const Layout& layout) { return ConstantOp([&values, &layout](Literal* literal) { - LiteralUtil::PopulateR4FromArray4D(values, layout, literal); + LiteralUtil::PopulateR4FromArray4DWithLayout(values, layout, literal); }); } diff --git a/tensorflow/compiler/xla/client/global_data.cc b/tensorflow/compiler/xla/client/global_data.cc index be706f7d232..40f59eaa68e 100644 --- a/tensorflow/compiler/xla/client/global_data.cc +++ b/tensorflow/compiler/xla/client/global_data.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/global_data.h" #include +#include #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/platform/logging.h" @@ -23,7 +24,7 @@ limitations under the License. namespace xla { GlobalData::GlobalData(ServiceInterface* parent, GlobalDataHandle handle) - : handle_(handle), parent_(parent) {} + : handle_(std::move(handle)), parent_(parent) {} GlobalData::~GlobalData() { UnregisterRequest request; diff --git a/tensorflow/compiler/xla/client/global_data.h b/tensorflow/compiler/xla/client/global_data.h index eb11d91034b..b7929357d06 100644 --- a/tensorflow/compiler/xla/client/global_data.h +++ b/tensorflow/compiler/xla/client/global_data.h @@ -23,13 +23,15 @@ limitations under the License. namespace xla { -// Wraps a GlobalDataHandle with a lifetime. 
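An illustrative sequence over the three batch-norm entry points declared above. The [batch, feature] operand layout, the `feature_index` of 1, and the epsilon value are assumptions for the sketch:

```c++
// Build training-mode batch norm, then reuse its statistics for inference.
#include "tensorflow/compiler/xla/client/computation_builder.h"

void BuildBatchNorm(xla::ComputationBuilder* b,
                    const xla::ComputationDataHandle& operand,
                    const xla::ComputationDataHandle& scale,
                    const xla::ComputationDataHandle& offset) {
  const float epsilon = 1e-5f;
  const xla::int64 feature_index = 1;

  // Training: yields a (normalized, batch_mean, batch_var) tuple.
  auto training =
      b->BatchNormTraining(operand, scale, offset, epsilon, feature_index);
  auto normalized = b->GetTupleElement(training, 0);
  auto batch_mean = b->GetTupleElement(training, 1);
  auto batch_var = b->GetTupleElement(training, 2);

  // Inference: reuses the estimated mean/variance rather than recomputing
  // them per batch, which is where the latency saving comes from.
  auto inference = b->BatchNormInference(operand, scale, offset, batch_mean,
                                         batch_var, epsilon, feature_index);
  (void)normalized;
  (void)inference;
}
```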
+// A GlobalData object represents a globally-accessible allocation of +// data in the associated XLA service. class GlobalData { public: // Gives ownership of the global data handle to this object. GlobalData(ServiceInterface* parent, GlobalDataHandle handle); - // Unregisters the wrapped handle. + // Unregisters the wrapped handle, which causes the service to + // deallocate the associated data. ~GlobalData(); const GlobalDataHandle& handle() const { return handle_; } diff --git a/tensorflow/compiler/xla/client/lib/BUILD b/tensorflow/compiler/xla/client/lib/BUILD index e185beaedd3..86b16be62f0 100644 --- a/tensorflow/compiler/xla/client/lib/BUILD +++ b/tensorflow/compiler/xla/client/lib/BUILD @@ -19,6 +19,7 @@ cc_library( hdrs = ["arithmetic.h"], deps = [ "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:computation", diff --git a/tensorflow/compiler/xla/client/lib/arithmetic.cc b/tensorflow/compiler/xla/client/lib/arithmetic.cc index 31efd8ee64d..a45974b86b6 100644 --- a/tensorflow/compiler/xla/client/lib/arithmetic.cc +++ b/tensorflow/compiler/xla/client/lib/arithmetic.cc @@ -19,6 +19,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -64,4 +65,33 @@ Computation CreateScalarMinComputation(PrimitiveType type, return b->BuildAndNoteError(); } +Computation CreateScalarLogicalAndComputation(ComputationBuilder* builder) { + const Shape scalar = ShapeUtil::MakeShape(PRED, {}); + auto b = builder->CreateSubBuilder("logical_and"); + auto lhs = b->Parameter(0, scalar, "lhs"); + auto rhs = b->Parameter(1, scalar, "rhs"); + b->LogicalAnd(lhs, rhs); + return b->BuildAndNoteError(); +} + +Computation CreateScalarLogicalOrComputation(ComputationBuilder* builder) { + const Shape scalar = ShapeUtil::MakeShape(PRED, {}); + auto b = builder->CreateSubBuilder("logical_or"); + auto lhs = b->Parameter(0, scalar, "lhs"); + auto rhs = b->Parameter(1, scalar, "rhs"); + b->LogicalOr(lhs, rhs); + return b->BuildAndNoteError(); +} + +StatusOr Any(const ComputationDataHandle& predicates, + ComputationBuilder* builder) { + auto f = builder->ConstantR0(false); + Computation logical_or = CreateScalarLogicalOrComputation(builder); + TF_ASSIGN_OR_RETURN(std::unique_ptr predicates_shape, + builder->GetShape(predicates)); + std::vector all_dimensions(ShapeUtil::Rank(*predicates_shape)); + std::iota(all_dimensions.begin(), all_dimensions.end(), 0); + return builder->Reduce(predicates, f, logical_or, all_dimensions); +} + } // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/arithmetic.h b/tensorflow/compiler/xla/client/lib/arithmetic.h index 57fe7d74624..633086a2e7e 100644 --- a/tensorflow/compiler/xla/client/lib/arithmetic.h +++ b/tensorflow/compiler/xla/client/lib/arithmetic.h @@ -40,6 +40,18 @@ Computation CreateScalarMaxComputation(PrimitiveType type, Computation CreateScalarMinComputation(PrimitiveType type, ComputationBuilder* builder); +// Creates a scalar logical AND computation and returns it. +Computation CreateScalarLogicalAndComputation(ComputationBuilder* builder); + +// Creates a scalar logical OR computation and returns it. 
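How the new helpers compose: IsFinite produces a PRED array, and Any() reduces it with the scalar logical-OR computation over every dimension. A sketch; the function name is illustrative and errors propagate via TF_ASSIGN_OR_RETURN as elsewhere in this diff:

```c++
// Reduce a predicate array to a scalar with the new Any() helper.
#include "tensorflow/compiler/xla/client/computation_builder.h"
#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
#include "tensorflow/compiler/xla/status_macros.h"

tensorflow::Status BuildAnyFinite(xla::ComputationBuilder* b,
                                  const xla::ComputationDataHandle& values) {
  // PRED array of the same shape as `values`.
  auto finite = b->IsFinite(values);
  // Scalar PRED result: true iff at least one entry is finite. A zero-sized
  // predicate array vacuously reduces to false.
  TF_ASSIGN_OR_RETURN(xla::ComputationDataHandle any_finite,
                      xla::Any(finite, b));
  (void)any_finite;
  return tensorflow::Status::OK();
}
```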
+Computation CreateScalarLogicalOrComputation(ComputationBuilder* builder); + +// Returns whether any predicate in "predicates" is set. +// +// Note: if predicates is zero-sized, Any() vacuously returns false. +StatusOr Any(const ComputationDataHandle& predicates, + ComputationBuilder* builder); + } // namespace xla #endif // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_ARITHMETIC_H_ diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc index 384aae867b1..96944a53b7e 100644 --- a/tensorflow/compiler/xla/client/local_client.cc +++ b/tensorflow/compiler/xla/client/local_client.cc @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/backend.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" +#include "tensorflow/compiler/xla/service/service_executable_run_options.h" #include "tensorflow/compiler/xla/status_macros.h" namespace se = ::perftools::gputools; @@ -67,35 +68,13 @@ bool ExecutableBuildOptions::has_hybrid_result() const { } namespace { - -// Convenience class which holds an acquired stream from the backend and -// automatically releases it when destructed. -class StreamManager { - public: - static StatusOr> AcquireStream( - Backend* backend, int device_ordinal) { - TF_ASSIGN_OR_RETURN( - se::StreamExecutor * executor, - backend->stream_executor(device_ordinal == -1 - ? backend->default_device_ordinal() - : device_ordinal)); - TF_ASSIGN_OR_RETURN(std::unique_ptr stream, - backend->AcquireStream(executor)); - return WrapUnique(new StreamManager(backend, std::move(stream))); +StatusOr BorrowStreamForDevice(int device_ordinal, + Backend* backend) { + if (device_ordinal < 0) { + device_ordinal = backend->default_device_ordinal(); } - - ~StreamManager() { backend_->ReleaseStream(std::move(stream_)); } - - se::Stream* stream() const { return stream_.get(); } - - private: - StreamManager(Backend* backend, std::unique_ptr stream) - : backend_(backend), stream_(std::move(stream)) {} - - Backend* backend_; - std::unique_ptr stream_; -}; - + return backend->BorrowStream(device_ordinal); +} } // namespace LocalExecutable::LocalExecutable(std::unique_ptr executable, @@ -108,7 +87,7 @@ LocalExecutable::LocalExecutable(std::unique_ptr executable, tensorflow::Status LocalExecutable::ValidateExecutionOptions( const tensorflow::gtl::ArraySlice arguments, - const ExecutableRunOptions& options) { + const ExecutableRunOptions& options, const Backend& backend) { const ComputationLayout& computation_layout = executable_->module_config().entry_computation_layout(); @@ -177,71 +156,54 @@ tensorflow::Status LocalExecutable::ValidateExecutionOptions( run_executor->GetDeviceDescription().name().c_str()); } + if (!options.allocator()) { + return InvalidArgument("an allocator must be provided to ExecuteLocally"); + } + + if (options.allocator()->platform() != backend.platform()) { + return InvalidArgument( + "allocator platform (%s) does not match service platform (%s)", + options.allocator()->platform()->Name().c_str(), + backend.platform()->Name().c_str()); + } + return tensorflow::Status::OK(); } StatusOr> LocalExecutable::Run( const tensorflow::gtl::ArraySlice arguments, const ExecutableRunOptions& options) { - TF_RETURN_IF_ERROR(ValidateExecutionOptions(arguments, options)); + TF_RETURN_IF_ERROR(ValidateExecutionOptions(arguments, options, *backend_)); ExecutableRunOptions actual_options = options; - std::unique_ptr acquired_stream; if (options.stream() == nullptr) { 
TF_ASSIGN_OR_RETURN( - acquired_stream, - StreamManager::AcquireStream(backend_, options.device_ordinal())); - actual_options.set_stream(acquired_stream->stream()); + Backend::StreamPtr stream, + BorrowStreamForDevice(options.device_ordinal(), backend_)); + actual_options.set_stream(stream.get()); } if (options.allocator() == nullptr) { actual_options.set_allocator(backend_->memory_allocator()); } - if (executable_->dumping()) { - return ExecuteAndDump(&actual_options, arguments); - } - return executable_->ExecuteOnStream(&actual_options, arguments, - /*hlo_execution_profile=*/nullptr); -} - -tensorflow::Status LocalExecutable::Run( - const tensorflow::gtl::ArraySlice arguments, - const ExecutableRunOptions& options, ShapedBuffer* result) { - const ComputationLayout& computation_layout = - executable_->module_config().entry_computation_layout(); - TF_RETURN_IF_ERROR(ValidateExecutionOptions(arguments, options)); - - if (!computation_layout.result_layout().MatchesLayoutInShape( - result->shape())) { - return InvalidArgument( - "result buffer does not match shape or layout of computation result: " - "expected %s, got %s", - ShapeUtil::HumanString(computation_layout.result_layout().shape()) - .c_str(), - ShapeUtil::HumanString(result->shape()).c_str()); - } - - ExecutableRunOptions actual_options = options; - std::unique_ptr acquired_stream; - if (options.stream() == nullptr) { - TF_ASSIGN_OR_RETURN( - acquired_stream, - StreamManager::AcquireStream(backend_, options.device_ordinal())); - actual_options.set_stream(acquired_stream->stream()); - } - if (options.allocator() == nullptr) { - actual_options.set_allocator(backend_->memory_allocator()); - } + // For local client execution on CPU backends: + // *) The thread pool used for eigen CPU ops is from + // ExecutableRunOptions.eigen_intra_op_thread_pool. + // *) The thread pool used for XLA CPU ops is from + // backend_->eigen_intra_op_thread_pool(). 
+ ServiceExecutableRunOptions service_options( + actual_options, backend_->StreamBorrower(), + backend_->eigen_intra_op_thread_pool()); if (executable_->dumping()) { - return Unimplemented("dumping execution not supported on this path"); + return ExecuteAndDump(&service_options, arguments); } - return executable_->ExecuteOnStream(&actual_options, arguments, result, - /*hlo_execution_profile=*/nullptr); + return executable_->ExecuteOnStreamWrapper<StatusOr<std::unique_ptr<ShapedBuffer>>>( + &service_options, options.execution_profile(), arguments); } StatusOr<std::unique_ptr<ShapedBuffer>> LocalExecutable::ExecuteAndDump( - const ExecutableRunOptions* run_options, + const ServiceExecutableRunOptions* run_options, const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) { executable_->session_module()->set_execution_platform( backend_->platform()->Name()); @@ -260,8 +222,9 @@ tensorflow::Status LocalExecutable::RecordArguments( SessionModule* session_module) { session_module->clear_arguments(); for (const ShapedBuffer* argument : arguments) { - TF_RETURN_IF_ERROR( - LiteralFromShapedBuffer(*argument, session_module->add_arguments())); + Literal literal; + TF_RETURN_IF_ERROR(LiteralFromShapedBuffer(*argument, &literal)); + *session_module->add_arguments() = literal.ToProto(); } return tensorflow::Status::OK(); } @@ -269,9 +232,13 @@ tensorflow::Status LocalExecutable::RecordResult( const ShapedBuffer* result, SessionModule* session_module) { session_module->clear_result(); - return LiteralFromShapedBuffer(*result, session_module->mutable_result()); + Literal literal(session_module->result()); + TF_RETURN_IF_ERROR(LiteralFromShapedBuffer(*result, &literal)); + *session_module->mutable_result() = literal.ToProto(); + return tensorflow::Status::OK(); } +// TODO(dnovillo) Change signature to return StatusOr.
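An end-to-end sketch of the reworked local execution path: create a LocalClient with an explicit intra-op thread pool size, compile, then run with an ExecutionProfile attached through the new ExecutableRunOptions::set_execution_profile(). Shapes and the thread count are illustrative; note that Run() now requires an allocator and borrows a stream from the backend when none is supplied:

```c++
// Compile a Computation and run it locally with profiling enabled.
#include "tensorflow/compiler/xla/client/client_library.h"
#include "tensorflow/compiler/xla/client/local_client.h"
#include "tensorflow/compiler/xla/status_macros.h"

xla::StatusOr<std::unique_ptr<xla::ShapedBuffer>> CompileAndRun(
    const xla::Computation& computation, const xla::Shape& argument_shape,
    const xla::ShapedBuffer& argument) {
  xla::LocalClientOptions client_options;
  client_options.set_intra_op_parallelism_threads(4);
  TF_ASSIGN_OR_RETURN(
      xla::LocalClient * client,
      xla::ClientLibrary::GetOrCreateLocalClient(client_options));

  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::LocalExecutable> executable,
                      client->Compile(computation, {&argument_shape},
                                      xla::ExecutableBuildOptions()));

  // An allocator is mandatory and must match the service platform; the
  // stream, left unset here, is borrowed from the backend by Run().
  xla::ExecutionProfile profile;
  xla::ExecutableRunOptions run_options;
  run_options.set_allocator(client->mutable_backend()->memory_allocator());
  run_options.set_execution_profile(&profile);
  return executable->Run({&argument}, run_options);
}
```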
tensorflow::Status LocalExecutable::LiteralFromShapedBuffer( const ShapedBuffer& shaped_buffer, Literal* literal) { TF_ASSIGN_OR_RETURN( @@ -290,62 +257,6 @@ StatusOr> LocalClient::AllocateBufferOnDevice( return std::unique_ptr(new GlobalData(local_service_, handle)); } -tensorflow::Status LocalClient::ResolveArguments( - const tensorflow::gtl::ArraySlice arguments, - int device_ordinal, - std::vector* argument_ptrs) { - return local_service_->ResolveArguments(arguments, device_ordinal, - argument_ptrs); -} - -StatusOr> LocalClient::ExecuteLocally( - const Computation& computation, - const tensorflow::gtl::ArraySlice arguments, - const LocalExecuteOptions& options) { - return local_service_->ExecuteLocally(computation.handle(), arguments, - options); -} - -tensorflow::Status LocalClient::ExecuteLocally( - const Computation& computation, - const tensorflow::gtl::ArraySlice arguments, - const LocalExecuteOptions& options, ShapedBuffer* result) { - return local_service_->ExecuteLocally(computation.handle(), arguments, - options, result); -} - -StatusOr>> -LocalClient::CompileAheadOfTime( - const tensorflow::gtl::ArraySlice - computations, - const AotCompilationOptions& options) { - std::vector service_instances; - service_instances.reserve(computations.size()); - for (const AheadOfTimeComputationInstance& instance : computations) { - service_instances.push_back({}); - LocalService::AheadOfTimeComputationInstance& service_instance = - service_instances.back(); - TF_RET_CHECK(instance.computation != nullptr); - service_instance.computation = instance.computation->handle(); - service_instance.argument_layouts = instance.argument_layouts; - service_instance.result_layout = instance.result_layout; - } - return local_service_->CompileAheadOfTime(service_instances, options); -} - -int64 LocalClient::PointerSizeForTriple(tensorflow::StringPiece target_triple) { - llvm::Triple triple( - llvm::Triple::normalize(llvm_ir::AsStringRef(target_triple))); - if (triple.isArch64Bit()) { - return 8; - } else if (triple.isArch32Bit()) { - return 4; - } else { - CHECK(triple.isArch16Bit()); - return 2; - } -} - se::Platform* LocalClient::platform() const { return local_service_->backend().platform(); } @@ -362,6 +273,14 @@ int LocalClient::default_device_ordinal() const { return local_service_->backend().default_device_ordinal(); } +const Backend& LocalClient::backend() const { + return local_service_->backend(); +} + +Backend* LocalClient::mutable_backend() { + return local_service_->mutable_backend(); +} + StatusOr> LocalClient::Compile( const Computation& computation, const tensorflow::gtl::ArraySlice argument_layouts, diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h index 33366b97fd5..c903cd27112 100644 --- a/tensorflow/compiler/xla/client/local_client.h +++ b/tensorflow/compiler/xla/client/local_client.h @@ -56,7 +56,7 @@ class ExecutableBuildOptions { // If set, this specifies the layout of the result of the computation. If not // set, the service will chose the layout of the result. A Shape is used to - // store the layout to accomodate tuple result shapes. A value of nullptr + // store the layout to accommodate tuple result shapes. A value of nullptr // indicates the option has not been set. 
ExecutableBuildOptions& set_result_layout(const Shape& shape_with_layout); const Shape* result_layout() const; @@ -83,12 +83,6 @@ class LocalExecutable { const tensorflow::gtl::ArraySlice arguments, const ExecutableRunOptions& options); - // Overload which places the computation result in the given preallocated - // buffer. - tensorflow::Status Run( - const tensorflow::gtl::ArraySlice arguments, - const ExecutableRunOptions& options, ShapedBuffer* result); - // Return the layout (contained in a shape) of the result produced by the // computation. const Shape& result_layout() const { @@ -117,12 +111,12 @@ class LocalExecutable { // of the computation. tensorflow::Status ValidateExecutionOptions( const tensorflow::gtl::ArraySlice arguments, - const ExecutableRunOptions& options); + const ExecutableRunOptions& options, const Backend& backend); // Records the computation in a SessionModule proto with the arguments used to // invoke it, and the result. Enabled by flag: --tla_dump_executions_to. StatusOr> ExecuteAndDump( - const ExecutableRunOptions* run_options, + const ServiceExecutableRunOptions* run_options, const tensorflow::gtl::ArraySlice arguments); // Records the arguments used to invoke the computation in a SessionModule @@ -154,7 +148,7 @@ class LocalExecutable { const ExecutableBuildOptions& build_options_; }; -// An XLA service client object for use when the client and service run in +// An XLA Client specialization for use when the client and service run in // the same process. class LocalClient : public Client { public: @@ -164,14 +158,6 @@ class LocalClient : public Client { LocalClient(const LocalClient&) = delete; void operator=(const LocalClient&) = delete; - // For an array of arguments held on the local service, validate - // that each is placed on the specified device_ordinal, and return - // the DeviceMemoryBase corresponding to each argument. - tensorflow::Status ResolveArguments( - const tensorflow::gtl::ArraySlice arguments, - int device_ordinal, - std::vector* argument_ptrs); - // Return a handle to a buffer large enough to hold shape, allocated // on device_ordinal on the local service. If // allocate_space_for_deep_copy, the buffer is large enough to hold @@ -181,37 +167,6 @@ class LocalClient : public Client { const Shape& shape, int device_ordinal, bool allocate_space_for_deep_copy); - // Executes the given computation with the given arguments and - // options. Arguments and result are "zero-copy", and are passed as pointers - // to device memory. See LocalExecuteOptions class comments for description of - // available options. The returned ShapedBuffer includes pointer(s) to device - // memory (DeviceMemoryBase) which are the caller's responsibility to - // deallocate. The layout of the result is chosen by the XLA service and - // should not be relied upon to be a specific value. If a specific result - // layout is needed, then the layout should be set in options. - // - // The arrays of arguments with different shapes or layouts are assumed not to - // alias. - // - // TODO(b/31220873): Remove ExecuteLocally methods. The path forward is to use - // Compile and run the returned LocalExecutable. - StatusOr> ExecuteLocally( - const Computation& computation, - const tensorflow::gtl::ArraySlice arguments, - const LocalExecuteOptions& options); - - // Overload of ExecuteLocally which writes the result into the given - // ShapedBuffer "result". 
Result is const because the ShapedBuffer data - // structure itself is not modified, only the buffers in device memory to - // which it refers. - // - // TODO(b/31220873): Remove ExecuteLocally methods. The path forward is to use - // Compile and run the returned LocalExecutable. - tensorflow::Status ExecuteLocally( - const Computation& computation, - const tensorflow::gtl::ArraySlice arguments, - const LocalExecuteOptions& options, ShapedBuffer* result); - // Build and return a LocalExecutable object. The executable is compiled using // the given argument layouts and options. StatusOr> Compile( @@ -219,30 +174,6 @@ class LocalClient : public Client { const tensorflow::gtl::ArraySlice argument_layouts, const ExecutableBuildOptions& options); - // A description of a computation to compile using CompileAheadOfTime. - struct AheadOfTimeComputationInstance { - const Computation* computation; - // Inform the compiler of the expected layout for arguments. - std::vector argument_layouts; - // Specifies the expected result layout. - const Shape* result_layout; - }; - - // Compiles a list of computations for ahead-of-time execution. This is - // intended for use in static compilation. The |options| parameter describes - // the target for which the compiler should emit code. - // - // TODO(b/31222190): This doesn't really belong in LocalClient. Move it to its - // own library. - StatusOr>> - CompileAheadOfTime( - const tensorflow::gtl::ArraySlice - computations, - const AotCompilationOptions& options); - - // Returns the size of a pointer in bytes for a given triple. - static int64 PointerSizeForTriple(tensorflow::StringPiece triple); - // Returns the platform that the underlying service targets. perftools::gputools::Platform* platform() const; @@ -261,6 +192,10 @@ class LocalClient : public Client { // capability). bool device_ordinal_supported(int device_ordinal) const; + // Returns the backend used to execute computations. + const Backend& backend() const; + Backend* mutable_backend(); + private: LocalService* local_service_; }; diff --git a/tensorflow/compiler/xla/client/padding.cc b/tensorflow/compiler/xla/client/padding.cc index 281fa104084..0b18d8946a2 100644 --- a/tensorflow/compiler/xla/client/padding.cc +++ b/tensorflow/compiler/xla/client/padding.cc @@ -35,7 +35,7 @@ std::vector> MakePadding( return low_high_padding; case Padding::kSame: - for (int64 i = 0; i < input_dimensions.size(); ++i) { + for (size_t i = 0; i < input_dimensions.size(); ++i) { int64 input_dimension = input_dimensions[i]; int64 window_dimension = window_dimensions[i]; int64 window_stride = window_strides[i]; diff --git a/tensorflow/compiler/xla/differential_set.h b/tensorflow/compiler/xla/differential_set.h deleted file mode 100644 index 9eae24ce30e..00000000000 --- a/tensorflow/compiler/xla/differential_set.h +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_XLA_DIFFERENTIAL_SET_H_ -#define TENSORFLOW_COMPILER_XLA_DIFFERENTIAL_SET_H_ - -#include - -#include "tensorflow/core/platform/macros.h" - -namespace xla { - -// In the base case, the differential set is just a set. -// However, you can also point a differential set at another differential set to -// use as a "parent". This makes a chain of sets, which each node in the chain -// adds some number of elements to the "Contains" property. -// -// E.g. if the base set holds {1, 2}, you can create a derived set that holds -// {3}, and the derived set will tell you it contains {1, 2, 3} whereas the base -// will continue to tell you it holds only {1, 2}. -template -class DifferentialSet { - public: - // Constructs a differential set capable of holding values. It may have an - // ancestor link, which would it into a chain of sets. - explicit DifferentialSet(const DifferentialSet* parent = nullptr) - : parent_(parent) {} - - // Adds a value to be held directly by this set. - void Add(T value) { held_.insert(value); } - - // Returns whether this set holds the given value, or any ancestor in the - // chain of sets. - bool Contains(T value) const { - return held_.find(value) != held_.end() || - (parent_ != nullptr && parent_->Contains(value)); - } - - private: - // Values held directly by this node in the chain of sets. - std::set held_; - - // Parent node in the chain of sets. - const DifferentialSet* parent_; - - TF_DISALLOW_COPY_AND_ASSIGN(DifferentialSet); -}; - -} // namespace xla - -#endif // TENSORFLOW_COMPILER_XLA_DIFFERENTIAL_SET_H_ diff --git a/tensorflow/compiler/xla/differential_set_test.cc b/tensorflow/compiler/xla/differential_set_test.cc deleted file mode 100644 index dacbbcc1adb..00000000000 --- a/tensorflow/compiler/xla/differential_set_test.cc +++ /dev/null @@ -1,51 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/xla/differential_set.h" - -#include "tensorflow/core/platform/test.h" - -namespace xla { -namespace { - -TEST(DifferentialSetTest, TellsWhetherSetContainsSomethingHeld) { - DifferentialSet set; - set.Add(1); - set.Add(2); - EXPECT_FALSE(set.Contains(3)); - EXPECT_TRUE(set.Contains(1)); - EXPECT_TRUE(set.Contains(2)); - EXPECT_FALSE(set.Contains(0)); -} - -TEST(DifferentialSetTest, TellsWhetherSetContainsSomethingParentHolds) { - DifferentialSet parent; - parent.Add(1); - DifferentialSet child(&parent); - child.Add(2); - - // Test properties of the child. - EXPECT_FALSE(child.Contains(3)); - EXPECT_TRUE(child.Contains(1)); - EXPECT_TRUE(child.Contains(2)); - EXPECT_FALSE(child.Contains(0)); - - // Test properties of the parent. 
- EXPECT_TRUE(parent.Contains(1)); - EXPECT_FALSE(parent.Contains(2)); -} - -} // namespace -} // namespace xla diff --git a/tensorflow/compiler/xla/executable_run_options.cc b/tensorflow/compiler/xla/executable_run_options.cc index 1c54fec97ce..67f3a6c1df4 100644 --- a/tensorflow/compiler/xla/executable_run_options.cc +++ b/tensorflow/compiler/xla/executable_run_options.cc @@ -67,4 +67,14 @@ const Eigen::ThreadPoolDevice* ExecutableRunOptions::intra_op_thread_pool() return intra_op_thread_pool_; } +ExecutableRunOptions& ExecutableRunOptions::set_execution_profile( + ExecutionProfile* profile) { + execution_profile_ = profile; + return *this; +} + +ExecutionProfile* ExecutableRunOptions::execution_profile() const { + return execution_profile_; +} + } // namespace xla diff --git a/tensorflow/compiler/xla/executable_run_options.h b/tensorflow/compiler/xla/executable_run_options.h index 212fce9eab7..03f2d016ad0 100644 --- a/tensorflow/compiler/xla/executable_run_options.h +++ b/tensorflow/compiler/xla/executable_run_options.h @@ -40,6 +40,7 @@ struct ThreadPoolDevice; namespace xla { class DeviceMemoryAllocator; +class ExecutionProfile; // Class containing options for running a LocalExecutable. class ExecutableRunOptions { @@ -74,12 +75,17 @@ class ExecutableRunOptions { const Eigen::ThreadPoolDevice* intra_op_thread_pool); const Eigen::ThreadPoolDevice* intra_op_thread_pool() const; + // If set, profiling information is written to 'profile'. + ExecutionProfile* execution_profile() const; + ExecutableRunOptions& set_execution_profile(ExecutionProfile* profile); + private: DeviceMemoryAllocator* allocator_ = nullptr; int device_ordinal_ = -1; perftools::gputools::Stream* stream_ = nullptr; tensorflow::thread::ThreadPool* inter_op_thread_pool_ = nullptr; const Eigen::ThreadPoolDevice* intra_op_thread_pool_ = nullptr; + ExecutionProfile* execution_profile_ = nullptr; }; } // namespace xla diff --git a/tensorflow/compiler/xla/index_util.cc b/tensorflow/compiler/xla/index_util.cc index 901fcd89ea2..76c0168f370 100644 --- a/tensorflow/compiler/xla/index_util.cc +++ b/tensorflow/compiler/xla/index_util.cc @@ -28,13 +28,13 @@ namespace xla { /* static */ int64 IndexUtil::MultidimensionalIndexToLinearIndex( const Shape& shape, tensorflow::gtl::ArraySlice multi_index) { - CHECK_EQ(shape.dimensions_size(), multi_index.size()); + DCHECK_EQ(shape.dimensions_size(), multi_index.size()); // Padding and nested layouts not supported yet. - CHECK_EQ(0, shape.layout().padded_dimensions_size()); + DCHECK_EQ(0, shape.layout().padded_dimensions_size()); - for (int i = 0; i < multi_index.size(); ++i) { - CHECK_GE(multi_index[i], 0); - CHECK_LT(multi_index[i], shape.dimensions(i)) + for (size_t i = 0; i < multi_index.size(); ++i) { + DCHECK_GE(multi_index[i], 0); + DCHECK_LT(multi_index[i], shape.dimensions(i)) << "indexing beyond extent in dimension " << i << ":" << "\n\tindex: " << tensorflow::str_util::Join(multi_index, ",") << "\n\tshape: " << ShapeUtil::HumanString(shape); @@ -77,9 +77,17 @@ namespace xla { // Scale factor holding the growing product of D{L(i)} terms. 
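The linearization loop above, restated as a standalone sketch: walk the dimensions from most-minor to most-major, scaling each index digit by the product of the extents of all more-minor dimensions. The first iteration skips the two multiplies, matching the change in this hunk:

```c++
// Minor-to-major linearization of a multidimensional index.
#include <cstdint>
#include <vector>

int64_t LinearIndex(const std::vector<int64_t>& extents_minor_to_major,
                    const std::vector<int64_t>& digits_minor_to_major) {
  int64_t scale = 1;
  int64_t linear = 0;
  bool first = true;
  for (size_t i = 0; i < digits_minor_to_major.size(); ++i) {
    if (first) {
      linear = digits_minor_to_major[i];  // scale is implicitly 1 here
      scale = extents_minor_to_major[i];
      first = false;
    } else {
      linear += scale * digits_minor_to_major[i];
      scale *= extents_minor_to_major[i];
    }
  }
  return linear;
}
// For a shape [3,4] with minor_to_major {1,0} and multi-index (2,3), the
// minor-to-major digits are {3,2} with extents {4,3}, so the linear index
// is 3 + 4*2 == 11.
```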
int64 scale = 1; int64 linear_index = 0; + bool first = true; for (auto dimension : shape.layout().minor_to_major()) { - linear_index += scale * multi_index[dimension]; - scale *= shape.dimensions(dimension); + if (first) { + // Avoid two multiplies on the first loop iteration + linear_index = multi_index[dimension]; + scale = shape.dimensions(dimension); + first = false; + } else { + linear_index += scale * multi_index[dimension]; + scale *= shape.dimensions(dimension); + } } return linear_index; } @@ -87,9 +95,9 @@ namespace xla { /* static */ std::vector IndexUtil::LinearIndexToMultidimensionalIndex( const Shape& shape, int64 linear_index) { // Padding and nested layouts not supported yet. - CHECK_EQ(0, shape.layout().padded_dimensions_size()); - CHECK_GE(linear_index, 0); - CHECK_LT(linear_index, ShapeUtil::ElementsIn(shape)); + DCHECK_EQ(0, shape.layout().padded_dimensions_size()); + DCHECK_GE(linear_index, 0); + DCHECK_LT(linear_index, ShapeUtil::ElementsIn(shape)); // The following formula computes each element of the multidimensional index // (See comments in MultidimensionalIndexToLinearIndex for notation): @@ -110,17 +118,36 @@ namespace xla { return multi_index; } -/* static */ bool IndexUtil::BumpIndices(const Shape& shape, - std::vector* indices) { - for (int64 dimno = indices->size() - 1; dimno >= 0; --dimno) { +/* static */ bool IndexUtil::BumpIndices( + const Shape& shape, tensorflow::gtl::MutableArraySlice indices) { + for (int64 dimno = indices.size() - 1; dimno >= 0; --dimno) { int64 limit = shape.dimensions(dimno); - if ((*indices)[dimno] + 1 < limit) { - (*indices)[dimno]++; - std::fill(indices->begin() + dimno + 1, indices->end(), 0); + if (indices[dimno] + 1 < limit) { + indices[dimno]++; + std::fill(indices.begin() + dimno + 1, indices.end(), 0); return true; } } return false; } +/* static */ int64 IndexUtil::GetDimensionStride(const Shape& shape, + int64 dimension) { + const Layout& layout = shape.layout(); + int64 pdim_size = layout.padded_dimensions_size(); + int64 stride = 1; + DCHECK(pdim_size == 0 || pdim_size == shape.dimensions_size()); + for (auto dim : layout.minor_to_major()) { + if (dim == dimension) { + break; + } + if (pdim_size == 0) { + stride *= shape.dimensions(dim); + } else { + stride *= layout.padded_dimensions(dim); + } + } + return stride; +} + } // namespace xla diff --git a/tensorflow/compiler/xla/index_util.h b/tensorflow/compiler/xla/index_util.h index 2d8753c3fe8..c9838966a5b 100644 --- a/tensorflow/compiler/xla/index_util.h +++ b/tensorflow/compiler/xla/index_util.h @@ -58,7 +58,16 @@ class IndexUtil { // // Returns true iff the indices were successfully bumped; false if we've hit // the limit where it can no longer be bumped in-bounds. - static bool BumpIndices(const Shape& shape, std::vector* indices); + static bool BumpIndices(const Shape& shape, + tensorflow::gtl::MutableArraySlice indices); + + // Calculates the stride size (in number of elements, not byte size) of a + // given logical shape dimension (from 0 to rank-1). If available, padded + // dimensions are used. 
+ // Example: + // GetDimensionStride(F32[5,8,10,4]{3,2,1,0}, 1) == + // sizeof(dimension(3)) * sizeof(dimension(2)) == 4 * 10 + static int64 GetDimensionStride(const Shape& shape, int64 dimension); private: TF_DISALLOW_COPY_AND_ASSIGN(IndexUtil); diff --git a/tensorflow/compiler/xla/index_util_test.cc b/tensorflow/compiler/xla/index_util_test.cc index 85259b33f0b..7c4efdee484 100644 --- a/tensorflow/compiler/xla/index_util_test.cc +++ b/tensorflow/compiler/xla/index_util_test.cc @@ -18,9 +18,8 @@ limitations under the License. #include #include "tensorflow/compiler/xla/shape_util.h" -#include "tensorflow/compiler/xla/test_helpers.h" +#include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/platform/test.h" namespace xla { namespace { @@ -144,14 +143,11 @@ TEST(IndexUtilTest, BumpIndices2x2) { auto shape = ShapeUtil::MakeShape(S32, {2, 2}); std::vector indices = {0, 0}; EXPECT_TRUE(IndexUtil::BumpIndices(shape, &indices)); - EXPECT_MATCH(indices, - testing::VectorMatcher(std::vector{0, 1})); + EXPECT_THAT(indices, ::testing::ElementsAre(0, 1)); EXPECT_TRUE(IndexUtil::BumpIndices(shape, &indices)); - EXPECT_MATCH(indices, - testing::VectorMatcher(std::vector{1, 0})); + EXPECT_THAT(indices, ::testing::ElementsAre(1, 0)); EXPECT_TRUE(IndexUtil::BumpIndices(shape, &indices)); - EXPECT_MATCH(indices, - testing::VectorMatcher(std::vector{1, 1})); + EXPECT_THAT(indices, ::testing::ElementsAre(1, 1)); EXPECT_FALSE(IndexUtil::BumpIndices(shape, &indices)); } diff --git a/tensorflow/compiler/xla/layout_util_test.cc b/tensorflow/compiler/xla/layout_util_test.cc index 531a6e03dad..d3fcccff654 100644 --- a/tensorflow/compiler/xla/layout_util_test.cc +++ b/tensorflow/compiler/xla/layout_util_test.cc @@ -14,11 +14,10 @@ limitations under the License. 
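As a worked version of the `GetDimensionStride` example above: the stride of a dimension is the product of the sizes of every dimension that is more minor in the layout. A self-contained sketch (illustrative names; only the unpadded branch is modeled):

```c++
#include <cstdint>
#include <vector>

// Stride, in elements, of `dimension`, given sizes and minor-to-major order.
int64_t DimensionStrideSketch(const std::vector<int64_t>& dims,
                              const std::vector<int64_t>& minor_to_major,
                              int64_t dimension) {
  int64_t stride = 1;
  for (int64_t dim : minor_to_major) {
    if (dim == dimension) break;  // Stop once the queried dimension is reached.
    stride *= dims[dim];
  }
  return stride;
}

// For F32[5,8,10,4]{3,2,1,0}: DimensionStrideSketch({5,8,10,4}, {3,2,1,0}, 1)
// multiplies the sizes of dimensions 3 and 2, giving 4 * 10 == 40.
```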
==============================================================================*/ #include "tensorflow/compiler/xla/layout_util.h" -#include "tensorflow/compiler/xla/shape_util.h" - #include "tensorflow/compiler/xla/legacy_flags/layout_util_flags.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" -#include "tensorflow/core/platform/test.h" namespace xla { namespace { @@ -114,8 +113,8 @@ TEST_F(LayoutUtilTest, CopyLayoutNotCompatibleDifferentRank) { Shape dst = MakeShapeWithLayout(F32, {2, 3}, {1, 0}); auto status = LayoutUtil::CopyLayoutBetweenShapes(src, &dst); EXPECT_FALSE(status.ok()); - EXPECT_MATCH(status.error_message(), - testing::ContainsRegex("cannot copy layout from shape")); + EXPECT_THAT(status.error_message(), + ::testing::ContainsRegex("cannot copy layout from shape")); } TEST_F(LayoutUtilTest, CopyLayoutNotCompatibleTuple) { @@ -133,8 +132,8 @@ TEST_F(LayoutUtilTest, CopyLayoutNotCompatibleTuple) { auto status = LayoutUtil::CopyLayoutBetweenShapes(src, &dst); EXPECT_FALSE(status.ok()); - EXPECT_MATCH(status.error_message(), - testing::ContainsRegex("cannot copy layout from shape")); + EXPECT_THAT(status.error_message(), + ::testing::ContainsRegex("cannot copy layout from shape")); } TEST_F(LayoutUtilTest, CopyLayoutBogusLayout) { @@ -145,9 +144,10 @@ TEST_F(LayoutUtilTest, CopyLayoutBogusLayout) { auto status = LayoutUtil::CopyLayoutBetweenShapes(src, &dst); EXPECT_FALSE(status.ok()); - EXPECT_MATCH(status.error_message(), - testing::ContainsRegex("layout minor_to_major field contains .* " - "elements, but shape is rank")); + EXPECT_THAT( + status.error_message(), + ::testing::ContainsRegex("layout minor_to_major field contains .* " + "elements, but shape is rank")); } TEST_F(LayoutUtilTest, ClearLayoutTuple) { diff --git a/tensorflow/compiler/xla/legacy_flags/BUILD b/tensorflow/compiler/xla/legacy_flags/BUILD index 79ff81262e9..a147ce67a28 100644 --- a/tensorflow/compiler/xla/legacy_flags/BUILD +++ b/tensorflow/compiler/xla/legacy_flags/BUILD @@ -29,6 +29,7 @@ cc_library( cc_test( name = "parse_flags_from_env_test", + size = "small", srcs = ["parse_flags_from_env_test.cc"], deps = [ @@ -65,6 +66,20 @@ cc_library( ], ) +cc_library( + name = "debug_options_flags", + srcs = ["debug_options_flags.cc"], + hdrs = ["debug_options_flags.h"], + deps = + [ + ":parse_flags_from_env", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:xla_proto", + "//tensorflow/core:framework_internal", + "//tensorflow/core:lib", + ], +) + cc_library( name = "cpu_compiler_flags", srcs = ["cpu_compiler_flags.cc"], @@ -160,18 +175,6 @@ cc_library( ], ) -cc_library( - name = "hlo_pass_pipeline_flags", - srcs = ["hlo_pass_pipeline_flags.cc"], - hdrs = ["hlo_pass_pipeline_flags.h"], - deps = [ - ":parse_flags_from_env", - "//tensorflow/compiler/xla:types", - "//tensorflow/core:framework_internal", - "//tensorflow/core:lib", - ], -) - cc_library( name = "alias_analysis_flags", srcs = ["alias_analysis_flags.cc"], @@ -240,6 +243,18 @@ cc_library( ], ) +cc_library( + name = "user_computation_flags", + srcs = ["user_computation_flags.cc"], + hdrs = ["user_computation_flags.h"], + deps = [ + ":parse_flags_from_env", + "//tensorflow/compiler/xla:types", + "//tensorflow/core:framework_internal", + "//tensorflow/core:lib", + ], +) + # ----------------------------------------------------------------------------- filegroup( diff --git a/tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.cc 
b/tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.cc index f8ae25552d4..13d41a8636b 100644 --- a/tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.cc +++ b/tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.cc @@ -36,23 +36,15 @@ static std::once_flag flags_init; // Allocate *flags. Called via call_once(&flags_init,...). static void AllocateFlags() { flags = new CpuCompilerFlags; - flags->xla_cpu_llvm_opt_level = 2; - flags->xla_cpu_llvm_cl_opts = ""; flags->xla_cpu_embed_ir = false; - flags->xla_cpu_parallel = false; + flags->xla_cpu_dump_debug_json_to = ""; flag_list = new std::vector({ - tensorflow::Flag( - "xla_cpu_llvm_opt_level", &flags->xla_cpu_llvm_opt_level, - "The LLVM optimization level for the CPU XLA backend. " - "Valid range is from 0 to 3 where 0 means no optimizations."), - tensorflow::Flag( - "xla_cpu_llvm_cl_opts", &flags->xla_cpu_llvm_cl_opts, - "Comma-separated list of command line options to pass to LLVM."), tensorflow::Flag( "xla_cpu_embed_ir", &flags->xla_cpu_embed_ir, "Embed the LLVM IR module string in the resultant CpuExecutable."), - tensorflow::Flag("xla_cpu_parallel", &flags->xla_cpu_parallel, - "Use the multi-threaded CPU backend."), + tensorflow::Flag("xla_cpu_dump_debug_json_to", + &flags->xla_cpu_dump_debug_json_to, + "Dump debug JSON to this directory."), }); ParseFlagsFromEnv(*flag_list); } diff --git a/tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h b/tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h index 16a7b687116..bac498e18eb 100644 --- a/tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h +++ b/tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h @@ -33,14 +33,9 @@ void AppendCpuCompilerFlags(std::vector* flag_list); // The values of flags associated with XLA's cpu_compiler module. typedef struct { - // The LLVM optimization level for the CPU XLA backend. - // Valid range is from 0 to 3 where 0 means no optimizations. - int32 xla_cpu_llvm_opt_level; - string xla_cpu_llvm_cl_opts; // Comma-separated list of command line options - // to pass to LLVM. bool xla_cpu_embed_ir; // Embed the LLVM IR module string in the resultant // CpuExecutable - bool xla_cpu_parallel; // Use the multi-threaded CPU backend. + string xla_cpu_dump_debug_json_to; // Dump debug JSON to this directory. } CpuCompilerFlags; // Return a pointer to the CpuCompilerFlags struct; diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc new file mode 100644 index 00000000000..5e3c4f912bf --- /dev/null +++ b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc @@ -0,0 +1,126 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" + +#include // NOLINT(build/c++11): only using std::call_once, not mutex. 
+#include <vector>
+
+#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+
+namespace xla {
+namespace legacy_flags {
+
+struct DebugOptionsFlags {
+  string xla_generate_hlo_graph;
+  string xla_disable_hlo_passes;
+  bool xla_enable_fast_math;
+  int32 xla_backend_optimization_level;
+  string xla_backend_extra_options;
+};
+
+namespace {
+
+DebugOptionsFlags* flag_values;
+std::vector<tensorflow::Flag>* flag_objects;
+std::once_flag flags_init;
+
+// Allocates flag_values and flag_objects; this function must not be called
+// more than once - it is invoked exactly once via std::call_once.
+void AllocateFlags() {
+  flag_values = new DebugOptionsFlags;
+  flag_values->xla_generate_hlo_graph = "";
+  flag_values->xla_disable_hlo_passes = "";
+  flag_values->xla_enable_fast_math = true;
+  flag_values->xla_backend_optimization_level = 2;
+  flag_values->xla_backend_extra_options = "";
+
+  flag_objects = new std::vector<tensorflow::Flag>(
+      {tensorflow::Flag(
+           "xla_generate_hlo_graph", &flag_values->xla_generate_hlo_graph,
+           "HLO modules matching this regex will be dumped to a .dot file "
+           "throughout various stages in compilation."),
+
+       tensorflow::Flag(
+           "xla_enable_fast_math", &flag_values->xla_enable_fast_math,
+           "Enable unsafe fast-math optimizations in the compiler; "
+           "this may produce faster code at the expense of some accuracy."),
+       tensorflow::Flag(
+           "xla_backend_optimization_level",
+           &flag_values->xla_backend_optimization_level,
+           "Numerical optimization level for the XLA compiler backend."),
+
+       tensorflow::Flag("xla_backend_extra_options",
+                        &flag_values->xla_backend_extra_options,
+                        "Extra options to pass to a backend; "
+                        "comma-separated list of 'key=val' strings (=val "
+                        "may be omitted); no whitespace around commas."),
+
+       tensorflow::Flag(
+           "xla_disable_hlo_passes", &flag_values->xla_disable_hlo_passes,
+           "Comma-separated list of HLO passes to be disabled. These names "
+           "must exactly match the passes' names; "
+           "no whitespace around commas.")});
+  ParseFlagsFromEnv(*flag_objects);
+}
+
+}  // namespace
+
+void AppendDebugOptionsFlags(std::vector<tensorflow::Flag>* flag_list) {
+  std::call_once(flags_init, &AllocateFlags);
+  flag_list->insert(flag_list->end(), flag_objects->begin(),
+                    flag_objects->end());
+}
+
+xla::DebugOptions GetDebugOptionsFromFlags() {
+  std::call_once(flags_init, &AllocateFlags);
+
+  DebugOptions options;
+  options.set_xla_generate_hlo_graph(flag_values->xla_generate_hlo_graph);
+
+  std::vector<string> disabled_passes =
+      tensorflow::str_util::Split(flag_values->xla_disable_hlo_passes, ',');
+  for (const auto& passname : disabled_passes) {
+    options.add_xla_disable_hlo_passes(passname);
+  }
+
+  options.set_xla_enable_fast_math(flag_values->xla_enable_fast_math);
+  options.set_xla_backend_optimization_level(
+      flag_values->xla_backend_optimization_level);
+
+  std::vector<string> extra_options_parts =
+      tensorflow::str_util::Split(flag_values->xla_backend_extra_options, ',');
+  auto* extra_options_map = options.mutable_xla_backend_extra_options();
+
+  // The flag contains a comma-separated list of options; some options have
+  // arguments following "=", some don't.
+ for (const auto& part : extra_options_parts) { + size_t eq_pos = part.find_first_of('='); + if (eq_pos == string::npos) { + (*extra_options_map)[part] = ""; + } else { + string value = ""; + if (eq_pos + 1 < part.size()) { + value = part.substr(eq_pos + 1); + } + (*extra_options_map)[part.substr(0, eq_pos)] = value; + } + } + + return options; +} + +} // namespace legacy_flags +} // namespace xla diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.h b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.h new file mode 100644 index 00000000000..d0ef8e66ab0 --- /dev/null +++ b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.h @@ -0,0 +1,38 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_FLAGS_H_ +#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_FLAGS_H_ + +#include + +#include "tensorflow/compiler/xla/xla.pb.h" +#include "tensorflow/core/util/command_line_flags.h" + +namespace xla { +namespace legacy_flags { + +// Appends flag definitions for debug options to flag_list. +void AppendDebugOptionsFlags(std::vector* flag_list); + +// Fetches a DebugOptions proto message from flags provided to the program. +// Flags must be registered with the flags parser using AppendDebugOptionsFlags +// first. 
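The loop above turns a string such as `"key=val,flag"` into the extra-options map, with a bare key mapped to an empty value. A standalone STL-only equivalent (hypothetical helper; the real code relies on `tensorflow::str_util::Split`):

```c++
#include <map>
#include <string>

// Splits "a=1,b,c=xyz" into {{"a","1"},{"b",""},{"c","xyz"}}, matching the
// semantics of the xla_backend_extra_options parsing above.
std::map<std::string, std::string> ParseExtraOptions(const std::string& s) {
  std::map<std::string, std::string> result;
  if (s.empty()) return result;
  size_t start = 0;
  while (start <= s.size()) {
    size_t comma = s.find(',', start);
    if (comma == std::string::npos) comma = s.size();
    const std::string part = s.substr(start, comma - start);
    const size_t eq = part.find('=');
    if (eq == std::string::npos) {
      result[part] = "";  // Bare key: empty value, as in the loop above.
    } else {
      result[part.substr(0, eq)] = part.substr(eq + 1);
    }
    start = comma + 1;
  }
  return result;
}
```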
+xla::DebugOptions GetDebugOptionsFromFlags(); + +} // namespace legacy_flags +} // namespace xla + +#endif // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_FLAGS_H_ diff --git a/tensorflow/compiler/xla/legacy_flags/gpu_backend_lib_flags.cc b/tensorflow/compiler/xla/legacy_flags/gpu_backend_lib_flags.cc index c355b1ed9b7..f8f6ea26b1d 100644 --- a/tensorflow/compiler/xla/legacy_flags/gpu_backend_lib_flags.cc +++ b/tensorflow/compiler/xla/legacy_flags/gpu_backend_lib_flags.cc @@ -38,7 +38,6 @@ static void AllocateFlags() { flags->dump_temp_products_to = ""; flags->ftz = false; flags->fma = true; - flags->gpu_architecture = "compute_35"; flags->verbose_ptx_asm = false; flags->kernel = ""; flags->llvm_dump_passes = false; @@ -51,8 +50,6 @@ static void AllocateFlags() { "If empty, no dump is produced"), tensorflow::Flag("ftz", &flags->ftz, "flush to zero semantics"), tensorflow::Flag("fma", &flags->fma, "use FMA synthesis"), - tensorflow::Flag("gpu_architecture", &flags->gpu_architecture, - "GPU architecture"), tensorflow::Flag("verbose_ptx_asm", &flags->verbose_ptx_asm, "emit PTX assembly with extra comments"), tensorflow::Flag("kernel", &flags->kernel, diff --git a/tensorflow/compiler/xla/legacy_flags/gpu_backend_lib_flags.h b/tensorflow/compiler/xla/legacy_flags/gpu_backend_lib_flags.h index fbb88634545..31cb50e9da9 100644 --- a/tensorflow/compiler/xla/legacy_flags/gpu_backend_lib_flags.h +++ b/tensorflow/compiler/xla/legacy_flags/gpu_backend_lib_flags.h @@ -36,7 +36,6 @@ typedef struct { string dump_temp_products_to; // temporary compilation products dir bool ftz; // flush to zero semantics bool fma; // use FMA synthesis - string gpu_architecture; // GPU architecture bool verbose_ptx_asm; // emit PTX assembly with extra comments string kernel; // only emit the IR and PTX for this kernel bool llvm_dump_passes; // dump the passes LLVM runs to stderr diff --git a/tensorflow/compiler/xla/legacy_flags/gpu_compiler_flags.cc b/tensorflow/compiler/xla/legacy_flags/gpu_compiler_flags.cc index e79d3635095..131e3ce70ac 100644 --- a/tensorflow/compiler/xla/legacy_flags/gpu_compiler_flags.cc +++ b/tensorflow/compiler/xla/legacy_flags/gpu_compiler_flags.cc @@ -38,7 +38,7 @@ static void AllocateFlags() { flags = new GpuCompilerFlags; flags->xla_gpu_embed_ir = false; flags->xla_cuda_data_dir = "./cuda_sdk_lib"; - flags->xla_ptxas_path = "/usr/local/cuda/bin/ptxas"; + flags->xla_gpu_dump_debug_json_to = ""; flag_list = new std::vector({ tensorflow::Flag( "xla_gpu_embed_ir", &flags->xla_gpu_embed_ir, @@ -50,6 +50,9 @@ static void AllocateFlags() { "runfile directories."), tensorflow::Flag("xla_ptxas_path", &flags->xla_ptxas_path, "The path to ptxas. Required to log stats of the ptx."), + tensorflow::Flag("xla_gpu_dump_debug_json_to", + &flags->xla_gpu_dump_debug_json_to, + "Dump debug JSON to this directory."), }); ParseFlagsFromEnv(*flag_list); } diff --git a/tensorflow/compiler/xla/legacy_flags/gpu_compiler_flags.h b/tensorflow/compiler/xla/legacy_flags/gpu_compiler_flags.h index 04ddedab732..0cf39e0ab35 100644 --- a/tensorflow/compiler/xla/legacy_flags/gpu_compiler_flags.h +++ b/tensorflow/compiler/xla/legacy_flags/gpu_compiler_flags.h @@ -41,6 +41,7 @@ typedef struct { // directories. string xla_ptxas_path; // The path to ptxas. Required to log stats of // the ptx. + string xla_gpu_dump_debug_json_to; // Dump debug JSON to this directory. 
} GpuCompilerFlags; // Return a pointer to the GpuCompilerFlags struct; diff --git a/tensorflow/compiler/xla/legacy_flags/hlo_graph_dumper_flags.cc b/tensorflow/compiler/xla/legacy_flags/hlo_graph_dumper_flags.cc index 8822f6f6107..ba43a591952 100644 --- a/tensorflow/compiler/xla/legacy_flags/hlo_graph_dumper_flags.cc +++ b/tensorflow/compiler/xla/legacy_flags/hlo_graph_dumper_flags.cc @@ -36,10 +36,14 @@ static std::once_flag flags_init; static void AllocateFlags() { flags = new HloGraphDumperFlags; flags->xla_hlo_dump_graph_path = "/tmp/"; + flags->xla_hlo_dump_as_graphdef = false; flag_list = new std::vector({ tensorflow::Flag("xla_hlo_dump_graph_path", &flags->xla_hlo_dump_graph_path, "Path to write dumped HLO graphs to"), + tensorflow::Flag("xla_hlo_dump_as_graphdef", + &flags->xla_hlo_dump_as_graphdef, + "Dumps HLO graphs as tensorflow GraphDefs"), }); ParseFlagsFromEnv(*flag_list); } diff --git a/tensorflow/compiler/xla/legacy_flags/hlo_graph_dumper_flags.h b/tensorflow/compiler/xla/legacy_flags/hlo_graph_dumper_flags.h index b6dfced87ca..d0b4d092ff1 100644 --- a/tensorflow/compiler/xla/legacy_flags/hlo_graph_dumper_flags.h +++ b/tensorflow/compiler/xla/legacy_flags/hlo_graph_dumper_flags.h @@ -34,6 +34,9 @@ void AppendHloGraphDumperFlags(std::vector* flag_list); // The values of flags associated with XLA's hlo_graph_dumper module. typedef struct { string xla_hlo_dump_graph_path; // Path to write dumped HLO graphs to + // If set, dumps HLO graphs as tensorflow GraphDef; otherwise, dumps HLO + // graphs as DOT graph. + bool xla_hlo_dump_as_graphdef; } HloGraphDumperFlags; // Return a pointer to the HloGraphDumperFlags struct; diff --git a/tensorflow/compiler/xla/legacy_flags/hlo_pass_pipeline_flags.cc b/tensorflow/compiler/xla/legacy_flags/hlo_pass_pipeline_flags.cc deleted file mode 100644 index edc04d51a70..00000000000 --- a/tensorflow/compiler/xla/legacy_flags/hlo_pass_pipeline_flags.cc +++ /dev/null @@ -1,62 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// Legacy flags for XLA's hlo_pass_pipeline module. - -#include // NOLINT(build/c++11): only using std::call_once, not mutex. -#include - -#include "tensorflow/compiler/xla/legacy_flags/hlo_pass_pipeline_flags.h" -#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h" -#include "tensorflow/core/platform/types.h" -#include "tensorflow/core/util/command_line_flags.h" - -namespace xla { -namespace legacy_flags { - -// Pointers to the parsed value of the flags and flag descriptors, initialized -// via flags_init. -static HloPassPipelineFlags* flags; -static std::vector* flag_list; -static std::once_flag flags_init; - -// Allocate *flags. Called via call_once(&flags_init,...). 
-static void AllocateFlags() { - flags = new HloPassPipelineFlags; - flags->xla_disable_hlo_passes = ""; - flag_list = new std::vector({ - tensorflow::Flag("xla_disable_hlo_passes", &flags->xla_disable_hlo_passes, - "Comma-separated list of HLO passes to disable."), - }); - ParseFlagsFromEnv(*flag_list); -} - -// Append to *append_to flag definitions associated with XLA's hlo_pass_pipeline -// module. -void AppendHloPassPipelineFlags(std::vector* append_to) { - std::call_once(flags_init, &AllocateFlags); - append_to->insert(append_to->end(), flag_list->begin(), flag_list->end()); -} - -// Return a pointer to the HloPassPipelineFlags struct; -// repeated calls return the same pointer. -// This should be called only after Flags::Parse() has returned. -HloPassPipelineFlags* GetHloPassPipelineFlags() { - std::call_once(flags_init, &AllocateFlags); - return flags; -} - -} // namespace legacy_flags -} // namespace xla diff --git a/tensorflow/compiler/xla/legacy_flags/hlo_pass_pipeline_flags.h b/tensorflow/compiler/xla/legacy_flags/hlo_pass_pipeline_flags.h deleted file mode 100644 index 520759bbf0d..00000000000 --- a/tensorflow/compiler/xla/legacy_flags/hlo_pass_pipeline_flags.h +++ /dev/null @@ -1,48 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_HLO_PASS_PIPELINE_FLAGS_H_ -#define TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_HLO_PASS_PIPELINE_FLAGS_H_ - -// Legacy flags for XLA's hlo_pass_pipeline module. - -#include - -#include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/platform/types.h" -#include "tensorflow/core/util/command_line_flags.h" - -namespace xla { -namespace legacy_flags { - -// Append to *flag_list flag definitions associated with XLA's hlo_pass_pipeline -// module. -void AppendHloPassPipelineFlags(std::vector* flag_list); - -// The values of flags associated with XLA's hlo_pass_pipeline module. -typedef struct { - // Comma-separated list of HLO passes to disable. - string xla_disable_hlo_passes; -} HloPassPipelineFlags; - -// Return a pointer to the HloPassPipelineFlags struct; -// repeated calls return the same pointer. -// This should be called only after Flags::Parse() has returned. 
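Every module in `legacy_flags`, including the deleted one above and its replacements, uses the same lazily-initialized flag-singleton pattern: allocate once under `std::call_once`, then hand out the same pointer forever. A generic sketch of that pattern (names are illustrative, not a real XLA module):

```c++
#include <mutex>  // NOLINT(build/c++11): only std::call_once is used.

struct MyFlags {
  bool some_flag = false;
};

namespace {
MyFlags* flags;             // Intentionally leaked; lives for the process.
std::once_flag flags_init;

void AllocateFlags() {
  flags = new MyFlags;
  // ...register the flags with the command-line/environment parser here...
}
}  // namespace

// Repeated calls return the same pointer; allocation happens exactly once,
// even if multiple threads race to get here first.
MyFlags* GetMyFlags() {
  std::call_once(flags_init, &AllocateFlags);
  return flags;
}
```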
-HloPassPipelineFlags* GetHloPassPipelineFlags();
-
-}  // namespace legacy_flags
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_HLO_PASS_PIPELINE_FLAGS_H_
diff --git a/tensorflow/compiler/xla/legacy_flags/layout_util_flags.cc b/tensorflow/compiler/xla/legacy_flags/layout_util_flags.cc
index 4242b501d41..f838861898d 100644
--- a/tensorflow/compiler/xla/legacy_flags/layout_util_flags.cc
+++ b/tensorflow/compiler/xla/legacy_flags/layout_util_flags.cc
@@ -53,7 +53,7 @@ static void AllocateRawFlag() {
 static bool ParseDefaultLayout(const string& text, DefaultLayout* layout) {
   bool result = true;
   std::vector<string> field = tensorflow::str_util::Split(text, ':');
-  if (field.size() > 0) {
+  if (!field.empty()) {
     if (field[0] == "random") {
       layout->dimension_order = DefaultLayout::DimensionOrder::kRandom;
       if (field.size() > 1) {
diff --git a/tensorflow/compiler/xla/legacy_flags/user_computation_flags.cc b/tensorflow/compiler/xla/legacy_flags/user_computation_flags.cc
new file mode 100644
index 00000000000..a9597d0cd8f
--- /dev/null
+++ b/tensorflow/compiler/xla/legacy_flags/user_computation_flags.cc
@@ -0,0 +1,64 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <mutex>  // NOLINT(build/c++11): only using std::call_once, not mutex.
+#include <vector>
+
+#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h"
+#include "tensorflow/compiler/xla/legacy_flags/user_computation_flags.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+namespace xla {
+namespace legacy_flags {
+
+// Pointers to the parsed value of the flags and flag descriptors, initialized
+// via flags_init.
+static UserComputationFlags* flags;
+static std::vector<tensorflow::Flag>* flag_list;
+static std::once_flag flags_init;
+
+// Allocate *flags. Called via call_once(&flags_init,...).
+static void AllocateFlags() {
+  flags = new UserComputationFlags;
+  flags->xla_eliminate_hlo_implicit_broadcast = false;
+  flag_list = new std::vector<tensorflow::Flag>({
+      tensorflow::Flag("xla_eliminate_hlo_implicit_broadcast",
+                       &flags->xla_eliminate_hlo_implicit_broadcast,
+                       "Eliminate implicit broadcasts when lowering user "
+                       "computations to HLO instructions; use explicit "
+                       "broadcasts instead."),
+  });
+  ParseFlagsFromEnv(*flag_list);
+}
+
+// Append to *append_to flag definitions associated with XLA's user_computation
+// module.
+void AppendUserComputationFlags(std::vector<tensorflow::Flag>* append_to) {
+  std::call_once(flags_init, &AllocateFlags);
+  append_to->insert(append_to->end(), flag_list->begin(), flag_list->end());
+}
+
+// Return a pointer to the UserComputationFlags struct;
+// repeated calls return the same pointer.
+// This should be called only after Flags::Parse() has returned.
+UserComputationFlags* GetUserComputationFlags() { + std::call_once(flags_init, &AllocateFlags); + return flags; +} + +} // namespace legacy_flags +} // namespace xla diff --git a/tensorflow/compiler/xla/legacy_flags/user_computation_flags.h b/tensorflow/compiler/xla/legacy_flags/user_computation_flags.h new file mode 100644 index 00000000000..f5222c927cb --- /dev/null +++ b/tensorflow/compiler/xla/legacy_flags/user_computation_flags.h @@ -0,0 +1,48 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_USER_COMPUTATION_FLAGS_H_ +#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_USER_COMPUTATION_FLAGS_H_ + +// Legacy flags for XLA's user_computation module. + +#include + +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/command_line_flags.h" + +namespace xla { +namespace legacy_flags { + +// Append to *flag_list flags definitions associated with XLA's user_computation +// module. +void AppendUserComputationFlags(std::vector* flag_list); + +typedef struct { + // Eliminate implicit broadcast on when lowering user computation to HLO + // instructions, use explicit broadcast instead. + bool xla_eliminate_hlo_implicit_broadcast; +} UserComputationFlags; + +// Return a pointer to the UserComputationFlags struct; +// repeated calls return the same pointer. +// This should be called only after Flags::Parse() has returned. +UserComputationFlags* GetUserComputationFlags(); + +} // namespace legacy_flags +} // namespace xla + +#endif // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_USER_COMPUTATION_FLAGS_H_ diff --git a/tensorflow/compiler/xla/literal_util.cc b/tensorflow/compiler/xla/literal_util.cc index b8bb56a97b2..caef3a3869f 100644 --- a/tensorflow/compiler/xla/literal_util.cc +++ b/tensorflow/compiler/xla/literal_util.cc @@ -16,12 +16,15 @@ limitations under the License. #include "tensorflow/compiler/xla/literal_util.h" #include +#include +#include #include #include #include #include "tensorflow/compiler/xla/index_util.h" #include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/errors.h" @@ -33,31 +36,151 @@ limitations under the License. namespace xla { -/* static */ Literal LiteralUtil::Zero(PrimitiveType primitive_type) { +Literal::StrideConfig::StrideConfig( + const Shape& source_shape, const Shape& dest_shape, + tensorflow::gtl::ArraySlice dimensions) + : dimensions(dimensions), + base(dimensions.size(), 0), + step(dimensions.size(), 1) { + if (!dimensions.empty()) { + // Selects the shape with the highest minor dimension as the one upon + // where to run the tight stride loop. 
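In the `Literal` changes that follow, `StrideConfig` picks the dimension for the tight copy loop and `CopyRange` then hands each run of elements to `StridedCopy`. A sketch of the strided-copy primitive being relied on (name and signature are modeled on the call sites below, not taken from the XLA helper itself):

```c++
#include <cstdint>
#include <vector>

// Copies `count` elements such that
//   dest[dest_base + i * dest_stride] = src[src_base + i * src_stride].
// With count == 1 and strides of 0 this degenerates to a single-element copy,
// which is how the scalar case below uses it.
template <typename T>
void StridedCopySketch(std::vector<T>* dest, int64_t dest_base,
                       int64_t dest_stride, const std::vector<T>& src,
                       int64_t src_base, int64_t src_stride, int64_t count) {
  for (int64_t i = 0; i < count; ++i) {
    (*dest)[dest_base + i * dest_stride] = src[src_base + i * src_stride];
  }
}
```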
+    if (source_shape.layout().minor_to_major()[0] >=
+        dest_shape.layout().minor_to_major()[0]) {
+      minor_dimension = source_shape.layout().minor_to_major()[0];
+      dest_stride = IndexUtil::GetDimensionStride(dest_shape, minor_dimension);
+    } else {
+      minor_dimension = dest_shape.layout().minor_to_major()[0];
+      source_stride =
+          IndexUtil::GetDimensionStride(source_shape, minor_dimension);
+    }
+    minor_loop_size = dimensions[minor_dimension];
+    step[minor_dimension] = minor_loop_size;
+  }
+}
+
+std::unique_ptr<Literal> Literal::CreateFromShape(const Shape& shape) {
+  auto literal = MakeUnique<Literal>();
+  *literal->mutable_shape() = shape;
+  literal->Reserve(ShapeUtil::ElementsIn(literal->shape()));
+  return literal;
+}
+
+/* static */ std::unique_ptr<Literal> Literal::CreateFromDimensions(
+    PrimitiveType primitive_type,
+    tensorflow::gtl::ArraySlice<int64> dimensions) {
+  return CreateFromShape(ShapeUtil::MakeShape(primitive_type, dimensions));
+}
+
+template <typename T>
+Status Literal::CopyRange(const Literal& src_literal,
+                          tensorflow::gtl::ArraySlice<int64> src_base,
+                          tensorflow::gtl::ArraySlice<int64> dest_base,
+                          tensorflow::gtl::ArraySlice<int64> copy_size) {
+  const Shape& src_shape = src_literal.shape();
+  const Shape& dest_shape = shape();
+  tensorflow::gtl::ArraySlice<T> src_data = src_literal.GetArraySlice<T>();
+  tensorflow::gtl::MutableArraySlice<T> dest_data = GetMutableArraySlice<T>();
+
+  TF_RET_CHECK(ShapeUtil::Rank(src_shape) == src_base.size());
+  TF_RET_CHECK(ShapeUtil::Rank(dest_shape) == dest_base.size());
+  if (ShapeUtil::Rank(src_shape) == 0 || ShapeUtil::Rank(dest_shape) == 0) {
+    // If either of the two shapes is a scalar, we can just call StridedCopy()
+    // directly, and we know we will be copying only one value.
+    TF_RET_CHECK(copy_size.empty());
+    StridedCopy(dest_data, LinearIndex(dest_base), 0, src_data,
+                src_literal.LinearIndex(src_base), 0, 1);
+  } else if (!ShapeUtil::HasZeroElements(dest_shape)) {
+    TF_RET_CHECK(!ShapeUtil::HasZeroElements(src_shape));
+    TF_RET_CHECK(src_base.size() == dest_base.size());
+    TF_RET_CHECK(src_base.size() == copy_size.size());
+
+    // Scan the source from minor, stepping in copy size blocks, then within
+    // the index enumeration functor, do a strided copy advancing the source
+    // index by one (walking through the minor dimension), and the destination
+    // index by the proper stride size at the matching dimension.
+    DimensionVector src_indexes(src_base.size(), 0);
+    DimensionVector dest_indexes(dest_base.size(), 0);
+    StrideConfig stride_config(src_shape, dest_shape, copy_size);
+
+    auto copy_proc = [&](const std::vector<int64>& indexes) {
+      // Map from multi-dimensional index, to source index.
+      std::transform(indexes.begin(), indexes.end(), src_base.begin(),
+                     src_indexes.begin(), std::plus<int64>());
+      // Map from multi-dimensional index, to destination index.
+ std::transform(indexes.begin(), indexes.end(), dest_base.begin(), + dest_indexes.begin(), std::plus()); + + int64 src_index = src_literal.LinearIndex(src_indexes); + int64 dest_index = LinearIndex(dest_indexes); + + StridedCopy(dest_data, dest_index, stride_config.dest_stride, src_data, + src_index, stride_config.source_stride, + stride_config.minor_loop_size); + return true; + }; + + ShapeUtil::ForEachIndex(src_shape, stride_config.base, + stride_config.dimensions, stride_config.step, + copy_proc); + } + return Status::OK(); +} + +Status Literal::Copy(const Literal& src_literal, + tensorflow::gtl::ArraySlice src_base, + tensorflow::gtl::ArraySlice dest_base, + tensorflow::gtl::ArraySlice copy_size) { + TF_RET_CHECK(ShapeUtil::SameElementType(src_literal.shape(), shape())); + switch (src_literal.shape().element_type()) { + case U32: + return CopyRange(src_literal, src_base, dest_base, copy_size); + case U64: + return CopyRange(src_literal, src_base, dest_base, copy_size); + case S32: + return CopyRange(src_literal, src_base, dest_base, copy_size); + case S64: + return CopyRange(src_literal, src_base, dest_base, copy_size); + case F16: + return CopyRange(src_literal, src_base, dest_base, copy_size); + case F32: + return CopyRange(src_literal, src_base, dest_base, copy_size); + case F64: + return CopyRange(src_literal, src_base, dest_base, copy_size); + case PRED: + return CopyRange(src_literal, src_base, dest_base, copy_size); + default: + break; + } + return Unimplemented("Unhandled primitive type %d", + src_literal.shape().element_type()); +} + +/* static */ Literal Literal::Zero(PrimitiveType primitive_type) { switch (primitive_type) { case U8: - return *LiteralUtil::CreateR0(0); + return *Literal::CreateR0(0); case U32: - return *LiteralUtil::CreateR0(0); + return *Literal::CreateR0(0); case U64: - return *LiteralUtil::CreateR0(0); + return *Literal::CreateR0(0); case S8: - return *LiteralUtil::CreateR0(0); + return *Literal::CreateR0(0); case S32: - return *LiteralUtil::CreateR0(0); + return *Literal::CreateR0(0); case S64: - return *LiteralUtil::CreateR0(0); + return *Literal::CreateR0(0); + case F16: + return *Literal::CreateR0(static_cast(0.0f)); case F32: - return *LiteralUtil::CreateR0(0); + return *Literal::CreateR0(0); case F64: - return *LiteralUtil::CreateR0(0); + return *Literal::CreateR0(0); case PRED: - return *LiteralUtil::CreateR0(false); + return *Literal::CreateR0(false); case S16: case U16: LOG(FATAL) << "u16/s16 literals not yet implemented"; - case F16: - LOG(FATAL) << "f16 literals not yet implemented"; case TUPLE: LOG(FATAL) << "tuple element type cannot take on value of 0"; case OPAQUE: @@ -67,31 +190,31 @@ namespace xla { } } -/* static */ Literal LiteralUtil::One(PrimitiveType primitive_type) { +/* static */ Literal Literal::One(PrimitiveType primitive_type) { switch (primitive_type) { case U8: - return *LiteralUtil::CreateR0(1); + return *Literal::CreateR0(1); case U32: - return *LiteralUtil::CreateR0(1); + return *Literal::CreateR0(1); case U64: - return *LiteralUtil::CreateR0(1); + return *Literal::CreateR0(1); case S8: - return *LiteralUtil::CreateR0(1); + return *Literal::CreateR0(1); case S32: - return *LiteralUtil::CreateR0(1); + return *Literal::CreateR0(1); case S64: - return *LiteralUtil::CreateR0(1); + return *Literal::CreateR0(1); case F32: - return *LiteralUtil::CreateR0(1); + return *Literal::CreateR0(1); case F64: - return *LiteralUtil::CreateR0(1); + return *Literal::CreateR0(1); case PRED: - return *LiteralUtil::CreateR0(true); + return 
*Literal::CreateR0(true); case S16: case U16: LOG(FATAL) << "u16/s16 literals not yet implemented"; case F16: - LOG(FATAL) << "f16 literals not yet implemented"; + return *Literal::CreateR0(static_cast(1.0f)); case TUPLE: LOG(FATAL) << "tuple element type cannot take on value of 1"; case OPAQUE: @@ -101,33 +224,33 @@ namespace xla { } } -/* static */ Literal LiteralUtil::MinValue(PrimitiveType primitive_type) { +/* static */ Literal Literal::MinValue(PrimitiveType primitive_type) { switch (primitive_type) { case U8: - return *LiteralUtil::CreateR0(std::numeric_limits::min()); + return *Literal::CreateR0(std::numeric_limits::min()); case U32: - return *LiteralUtil::CreateR0(std::numeric_limits::min()); + return *Literal::CreateR0(std::numeric_limits::min()); case U64: - return *LiteralUtil::CreateR0(std::numeric_limits::min()); + return *Literal::CreateR0(std::numeric_limits::min()); case S8: - return *LiteralUtil::CreateR0(std::numeric_limits::min()); + return *Literal::CreateR0(std::numeric_limits::min()); case S32: - return *LiteralUtil::CreateR0(std::numeric_limits::min()); + return *Literal::CreateR0(std::numeric_limits::min()); case S64: - return *LiteralUtil::CreateR0(std::numeric_limits::min()); + return *Literal::CreateR0(std::numeric_limits::min()); case F32: - return *LiteralUtil::CreateR0( - -std::numeric_limits::infinity()); + return *Literal::CreateR0(-std::numeric_limits::infinity()); case F64: - return *LiteralUtil::CreateR0( + return *Literal::CreateR0( -std::numeric_limits::infinity()); case PRED: - return *LiteralUtil::CreateR0(false); + return *Literal::CreateR0(false); case S16: case U16: LOG(FATAL) << "u16/s16 literals not yet implemented"; case F16: - LOG(FATAL) << "f16 literals not yet implemented"; + return *Literal::CreateR0( + static_cast(-std::numeric_limits::infinity())); case TUPLE: LOG(FATAL) << "tuple element type has no minimum value"; case OPAQUE: @@ -137,33 +260,33 @@ namespace xla { } } -/* static */ Literal LiteralUtil::MaxValue(PrimitiveType primitive_type) { +/* static */ Literal Literal::MaxValue(PrimitiveType primitive_type) { switch (primitive_type) { case U8: - return *LiteralUtil::CreateR0(std::numeric_limits::max()); + return *Literal::CreateR0(std::numeric_limits::max()); case U32: - return *LiteralUtil::CreateR0(std::numeric_limits::max()); + return *Literal::CreateR0(std::numeric_limits::max()); case U64: - return *LiteralUtil::CreateR0(std::numeric_limits::max()); + return *Literal::CreateR0(std::numeric_limits::max()); case S8: - return *LiteralUtil::CreateR0(std::numeric_limits::max()); + return *Literal::CreateR0(std::numeric_limits::max()); case S32: - return *LiteralUtil::CreateR0(std::numeric_limits::max()); + return *Literal::CreateR0(std::numeric_limits::max()); case S64: - return *LiteralUtil::CreateR0(std::numeric_limits::max()); + return *Literal::CreateR0(std::numeric_limits::max()); case F32: - return *LiteralUtil::CreateR0( - std::numeric_limits::infinity()); + return *Literal::CreateR0(std::numeric_limits::infinity()); case F64: - return *LiteralUtil::CreateR0( + return *Literal::CreateR0( std::numeric_limits::infinity()); case PRED: - return *LiteralUtil::CreateR0(true); + return *Literal::CreateR0(true); case S16: case U16: LOG(FATAL) << "u16/s16 literals not yet implemented"; case F16: - LOG(FATAL) << "f16 literals not yet implemented"; + return *Literal::CreateR0( + static_cast(std::numeric_limits::infinity())); case TUPLE: LOG(FATAL) << "tuple element type has no maximum value"; case OPAQUE: @@ -173,191 +296,161 @@ 
namespace xla { } } -/* static */ std::unique_ptr LiteralUtil::CreateR1( +/* static */ std::unique_ptr Literal::CreateR1( const tensorflow::core::Bitmap& values) { auto literal = MakeUnique(); - PopulateR1(values, literal.get()); + literal->PopulateR1(values); return literal; } -/* static */ std::unique_ptr LiteralUtil::CreateR1U8( +/* static */ std::unique_ptr Literal::CreateR1U8( tensorflow::StringPiece value) { auto literal = MakeUnique(); *literal->mutable_shape() = ShapeUtil::MakeShape(U8, {static_cast(value.size())}); - literal->set_u8s(value.ToString()); + literal->set_u8s(tensorflow::StringPiece(value.ToString())); return literal; } -/* static */ std::unique_ptr LiteralUtil::CreateR2F32Linspace( - float from, float to, int64 rows, int64 cols) { +/* static */ std::unique_ptr Literal::CreateR2F32Linspace(float from, + float to, + int64 rows, + int64 cols) { auto value = MakeLinspaceArray2D(from, to, rows, cols); return CreateR2FromArray2D(*value); } -/* static */ std::unique_ptr LiteralUtil::Relayout( - const Literal& original, const Layout& layout) { - // Note: if this were a performance bottleneck, we avoid cloning and just make - // an uninitialized array instead, since all values are clobbered below. - std::unique_ptr result = CloneToUnique(original); +std::unique_ptr Literal::Relayout(const Layout& layout) const { + std::unique_ptr result = CloneToUnique(); *result->mutable_shape()->mutable_layout() = layout; - const PrimitiveType primitive_type = original.shape().element_type(); - switch (primitive_type) { - case F32: - LiteralUtil::EachCell( - original, - [&](tensorflow::gtl::ArraySlice indices, float value) { - LiteralUtil::Set(result.get(), indices, value); - }); - return result; - case S32: - LiteralUtil::EachCell( - original, - [&](tensorflow::gtl::ArraySlice indices, int32 value) { - LiteralUtil::Set(result.get(), indices, value); - }); - return result; - case U32: - LiteralUtil::EachCell( - original, - [&](tensorflow::gtl::ArraySlice indices, uint32 value) { - LiteralUtil::Set(result.get(), indices, value); - }); - return result; - default: - LOG(FATAL) << "not yet implemented: " - << PrimitiveType_Name(primitive_type); - } + + DimensionVector base(ShapeUtil::Rank(shape()), 0); + DimensionVector copy_size(shape().dimensions().begin(), + shape().dimensions().end()); + + TF_CHECK_OK(result->Copy(*this, base, base, copy_size)); + return result; } -/* static */ StatusOr> LiteralUtil::Reshape( - const xla::Literal& input, tensorflow::gtl::ArraySlice dimensions) { - if (ShapeUtil::IsTuple(input.shape())) { +StatusOr> Literal::Reshape( + tensorflow::gtl::ArraySlice dimensions) const { + if (ShapeUtil::IsTuple(shape())) { return InvalidArgument("Reshape does not support tuples."); } - - if (!LayoutUtil::IsMonotonicWithDim0Major(input.shape().layout())) { - return Unimplemented( - "Input shape must have a monotonic layout where dimension 0 is major, " - "was: %s", - LayoutUtil::HumanString(input.shape().layout()).c_str()); + std::unique_ptr output; + if (!LayoutUtil::IsMonotonicWithDim0Major(shape().layout())) { + std::vector minor_to_major(ShapeUtil::Rank(shape())); + std::iota(minor_to_major.rbegin(), minor_to_major.rend(), + static_cast(0)); + output = Relayout(LayoutUtil::MakeLayout(minor_to_major)); + } else { + output = CloneToUnique(); } - std::vector layout(dimensions.size()); - std::iota(layout.rbegin(), layout.rend(), 0); - // Because the layout is monotonic, we can simply reuse the same sequence of // values without changing their order. 
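The comment above is the crux of the new in-place `Reshape`: with a monotonic dim0-major (row-major) layout, reshaping is purely a reinterpretation of the same flat value sequence. A tiny illustration in plain C++ (hypothetical, no XLA types):

```c++
#include <cassert>
#include <vector>

int main() {
  // Flat row-major data for shape [2,3]: {a00, a01, a02, a10, a11, a12}.
  std::vector<int> flat = {0, 1, 2, 3, 4, 5};
  // Viewed as shape [3,2], still row-major, element (i,j) is flat[i*2 + j];
  // the rows become {0,1}, {2,3}, {4,5} with no data movement at all.
  assert(flat[1 * 2 + 0] == 2);  // Element (1,0) of the [3,2] view.
  return 0;
}
```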
- std::unique_ptr output = CloneToUnique(input); - output->clear_shape(); - output->mutable_shape()->set_element_type(input.shape().element_type()); - for (int64 dimension : dimensions) { - output->mutable_shape()->add_dimensions(dimension); - } - *output->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout(layout); + *output->mutable_shape() = + ShapeUtil::MakeShape(shape().element_type(), dimensions); - int64 elements_before = ShapeUtil::ElementsIn(input.shape()); + int64 elements_before = ShapeUtil::ElementsIn(shape()); int64 elements_after = ShapeUtil::ElementsIn(output->shape()); if (elements_before != elements_after) { return InvalidArgument( - "Shapes before and after LiteralUtil::Reshape have different numbers " + "Shapes before and after Literal::Reshape have different numbers " "of elements: %s vs %s.", - ShapeUtil::HumanString(input.shape()).c_str(), + ShapeUtil::HumanString(shape()).c_str(), ShapeUtil::HumanString(output->shape()).c_str()); } return std::move(output); } -/* static */ std::unique_ptr LiteralUtil::Transpose( - const Literal& original, tensorflow::gtl::ArraySlice permutation) { - CHECK(!ShapeUtil::IsTuple(original.shape())) - << "tuple is not supported for transpose"; - std::vector dimension_numbers(ShapeUtil::Rank(original.shape())); - std::iota(dimension_numbers.begin(), dimension_numbers.end(), 0); - CHECK(std::is_permutation(permutation.begin(), permutation.end(), - dimension_numbers.begin())) - << "given permutation is not a permutation of dimension numbers"; - std::vector new_dimension_sizes; - for (const int64 dim : permutation) { - new_dimension_sizes.push_back(original.shape().dimensions(dim)); - } - const auto result_shape = ShapeUtil::MakeShape( - original.shape().element_type(), new_dimension_sizes); - std::unique_ptr result = CloneToUnique(original); - *result->mutable_shape() = result_shape; - const PrimitiveType primitive_type = original.shape().element_type(); - std::vector new_indices(ShapeUtil::Rank(original.shape())); - switch (primitive_type) { - case F32: - LiteralUtil::EachCell( - original, - [&](tensorflow::gtl::ArraySlice indices, float value) { - for (int64 i = 0; i < permutation.size(); ++i) { - new_indices[i] = indices[permutation[i]]; - } - LiteralUtil::Set(result.get(), new_indices, value); - }); - return result; - default: - LOG(FATAL) << "not yet implemented: " - << PrimitiveType_Name(primitive_type); +std::unique_ptr Literal::Transpose( + tensorflow::gtl::ArraySlice permutation) const { + CHECK(!ShapeUtil::IsTuple(shape())) << "Tuple is not supported for transpose"; + CHECK(IsPermutation(permutation, ShapeUtil::Rank(shape()))) + << "Given permutation is not a permutation of dimension numbers"; + // To transpose the array, we just permute the dimensions and layout, and + // do a straight memory copy of the raw data set. + // This is considerably faster than iterating over every array element using + // the EachCell<>() and Set<>() APIs. + std::vector inverse_permutation = InversePermutation(permutation); + Shape permuted_shape = + ShapeUtil::PermuteDimensions(inverse_permutation, shape()); + // Replace the layout with one affine to this shape, such that a + // transpose operation can be performed by leaving the flat values + // representation intact. + // For example, consider the shape F32[11,8]{1,0} under a {1,0} permutation. + // The shape with affine layout resulting from that operation will be + // F32[8,11]{0,1}, since it leaves the original most minor (the 8 sized), the + // most minor. 
+ // Essentially, given MinMaj(Di) the position of the Di dimension within the + // minor to major vector, and given T(Di) the index that the original Di + // dimension has within the transposed array, a layout is affine if + // MinMaj(Di) == TMinMaj(T(Di)), with TMinMaj() being the minor to major + // vector of the affine layout. + Layout* layout = permuted_shape.mutable_layout(); + layout->clear_minor_to_major(); + for (auto index : shape().layout().minor_to_major()) { + layout->add_minor_to_major(inverse_permutation[index]); } + std::unique_ptr new_literal = CreateFromShape(permuted_shape); + DCHECK_GE(ShapeUtil::ByteSizeOf(new_literal->shape()), + ShapeUtil::ByteSizeOf(shape())); + std::memcpy(new_literal->MutableInternalData(), InternalData(), + ShapeUtil::ByteSizeOf(shape())); + return new_literal; } -/* static */ std::unique_ptr LiteralUtil::Slice( - const Literal& literal, tensorflow::gtl::ArraySlice start_indices, - tensorflow::gtl::ArraySlice limit_indices) { - CHECK(!ShapeUtil::IsTuple(literal.shape())) - << "tuple is not supported for reshape"; +std::unique_ptr Literal::Slice( + tensorflow::gtl::ArraySlice start_indices, + tensorflow::gtl::ArraySlice limit_indices) const { + CHECK(!ShapeUtil::IsTuple(shape())) << "tuple is not supported for reshape"; - std::vector result_dimensions; - for (int64 dnum = 0; dnum < ShapeUtil::Rank(literal.shape()); ++dnum) { + DimensionVector result_dimensions; + for (int64 dnum = 0; dnum < ShapeUtil::Rank(shape()); ++dnum) { CHECK_GE(start_indices[dnum], 0); - CHECK_LE(limit_indices[dnum], literal.shape().dimensions(dnum)); + CHECK_LE(limit_indices[dnum], shape().dimensions(dnum)); int64 dimension = limit_indices[dnum] - start_indices[dnum]; CHECK_GT(dimension, 0); result_dimensions.push_back(dimension); } const auto result_shape = ShapeUtil::MakeShapeWithLayout( - literal.shape().element_type(), result_dimensions, - AsInt64Slice(literal.shape().layout().minor_to_major())); + shape().element_type(), result_dimensions, + AsInt64Slice(shape().layout().minor_to_major())); auto result_literal = MakeUnique(); *result_literal->mutable_shape() = result_shape; - Reserve(ShapeUtil::ElementsIn(result_shape), result_literal.get()); + result_literal->Reserve(ShapeUtil::ElementsIn(result_shape)); - std::vector new_indices(ShapeUtil::Rank(result_shape)); + DimensionVector new_indices(ShapeUtil::Rank(result_shape)); switch (result_shape.element_type()) { case F32: - LiteralUtil::EachCell( - *result_literal, + result_literal->EachCell( [&](tensorflow::gtl::ArraySlice indices, float /*value*/) { for (int64 i = 0; i < ShapeUtil::Rank(result_shape); ++i) { new_indices[i] = indices[i] + start_indices[i]; } - float value = LiteralUtil::Get(literal, new_indices); - LiteralUtil::Set(result_literal.get(), indices, value); + float value = Get(new_indices); + result_literal->Set(indices, value); }); return result_literal; case S32: - LiteralUtil::EachCell( - *result_literal, + result_literal->EachCell( [&](tensorflow::gtl::ArraySlice indices, int32 /*value*/) { for (int64 i = 0; i < ShapeUtil::Rank(result_shape); ++i) { new_indices[i] = indices[i] + start_indices[i]; } - int32 value = LiteralUtil::Get(literal, new_indices); - LiteralUtil::Set(result_literal.get(), indices, value); + int32 value = Get(new_indices); + result_literal->Set(indices, value); }); return result_literal; case U32: - LiteralUtil::EachCell( - *result_literal, + result_literal->EachCell( [&](tensorflow::gtl::ArraySlice indices, uint32 /*value*/) { for (int64 i = 0; i < ShapeUtil::Rank(result_shape); 
++i) { new_indices[i] = indices[i] + start_indices[i]; } - uint32 value = LiteralUtil::Get(literal, new_indices); - LiteralUtil::Set(result_literal.get(), indices, value); + uint32 value = Get(new_indices); + result_literal->Set(indices, value); }); return result_literal; default: @@ -366,96 +459,95 @@ namespace xla { } } -/* static */ std::unique_ptr LiteralUtil::CloneToUnique( - const Literal& literal) { +std::unique_ptr Literal::CloneToUnique() const { auto unique = MakeUnique(); - *unique = literal; + *unique = *this; return unique; } -/* static */ string LiteralUtil::GetAsString( - const Literal& literal, tensorflow::gtl::ArraySlice multi_index) { - switch (literal.shape().element_type()) { +string Literal::GetAsString( + tensorflow::gtl::ArraySlice multi_index) const { + switch (shape().element_type()) { case PRED: - return Get(literal, multi_index) ? "true" : "false"; + return Get(multi_index) ? "true" : "false"; case U8: - return tensorflow::strings::StrCat(Get(literal, multi_index)); + return tensorflow::strings::StrCat(Get(multi_index)); case S32: - return tensorflow::strings::StrCat(Get(literal, multi_index)); + return tensorflow::strings::StrCat(Get(multi_index)); case S64: - return tensorflow::strings::StrCat(Get(literal, multi_index)); + return tensorflow::strings::StrCat(Get(multi_index)); case U32: - return tensorflow::strings::StrCat(Get(literal, multi_index)); + return tensorflow::strings::StrCat(Get(multi_index)); case U64: - return tensorflow::strings::StrCat(Get(literal, multi_index)); + return tensorflow::strings::StrCat(Get(multi_index)); case F32: - return tensorflow::strings::StrCat(Get(literal, multi_index)); + return tensorflow::strings::StrCat(Get(multi_index)); case F64: - return tensorflow::strings::StrCat(Get(literal, multi_index)); + return tensorflow::strings::StrCat(Get(multi_index)); + case F16: + return tensorflow::strings::StrCat(Get(multi_index)); default: return tensorflow::strings::StrCat( - "[", PrimitiveType_Name(literal.shape().element_type()), "]"); + "[", PrimitiveType_Name(shape().element_type()), "]"); } } -/* static */ int64 LiteralUtil::LinearIndex( - const Literal& literal, tensorflow::gtl::ArraySlice multi_index) { - return IndexUtil::MultidimensionalIndexToLinearIndex(literal.shape(), - multi_index); +int64 Literal::LinearIndex( + tensorflow::gtl::ArraySlice multi_index) const { + return IndexUtil::MultidimensionalIndexToLinearIndex(shape(), multi_index); } -/* static */ string LiteralUtil::ToString(const Literal& literal) { - const Shape& shape = literal.shape(); +string Literal::ToString() const { std::vector pieces; auto element_to_string = - [&literal](tensorflow::gtl::ArraySlice indices) -> string { - PrimitiveType element_type = literal.shape().element_type(); + [this](tensorflow::gtl::ArraySlice indices) -> string { + PrimitiveType element_type = shape().element_type(); if (element_type == PRED) { // We display predicates in a densely packed form. - return Get(literal, indices) ? "1" : "0"; + return Get(indices) ? "1" : "0"; } return ((!indices.empty() && indices.back() > 0) ? ", " : "") + - GetAsString(literal, indices); + GetAsString(indices); }; // TODO(b/32894291): refactor this code to reduce code duplication. 
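`Slice` above copies the half-open window `[start_indices, limit_indices)` of each dimension into a fresh literal. The index arithmetic, reduced to a plain row-major 2-D array (illustrative helper, no XLA types):

```c++
#include <cstdint>
#include <vector>

// Copies rows [r0, r1) and columns [c0, c1) of a row-major `src` that has
// `cols` columns; destination element (r - r0, c - c0) comes from (r, c),
// following the same half-open convention Literal::Slice uses.
std::vector<int> Slice2D(const std::vector<int>& src, int64_t cols, int64_t r0,
                         int64_t r1, int64_t c0, int64_t c1) {
  std::vector<int> out;
  out.reserve((r1 - r0) * (c1 - c0));
  for (int64_t r = r0; r < r1; ++r) {
    for (int64_t c = c0; c < c1; ++c) {
      out.push_back(src[r * cols + c]);
    }
  }
  return out;
}
```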
- if (ShapeUtil::IsTuple(shape)) { - pieces.push_back(ShapeUtil::HumanString(shape)); + if (ShapeUtil::IsTuple(shape())) { + pieces.push_back(ShapeUtil::HumanString(shape())); pieces.push_back(" (\n"); - for (const auto& element_literal : literal.tuple_literals()) { - pieces.push_back(ToString(element_literal)); + for (const auto& element_literal : tuple_literals()) { + pieces.push_back(element_literal.ToString()); pieces.push_back(",\n"); } pieces.push_back(")"); - } else if (ShapeUtil::Rank(shape) == 0) { - pieces.push_back(GetAsString(literal, {})); - } else if (ShapeUtil::Rank(shape) == 1) { + } else if (ShapeUtil::Rank(shape()) == 0) { + pieces.push_back(GetAsString({})); + } else if (ShapeUtil::Rank(shape()) == 1) { pieces.push_back("{"); - for (int64 i0 = 0; i0 < shape.dimensions(0); ++i0) { + for (int64 i0 = 0; i0 < shape().dimensions(0); ++i0) { pieces.push_back(element_to_string({i0})); } pieces.push_back("}"); - } else if (ShapeUtil::Rank(shape) == 2) { - pieces.push_back(ShapeUtil::HumanString(shape)); + } else if (ShapeUtil::Rank(shape()) == 2) { + pieces.push_back(ShapeUtil::HumanString(shape())); pieces.push_back(" {\n"); - for (int64 i0 = 0; i0 < shape.dimensions(0); ++i0) { + for (int64 i0 = 0; i0 < shape().dimensions(0); ++i0) { pieces.push_back(" { "); - for (int64 i1 = 0; i1 < shape.dimensions(1); ++i1) { + for (int64 i1 = 0; i1 < shape().dimensions(1); ++i1) { pieces.push_back(element_to_string({i0, i1})); } pieces.push_back(" "); pieces.push_back("},\n"); } pieces.push_back("}"); - } else if (ShapeUtil::Rank(shape) == 3) { - pieces.push_back(ShapeUtil::HumanString(shape)); + } else if (ShapeUtil::Rank(shape()) == 3) { + pieces.push_back(ShapeUtil::HumanString(shape())); pieces.push_back(" {\n"); - for (int64 i0 = 0; i0 < shape.dimensions(0); ++i0) { + for (int64 i0 = 0; i0 < shape().dimensions(0); ++i0) { pieces.push_back(i0 > 0 ? ",\n{" : "{"); - for (int64 i1 = 0; i1 < shape.dimensions(1); ++i1) { + for (int64 i1 = 0; i1 < shape().dimensions(1); ++i1) { pieces.push_back(i1 > 0 ? 
",\n { " : " { "); - for (int64 i2 = 0; i2 < shape.dimensions(2); ++i2) { + for (int64 i2 = 0; i2 < shape().dimensions(2); ++i2) { pieces.push_back(element_to_string({i0, i1, i2})); } pieces.push_back(" }"); @@ -463,17 +555,17 @@ namespace xla { pieces.push_back(" }"); } pieces.push_back("\n}"); - } else if (ShapeUtil::Rank(shape) == 4) { - pieces.push_back(ShapeUtil::HumanString(shape)); + } else if (ShapeUtil::Rank(shape()) == 4) { + pieces.push_back(ShapeUtil::HumanString(shape())); pieces.push_back(" {\n"); - for (int64 i0 = 0; i0 < shape.dimensions(0); ++i0) { + for (int64 i0 = 0; i0 < shape().dimensions(0); ++i0) { pieces.push_back(tensorflow::strings::Printf(" { // i0=%lld\n", i0)); - for (int64 i1 = 0; i1 < shape.dimensions(1); ++i1) { + for (int64 i1 = 0; i1 < shape().dimensions(1); ++i1) { pieces.push_back( tensorflow::strings::Printf(" { // i1=%lld\n", i1)); - for (int64 i2 = 0; i2 < shape.dimensions(2); ++i2) { + for (int64 i2 = 0; i2 < shape().dimensions(2); ++i2) { pieces.push_back(" {"); - for (int64 i3 = 0; i3 < shape.dimensions(3); ++i3) { + for (int64 i3 = 0; i3 < shape().dimensions(3); ++i3) { pieces.push_back(element_to_string({i0, i1, i2, i3})); } pieces.push_back("},\n"); @@ -483,20 +575,20 @@ namespace xla { pieces.push_back(" },\n"); } pieces.push_back("}"); - } else if (ShapeUtil::Rank(shape) == 5) { - pieces.push_back(ShapeUtil::HumanString(shape)); + } else if (ShapeUtil::Rank(shape()) == 5) { + pieces.push_back(ShapeUtil::HumanString(shape())); pieces.push_back(" {\n"); - for (int64 i0 = 0; i0 < shape.dimensions(0); ++i0) { + for (int64 i0 = 0; i0 < shape().dimensions(0); ++i0) { pieces.push_back(tensorflow::strings::Printf(" { // i0=%lld\n", i0)); - for (int64 i1 = 0; i1 < shape.dimensions(1); ++i1) { + for (int64 i1 = 0; i1 < shape().dimensions(1); ++i1) { pieces.push_back( tensorflow::strings::Printf(" { // i1=%lld\n", i1)); - for (int64 i2 = 0; i2 < shape.dimensions(2); ++i2) { + for (int64 i2 = 0; i2 < shape().dimensions(2); ++i2) { pieces.push_back( tensorflow::strings::Printf(" { // i2=%lld\n", i2)); - for (int64 i3 = 0; i3 < shape.dimensions(3); ++i3) { + for (int64 i3 = 0; i3 < shape().dimensions(3); ++i3) { pieces.push_back(" {"); - for (int64 i4 = 0; i4 < shape.dimensions(4); ++i4) { + for (int64 i4 = 0; i4 < shape().dimensions(4); ++i4) { pieces.push_back(element_to_string({i0, i1, i2, i3, i4})); } pieces.push_back("},\n"); @@ -509,14 +601,14 @@ namespace xla { } pieces.push_back("}"); } else { - pieces.push_back(ShapeUtil::HumanString(shape)); + pieces.push_back(ShapeUtil::HumanString(shape())); pieces.push_back(" {...}"); } return tensorflow::str_util::Join(pieces, ""); } -/* static */ std::unique_ptr LiteralUtil::MakeTuple( +/* static */ std::unique_ptr Literal::MakeTuple( tensorflow::gtl::ArraySlice elements) { auto literal = MakeUnique(); std::vector shape; @@ -528,169 +620,197 @@ namespace xla { return literal; } -/* static */ const void* LiteralUtil::InternalData(const Literal& literal) { - switch (literal.shape().element_type()) { +const void* Literal::InternalData() const { + return const_cast( + const_cast(this)->MutableInternalData()); +} + +void* Literal::MutableInternalData() { + // NOTE: We access the vectors directly to avoid the const reference + // created by the accessor functions. 
+ switch (shape().element_type()) { case PRED: - return reinterpret_cast(literal.preds().data()); + return reinterpret_cast(preds_.data()); case U8: - return reinterpret_cast(literal.u8s().data()); + return reinterpret_cast(u8s_.data()); case S32: - return reinterpret_cast(literal.s32s().data()); + return reinterpret_cast(s32s_.data()); case S64: - return reinterpret_cast(literal.s64s().data()); + return reinterpret_cast(s64s_.data()); case U32: - return reinterpret_cast(literal.u32s().data()); + return reinterpret_cast(u32s_.data()); case U64: - return reinterpret_cast(literal.u64s().data()); + return reinterpret_cast(u64s_.data()); case F32: - return reinterpret_cast(literal.f32s().data()); + return reinterpret_cast(f32s_.data()); case F64: - return reinterpret_cast(literal.f64s().data()); + return reinterpret_cast(f64s_.data()); + case F16: + return reinterpret_cast(f16s_.data()); default: LOG(FATAL) << "primitive type not supported in literals: " - << PrimitiveType_Name(literal.shape().element_type()); + << PrimitiveType_Name(shape().element_type()); } } -/* static */ void* LiteralUtil::MutableInternalData(Literal* literal) { - return const_cast(LiteralUtil::InternalData(*literal)); -} - -/* static */ void LiteralUtil::Reserve(int64 num_elements, Literal* literal) { - CHECK_EQ(ShapeUtil::ElementsIn(literal->shape()), num_elements); - switch (literal->shape().element_type()) { +void Literal::Reserve(int64 num_elements) { + CHECK_EQ(ShapeUtil::ElementsIn(shape()), num_elements); + switch (shape().element_type()) { case PRED: - GetMutableRepeatedField(literal)->Resize(num_elements, false); + Resize(num_elements, false); + break; + case S8: + Resize(num_elements, 0); break; case U8: - // u8s is an optional "bytes", rather than a repeated field. Therefore its - // access methods are somewhat different from the others. 
- literal->mutable_u8s()->resize(num_elements, 0); + Resize(num_elements, 0); break; case S32: - GetMutableRepeatedField(literal)->Resize(num_elements, - /*value=*/0); + Resize(num_elements, 0); break; case S64: - GetMutableRepeatedField(literal)->Resize( - num_elements, - /*value=*/0); + Resize(num_elements, 0); break; case U32: - GetMutableRepeatedField(literal)->Resize(num_elements, - /*value=*/0); + Resize(num_elements, 0); break; case U64: - GetMutableRepeatedField(literal)->Resize( - num_elements, - /*value=*/0); + Resize(num_elements, 0); break; case F32: - GetMutableRepeatedField(literal)->Resize(num_elements, - /*value=*/0.0f); + Resize(num_elements, 0); break; case F64: - GetMutableRepeatedField(literal)->Resize(num_elements, - /*value=*/0.0); + Resize(num_elements, 0); + break; + case F16: + Resize(num_elements, static_cast(0.0f)); break; default: LOG(FATAL) << "primitive type not supported in literals: " - << PrimitiveType_Name(literal->shape().element_type()); + << PrimitiveType_Name(shape().element_type()); } } -/* static */ tensorflow::Status LiteralUtil::ValidateLiteral( - const Literal& literal) { - TF_CHECK_OK(ShapeUtil::ValidateShape(literal.shape())); - int64 expected = ShapeUtil::ElementsIn(literal.shape()); +tensorflow::Status Literal::ValidateLiteral() const { + TF_CHECK_OK(ShapeUtil::ValidateShape(shape())); + int64 expected = ShapeUtil::ElementsIn(shape()); int64 actual = -1; - switch (literal.shape().element_type()) { + switch (shape().element_type()) { case PRED: - actual = literal.preds().size(); + actual = preds_size(); break; case U8: - actual = literal.u8s().size(); + actual = u8s_size(); break; case S32: - actual = literal.s32s_size(); + actual = s32s_size(); break; case U32: - actual = literal.u32s_size(); + actual = u32s_size(); break; case S64: - actual = literal.s64s_size(); + actual = s64s_size(); break; case U64: - actual = literal.u64s_size(); + actual = u64s_size(); break; case F32: - actual = literal.f32s_size(); + actual = f32s_size(); break; case F64: - actual = literal.f64s_size(); + actual = f64s_size(); + break; + case F16: + actual = f16s().size() / sizeof(half); break; default: return tensorflow::errors::Unimplemented( "unhandled element type for literal validation: " + - PrimitiveType_Name(literal.shape().element_type())); + PrimitiveType_Name(shape().element_type())); } if (expected != actual) { return tensorflow::errors::InvalidArgument(tensorflow::strings::Printf( "literal has bad number of elements for its shape %s: want %lld " "got %lld", - ShapeUtil::HumanString(literal.shape()).c_str(), expected, actual)); + ShapeUtil::HumanString(shape()).c_str(), expected, actual)); } return tensorflow::Status::OK(); } -/* static */ void LiteralUtil::EachCellAsString( - const Literal& literal, - std::function indices, - const string& value)> - per_cell) { - if (ShapeUtil::Rank(literal.shape()) == 1) { - for (int64 i0 = 0; i0 < literal.shape().dimensions(0); ++i0) { - per_cell({i0}, GetAsString(literal, {i0})); - } +void Literal::EachCellAsString( + const std::function indices, + const string& value)>& per_cell) const { + if (ShapeUtil::HasZeroElements(shape())) { return; } + std::vector indices = IndexUtil::LinearIndexToMultidimensionalIndex( + shape(), /*linear_index=*/0); + do { + per_cell(indices, GetAsString(indices)); + } while (IndexUtil::BumpIndices(shape(), &indices)); +} - if (ShapeUtil::Rank(literal.shape()) == 2) { - for (int64 i0 = 0; i0 < literal.shape().dimensions(0); ++i0) { - for (int64 i1 = 0; i1 < literal.shape().dimensions(1); 
++i1) { - per_cell({i0, i1}, GetAsString(literal, {i0, i1})); - } - } - return; +namespace { +template +std::unique_ptr ConvertIfTypesMatch(const Literal& src_literal) { + CHECK_EQ(primitive_src_type, src_literal.shape().element_type()); + return LiteralUtil::Convert< + typename primitive_util::PrimitiveTypeToNative::type, + typename primitive_util::PrimitiveTypeToNative< + primitive_dest_type>::type>(src_literal); +} + +template +StatusOr> ConvertIfDestTypeMatches( + const Literal& src_literal, PrimitiveType primitive_dest_type) { + switch (primitive_dest_type) { +#define CONVERT_IF_TYPES_MATCH(type) \ + case (type): \ + return ConvertIfTypesMatch(src_literal); + CONVERT_IF_TYPES_MATCH(PRED) + CONVERT_IF_TYPES_MATCH(S8) + CONVERT_IF_TYPES_MATCH(S32) + CONVERT_IF_TYPES_MATCH(S64) + CONVERT_IF_TYPES_MATCH(U8) + CONVERT_IF_TYPES_MATCH(U32) + CONVERT_IF_TYPES_MATCH(U64) + CONVERT_IF_TYPES_MATCH(F32) + CONVERT_IF_TYPES_MATCH(F64) +#undef CONVERT_IF_TYPES_MATCH + // Other types are not yet supported. + default: + return tensorflow::errors::InvalidArgument( + "Unimplemented: ConvertIfDestTypeMatches for type " + + PrimitiveType_Name(src_literal.shape().element_type())); } +} +} - if (ShapeUtil::Rank(literal.shape()) == 3) { - for (int64 i0 = 0; i0 < literal.shape().dimensions(0); ++i0) { - for (int64 i1 = 0; i1 < literal.shape().dimensions(1); ++i1) { - for (int64 i2 = 0; i2 < literal.shape().dimensions(2); ++i2) { - per_cell({i0, i1, i2}, GetAsString(literal, {i0, i1, i2})); - } - } - } - return; +StatusOr> LiteralUtil::ConvertIfSrcTypeMatches( + const Literal& src_literal, PrimitiveType primitive_dest_type) { + switch (src_literal.shape().element_type()) { +#define CONVERT_IF_DEST_TYPE_MATCHES(type) \ + case (type): \ + return ConvertIfDestTypeMatches<(type)>(src_literal, primitive_dest_type); + CONVERT_IF_DEST_TYPE_MATCHES(PRED) + CONVERT_IF_DEST_TYPE_MATCHES(S8) + CONVERT_IF_DEST_TYPE_MATCHES(S32) + CONVERT_IF_DEST_TYPE_MATCHES(S64) + CONVERT_IF_DEST_TYPE_MATCHES(U8) + CONVERT_IF_DEST_TYPE_MATCHES(U32) + CONVERT_IF_DEST_TYPE_MATCHES(U64) + CONVERT_IF_DEST_TYPE_MATCHES(F32) + CONVERT_IF_DEST_TYPE_MATCHES(F64) +#undef CONVERT_IF_DEST_TYPE_MATCHES + // Other types are not yet supported. 
+ default: + return tensorflow::errors::InvalidArgument( + "Unimplemented: ConvertIfSrcTypeMatches for type " + + PrimitiveType_Name(src_literal.shape().element_type())); } - - if (ShapeUtil::Rank(literal.shape()) == 4) { - for (int64 i0 = 0; i0 < literal.shape().dimensions(0); ++i0) { - for (int64 i1 = 0; i1 < literal.shape().dimensions(1); ++i1) { - for (int64 i2 = 0; i2 < literal.shape().dimensions(2); ++i2) { - for (int64 i3 = 0; i3 < literal.shape().dimensions(3); ++i3) { - per_cell({i0, i1, i2, i3}, GetAsString(literal, {i0, i1, i2, i3})); - } - } - } - } - return; - } - - LOG(FATAL) << "unhandled rank: " << ShapeUtil::Rank(literal.shape()); } namespace { @@ -704,8 +824,8 @@ template bool EqualElements(const Literal& literal1, const Literal& literal2, int dimension, std::vector* multi_index) { if (dimension == ShapeUtil::Rank(literal1.shape())) { - return (LiteralUtil::Get(literal1, *multi_index) == - LiteralUtil::Get(literal2, *multi_index)); + return (literal1.Get(*multi_index) == + literal2.Get(*multi_index)); } for (int64 i = 0; i < literal1.shape().dimensions(dimension); ++i) { (*multi_index)[dimension] = i; @@ -719,145 +839,197 @@ bool EqualElements(const Literal& literal1, const Literal& literal2, } // namespace -/* static */ bool LiteralUtil::Equal(const Literal& literal1, - const Literal& literal2) { - if (!ShapeUtil::Compatible(literal1.shape(), literal2.shape())) { +bool Literal::Equal(const Literal& literal2) const { + if (!ShapeUtil::Compatible(shape(), literal2.shape())) { return false; } - if (ShapeUtil::IsTuple(literal1.shape())) { + if (ShapeUtil::IsTuple(shape())) { // Because the shapes are compatible, they must have the same number of // tuple elements. - CHECK_EQ(literal1.tuple_literals_size(), literal2.tuple_literals_size()); - for (int i = 0; i < literal1.tuple_literals_size(); ++i) { - if (!Equal(literal1.tuple_literals(i), literal2.tuple_literals(i))) { + CHECK_EQ(tuple_literals_size(), literal2.tuple_literals_size()); + for (int i = 0; i < tuple_literals_size(); ++i) { + if (!tuple_literals(i).Equal(literal2.tuple_literals(i))) { return false; } } return true; } else { - std::vector multi_index(ShapeUtil::Rank(literal1.shape()), 0); - switch (literal1.shape().element_type()) { + std::vector multi_index(ShapeUtil::Rank(shape()), 0); + switch (shape().element_type()) { case PRED: - return EqualElements(literal1, literal2, 0, &multi_index); + return EqualElements(*this, literal2, 0, &multi_index); case U8: - return EqualElements(literal1, literal2, 0, &multi_index); + return EqualElements(*this, literal2, 0, &multi_index); case S32: - return EqualElements(literal1, literal2, 0, &multi_index); + return EqualElements(*this, literal2, 0, &multi_index); case S64: - return EqualElements(literal1, literal2, 0, &multi_index); + return EqualElements(*this, literal2, 0, &multi_index); case U32: - return EqualElements(literal1, literal2, 0, &multi_index); + return EqualElements(*this, literal2, 0, &multi_index); case U64: - return EqualElements(literal1, literal2, 0, &multi_index); + return EqualElements(*this, literal2, 0, &multi_index); case F32: - return EqualElements(literal1, literal2, 0, &multi_index); + return EqualElements(*this, literal2, 0, &multi_index); case F64: - return EqualElements(literal1, literal2, 0, &multi_index); + return EqualElements(*this, literal2, 0, &multi_index); + case F16: + return EqualElements(*this, literal2, 0, &multi_index); default: - LOG(FATAL) << "Unimplemented: LiteralUtil::Equal for type " - << 
PrimitiveType_Name(literal1.shape().element_type()); + LOG(FATAL) << "Unimplemented: Literal::Equal for type " + << PrimitiveType_Name(shape().element_type()); } } } template <> -/* static */ tensorflow::gtl::ArraySlice LiteralUtil::GetArraySlice( - const Literal& literal) { - CHECK(literal.shape().element_type() == PRED); - return literal.preds(); +tensorflow::gtl::MutableArraySlice Literal::GetMutableArraySlice() { + auto values = mutable_preds(); + return tensorflow::gtl::MutableArraySlice(values->data(), + values->size()); } template <> -/* static */ tensorflow::protobuf::RepeatedField* -LiteralUtil::GetMutableRepeatedField(Literal* literal) { - CHECK(literal->shape().element_type() == PRED); - return literal->mutable_preds(); +tensorflow::gtl::MutableArraySlice Literal::GetMutableArraySlice() { + // C++11 standard, basic_string 21.4.1.5, values should be stored + // contiguously. From C++17 a mutable data() member will be provided. + auto values = mutable_u8s(); + return tensorflow::gtl::MutableArraySlice( + reinterpret_cast(&(*values)[0]), values->size()); } template <> -/* static */ tensorflow::gtl::ArraySlice -LiteralUtil::GetArraySlice(const Literal& literal) { - CHECK(literal.shape().element_type() == U32); - return literal.u32s(); +tensorflow::gtl::MutableArraySlice Literal::GetMutableArraySlice() { + // C++11 standard, basic_string 21.4.1.5, values should be stored + // contiguously. From C++17 a mutable data() member will be provided. + auto values = mutable_u8s(); + return tensorflow::gtl::MutableArraySlice( + reinterpret_cast(&(*values)[0]), values->size()); } template <> -/* static */ tensorflow::protobuf::RepeatedField* -LiteralUtil::GetMutableRepeatedField(Literal* literal) { - CHECK(literal->shape().element_type() == U32); - return literal->mutable_u32s(); +tensorflow::gtl::MutableArraySlice Literal::GetMutableArraySlice() { + auto values = mutable_s32s(); + return tensorflow::gtl::MutableArraySlice(values->data(), + values->size()); } template <> -/* static */ tensorflow::gtl::ArraySlice -LiteralUtil::GetArraySlice(const Literal& literal) { - CHECK(literal.shape().element_type() == U64); - return AsUInt64Slice(literal.u64s()); +tensorflow::gtl::MutableArraySlice Literal::GetMutableArraySlice() { + auto values = mutable_u32s(); + return tensorflow::gtl::MutableArraySlice(values->data(), + values->size()); } template <> -/* static */ tensorflow::protobuf::RepeatedField* -LiteralUtil::GetMutableRepeatedField( - Literal* literal) { - CHECK(literal->shape().element_type() == U64); - return literal->mutable_u64s(); +tensorflow::gtl::MutableArraySlice Literal::GetMutableArraySlice() { + static_assert(sizeof(int64) == sizeof(tensorflow::protobuf_int64) && + alignof(int64) == alignof(tensorflow::protobuf_int64), + "The int64 and tensorflow::protobuf_int64 types are not " + "compatible"); + auto values = mutable_s64s(); + // Because of the fact that tensorflow::protobuf_int64 is defined as int64_t + // while tensorflow::int64 is defined as long long, a reinterpret_cast<> is + // necessary from the raw data pointer returned by the mutable_data() API. 
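To illustrate the typed mutable views being added here, a sketch under the assumption that the literal was allocated with a matching element type:

```c++
// Sketch only: in-place mutation through GetMutableArraySlice<int64>().
auto lit = Literal::CreateFromDimensions(S64, {4});
tensorflow::gtl::MutableArraySlice<int64> vals =
    lit->GetMutableArraySlice<int64>();
for (int64 i = 0; i < ShapeUtil::ElementsIn(lit->shape()); ++i) {
  vals[i] = i * i;  // hypothetical payload
}
```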
+ return tensorflow::gtl::MutableArraySlice<int64>( + reinterpret_cast<int64*>(values->data()), values->size()); } template <> -/* static */ tensorflow::gtl::ArraySlice<int32> -LiteralUtil::GetArraySlice(const Literal& literal) { - CHECK(literal.shape().element_type() == S32); - return literal.s32s(); +tensorflow::gtl::MutableArraySlice<uint64> Literal::GetMutableArraySlice() { + static_assert(sizeof(uint64) == sizeof(tensorflow::protobuf_uint64) && + alignof(uint64) == alignof(tensorflow::protobuf_uint64), + "The uint64 and tensorflow::protobuf_uint64 types are not " + "compatible"); + auto values = mutable_u64s(); + // Because of the fact that tensorflow::protobuf_uint64 is defined as uint64_t + // while tensorflow::uint64 is defined as unsigned long long, a + // reinterpret_cast<> is necessary from the raw data pointer returned by the + // mutable_data() API. + return tensorflow::gtl::MutableArraySlice<uint64>( + reinterpret_cast<uint64*>(values->data()), values->size()); } template <> -/* static */ tensorflow::gtl::ArraySlice<int64> -LiteralUtil::GetArraySlice(const Literal& literal) { - CHECK(literal.shape().element_type() == S64); - return AsInt64Slice(literal.s64s()); +tensorflow::gtl::MutableArraySlice<float> Literal::GetMutableArraySlice() { + auto values = mutable_f32s(); + return tensorflow::gtl::MutableArraySlice<float>(values->data(), + values->size()); } template <> -/* static */ tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>* -LiteralUtil::GetMutableRepeatedField( - Literal* literal) { - CHECK(literal->shape().element_type() == S64); - return literal->mutable_s64s(); +tensorflow::gtl::MutableArraySlice<double> Literal::GetMutableArraySlice() { + auto values = mutable_f64s(); + return tensorflow::gtl::MutableArraySlice<double>(values->data(), + values->size()); } template <> -/* static */ tensorflow::gtl::ArraySlice<float> -LiteralUtil::GetArraySlice(const Literal& literal) { - CHECK(literal.shape().element_type() == F32); - return literal.f32s(); +tensorflow::gtl::MutableArraySlice<half> Literal::GetMutableArraySlice() { + // C++11 standard, basic_string 21.4.1.5, values should be stored + // contiguously. From C++17 a mutable data() member will be provided. + // TODO - there is an endianness problem here. 
fix it, or wait for uint16 + // support in protobuf + auto values = mutable_f16s(); + return tensorflow::gtl::MutableArraySlice( + reinterpret_cast(&(*values)[0]), values->size() / sizeof(half)); } template <> -/* static */ tensorflow::gtl::ArraySlice -LiteralUtil::GetArraySlice(const Literal& literal) { - CHECK(literal.shape().element_type() == F32); - return literal.f32s(); +tensorflow::gtl::ArraySlice Literal::GetArraySlice() const { + CHECK_EQ(shape().element_type(), PRED); + return tensorflow::gtl::ArraySlice(preds().data(), preds().size()); } template <> -/* static */ tensorflow::protobuf::RepeatedField* -LiteralUtil::GetMutableRepeatedField(Literal* literal) { - CHECK(literal->shape().element_type() == F32); - return literal->mutable_f32s(); +tensorflow::gtl::ArraySlice Literal::GetArraySlice() const { + CHECK_EQ(shape().element_type(), U8); + return tensorflow::gtl::ArraySlice( + reinterpret_cast(u8s().data()), u8s().size()); } template <> -/* static */ tensorflow::gtl::ArraySlice -LiteralUtil::GetArraySlice(const Literal& literal) { - CHECK(literal.shape().element_type() == F64); - return literal.f64s(); +tensorflow::gtl::ArraySlice Literal::GetArraySlice() const { + CHECK_EQ(shape().element_type(), S8); + return tensorflow::gtl::ArraySlice( + reinterpret_cast(u8s().data()), u8s().size()); } template <> -/* static */ tensorflow::protobuf::RepeatedField* -LiteralUtil::GetMutableRepeatedField(Literal* literal) { - CHECK(literal->shape().element_type() == F64); - return literal->mutable_f64s(); +tensorflow::gtl::ArraySlice Literal::GetArraySlice() const { + CHECK_EQ(shape().element_type(), U32); + return u32s(); +} + +template <> +tensorflow::gtl::ArraySlice Literal::GetArraySlice() const { + CHECK_EQ(shape().element_type(), U64); + return u64s(); +} + +template <> +tensorflow::gtl::ArraySlice Literal::GetArraySlice() const { + CHECK_EQ(shape().element_type(), S32); + return s32s(); +} + +template <> +tensorflow::gtl::ArraySlice Literal::GetArraySlice() const { + CHECK_EQ(shape().element_type(), S64); + return s64s(); +} + +template <> +tensorflow::gtl::ArraySlice Literal::GetArraySlice() const { + CHECK_EQ(shape().element_type(), F64); + return f64s(); +} + +template <> +tensorflow::gtl::ArraySlice Literal::GetArraySlice() const { + CHECK_EQ(shape().element_type(), F16); + return tensorflow::gtl::ArraySlice( + reinterpret_cast(f16s().data()), + f16s().size() / sizeof(half)); } template @@ -865,46 +1037,48 @@ static bool AllElementsEqualValue(const Literal& literal, NativeT value) { for (int64 i = 0; i < ShapeUtil::ElementsIn(literal.shape()); ++i) { auto multi_index = IndexUtil::LinearIndexToMultidimensionalIndex(literal.shape(), i); - if (LiteralUtil::Get(literal, multi_index) != value) { + if (literal.Get(multi_index) != value) { return false; } } return true; } -/* static */ bool LiteralUtil::IsAll(const Literal& literal, int8 value) { - switch (literal.shape().element_type()) { +bool Literal::IsAll(int8 value) const { + switch (shape().element_type()) { case U8: if (value >= 0) { - return AllElementsEqualValue(literal, value); + return AllElementsEqualValue(*this, value); } return false; case U32: if (value >= 0) { - return AllElementsEqualValue(literal, value); + return AllElementsEqualValue(*this, value); } return false; case U64: if (value >= 0) { - return AllElementsEqualValue(literal, value); + return AllElementsEqualValue(*this, value); } return false; case S8: - return AllElementsEqualValue(literal, value); + return AllElementsEqualValue(*this, value); case S32: - return 
AllElementsEqualValue(literal, value); + return AllElementsEqualValue(*this, value); case S64: - return AllElementsEqualValue(literal, value); + return AllElementsEqualValue(*this, value); case F32: - return AllElementsEqualValue(literal, value); + return AllElementsEqualValue(*this, value); case F64: - return AllElementsEqualValue(literal, value); + return AllElementsEqualValue(*this, value); + case F16: + return AllElementsEqualValue(*this, static_cast(value)); case PRED: if (value == 0) { - return AllElementsEqualValue(literal, false); + return AllElementsEqualValue(*this, false); } if (value == 1) { - return AllElementsEqualValue(literal, true); + return AllElementsEqualValue(*this, true); } return false; default: @@ -912,89 +1086,219 @@ static bool AllElementsEqualValue(const Literal& literal, NativeT value) { } } -/* static */ bool LiteralUtil::IsAllFloat(const Literal& literal, float value) { - switch (literal.shape().element_type()) { +bool Literal::IsAllFloat(float value) const { + switch (shape().element_type()) { case F32: - return AllElementsEqualValue(literal, value); + return AllElementsEqualValue(*this, value); case F64: - return AllElementsEqualValue(literal, value); + return AllElementsEqualValue(*this, value); + case F16: + return AllElementsEqualValue(*this, static_cast(value)); default: return false; } } -/* static */ bool LiteralUtil::IsZero( - const Literal& literal, tensorflow::gtl::ArraySlice indices) { - switch (literal.shape().element_type()) { +bool Literal::IsZero(tensorflow::gtl::ArraySlice indices) const { + switch (shape().element_type()) { case U8: - return Get(literal, indices) == 0; + return Get(indices) == 0; case U32: - return Get(literal, indices) == 0; + return Get(indices) == 0; case U64: - return Get(literal, indices) == 0; + return Get(indices) == 0; case S8: - return Get(literal, indices) == 0; + return Get(indices) == 0; case S32: - return Get(literal, indices) == 0; + return Get(indices) == 0; case S64: - return Get(literal, indices) == 0; + return Get(indices) == 0; case F32: - return Get(literal, indices) == 0.0f; + return Get(indices) == 0.0f; case F64: - return Get(literal, indices) == 0.0; + return Get(indices) == 0.0; + case F16: + return Get(indices) == static_cast(0.0f); case PRED: - return Get(literal, indices) == false; + return Get(indices) == false; default: LOG(FATAL) << "Input literal must be an array."; } } template <> -/* static */ void LiteralUtil::PopulateWithValue( - int64 value, tensorflow::gtl::ArraySlice dimensions, - Literal* literal) { - *literal->mutable_shape() = ShapeUtil::MakeShape( - primitive_util::NativeToPrimitiveType(), dimensions); - tensorflow::protobuf::RepeatedField* - repeated_field = - GetMutableRepeatedField(literal); - for (int64 i = 0; i < ShapeUtil::ElementsIn(literal->shape()); ++i) { - repeated_field->Add(value); +/* static */ void Literal::Resize(int64 num_elements, bool value) { + CHECK_EQ(ShapeUtil::ElementsIn(shape()), num_elements); + mutable_preds()->resize(num_elements, value); +} + +template <> +void Literal::Resize(int64 num_elements, int8 value) { + CHECK_EQ(ShapeUtil::ElementsIn(shape()), num_elements); + mutable_u8s()->resize(num_elements, value); +} + +template <> +void Literal::Resize(int64 num_elements, uint8 value) { + CHECK_EQ(ShapeUtil::ElementsIn(shape()), num_elements); + mutable_u8s()->resize(num_elements, value); +} + +template <> +void Literal::Resize(int64 num_elements, int32 value) { + CHECK_EQ(ShapeUtil::ElementsIn(shape()), num_elements); + 
mutable_s32s()->resize(num_elements, value); +} + +template <> +void Literal::Resize(int64 num_elements, uint32 value) { + CHECK_EQ(ShapeUtil::ElementsIn(shape()), num_elements); + mutable_u32s()->resize(num_elements, value); +} + +template <> +void Literal::Resize(int64 num_elements, int64 value) { + CHECK_EQ(ShapeUtil::ElementsIn(shape()), num_elements); + mutable_s64s()->resize(num_elements, value); +} + +template <> +void Literal::Resize(int64 num_elements, uint64 value) { + CHECK_EQ(ShapeUtil::ElementsIn(shape()), num_elements); + mutable_u64s()->resize(num_elements, value); +} + +template <> +void Literal::Resize(int64 num_elements, float value) { + CHECK_EQ(ShapeUtil::ElementsIn(shape()), num_elements); + mutable_f32s()->resize(num_elements, value); +} + +template <> +void Literal::Resize(int64 num_elements, double value) { + CHECK_EQ(ShapeUtil::ElementsIn(shape()), num_elements); + mutable_f64s()->resize(num_elements, value); +} + +template <> +void Literal::Resize(int64 num_elements, half value) { + CHECK_EQ(ShapeUtil::ElementsIn(shape()), num_elements); + mutable_f16s()->resize(num_elements, value); +} + +template +static void CopyToRepeatedField(RepeatedFieldT* dest, + const std::vector& src) { + *dest = RepeatedFieldT(src.begin(), src.end()); +} + +template +static void CopyToRepeatedBoolField(RepeatedFieldT* dest, + const BoolVector& src) { + *dest = RepeatedFieldT(src.begin(), src.end()); +} + +LiteralProto Literal::ToProto() const { + LiteralProto proto; + proto.Clear(); + *proto.mutable_shape() = shape(); + switch (shape().element_type()) { + case PRED: + if (preds().begin()) { + CopyToRepeatedBoolField(proto.mutable_preds(), preds()); + } + break; + case U8: + *proto.mutable_u8s() = u8s_string(); + break; + case S32: + CopyToRepeatedField(proto.mutable_s32s(), s32s()); + break; + case S64: + CopyToRepeatedField(proto.mutable_s64s(), s64s()); + break; + case U32: + CopyToRepeatedField(proto.mutable_u32s(), u32s()); + break; + case U64: + CopyToRepeatedField(proto.mutable_u64s(), u64s()); + break; + case F16: + *proto.mutable_f16s() = + string(reinterpret_cast(f16s_.data()), + f16s_.size() * sizeof(half)); + break; + case F32: + CopyToRepeatedField(proto.mutable_f32s(), f32s()); + break; + case F64: + CopyToRepeatedField(proto.mutable_f64s(), f64s()); + break; + case TUPLE: + for (const auto& tuple : tuple_literals()) { + *proto.add_tuple_literals() = tuple.ToProto(); + } + break; + default: + LOG(FATAL) << "Unhandled primitive type " << shape().element_type(); } + + return proto; } -template <> -/* static */ void LiteralUtil::PopulateWithValue( - uint64 value, tensorflow::gtl::ArraySlice dimensions, - Literal* literal) { - *literal->mutable_shape() = ShapeUtil::MakeShape( - primitive_util::NativeToPrimitiveType(), dimensions); - tensorflow::protobuf::RepeatedField* - repeated_field = - GetMutableRepeatedField(literal); - for (int64 i = 0; i < ShapeUtil::ElementsIn(literal->shape()); ++i) { - repeated_field->Add(value); +template +static void CopyFromRepeatedField(std::vector* dest, + const RepeatedFieldT& src) { + *dest = std::vector(src.begin(), src.end()); +} + +void Literal::CopyFromProto(const LiteralProto& literal_proto) { + if (!literal_proto.has_shape()) { + return; } -} -template <> -/* static */ void LiteralUtil::Resize(int64 num_elements, int64 value, - Literal* literal) { - CHECK_EQ(ShapeUtil::ElementsIn(literal->shape()), num_elements); - tensorflow::protobuf::RepeatedField* - repeated_field = - GetMutableRepeatedField(literal); - 
repeated_field->Resize(num_elements, value); -} - -template <> -/* static */ void LiteralUtil::Resize(int64 num_elements, uint64 value, - Literal* literal) { - CHECK_EQ(ShapeUtil::ElementsIn(literal->shape()), num_elements); - tensorflow::protobuf::RepeatedField* - repeated_field = - GetMutableRepeatedField(literal); - repeated_field->Resize(num_elements, value); + *mutable_shape() = literal_proto.shape(); + switch (shape().element_type()) { + case PRED: + *mutable_preds() = BoolVector(literal_proto.preds().begin(), + literal_proto.preds().end()); + break; + case U8: + set_u8s(literal_proto.u8s()); + break; + case S32: + CopyFromRepeatedField(mutable_s32s(), literal_proto.s32s()); + break; + case S64: + CopyFromRepeatedField(mutable_s64s(), literal_proto.s64s()); + break; + case U32: + CopyFromRepeatedField(mutable_u32s(), literal_proto.u32s()); + break; + case U64: + CopyFromRepeatedField(mutable_u64s(), literal_proto.u64s()); + break; + case F16: { + const string& s(literal_proto.f16s()); + CHECK_EQ(0, s.size() % sizeof(half)); + f16s_ = std::vector(s.size() / sizeof(half)); + memcpy(f16s_.data(), s.data(), s.size()); + break; + } + case F32: + CopyFromRepeatedField(mutable_f32s(), literal_proto.f32s()); + break; + case F64: + CopyFromRepeatedField(mutable_f64s(), literal_proto.f64s()); + break; + case TUPLE: + for (const auto& proto : literal_proto.tuple_literals()) { + mutable_tuple_literals()->push_back(Literal(proto)); + } + break; + default: + LOG(FATAL) << "Unhandled primitive type " << shape().element_type(); + } } } // namespace xla diff --git a/tensorflow/compiler/xla/literal_util.h b/tensorflow/compiler/xla/literal_util.h index db467a59113..42c8b61acec 100644 --- a/tensorflow/compiler/xla/literal_util.h +++ b/tensorflow/compiler/xla/literal_util.h @@ -20,6 +20,7 @@ limitations under the License. #include #include +#include #include #include #include @@ -33,6 +34,7 @@ limitations under the License. #include "tensorflow/compiler/xla/primitive_util.h" #include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -47,15 +49,210 @@ limitations under the License. namespace xla { +// This class is a simple vector of boolean values. It's used to workaround some +// implementations of std::vector that use a bitset which does not have +// the semantics expected by Literal::preds(). 
+class BoolVector { + public: + typedef bool* iterator; + typedef const bool* const_iterator; + + BoolVector() : bits_(nullptr), size_(0), capacity_(0) {} + + BoolVector(const_iterator other_begin, const_iterator other_end) + : bits_(nullptr), size_(0), capacity_(0) { + if (other_begin && other_end) { + resize(other_end - other_begin); + memcpy(begin(), other_begin, size()); + } + } + + BoolVector(const BoolVector& other) { CopyFrom(other); } + + BoolVector& operator=(const BoolVector& other) { + CopyFrom(other); + return *this; + } + + void push_back(const bool& value) { + resize(size_ + 1); + bits_[size_ - 1] = value; + } + + bool* data() const { return bits_.get(); } + + size_t size() const { return size_; } + + size_t capacity() const { return capacity_; } + + void resize(size_t new_size, bool val = false) { + if (new_size == 0) { + bits_.reset(nullptr); + size_ = 0; + capacity_ = 0; + } else { + size_t old_size = size(); + if (new_size > old_size) { + grow(new_size); + } + if (old_size < new_size) { + memset(&bits_[old_size], val, new_size - old_size); + } + size_ = new_size; + } + } + + void clear() { + bits_.reset(nullptr); + size_ = 0; + capacity_ = 0; + } + + iterator begin() { return &bits_[0]; } + iterator end() { return &bits_[size()]; } + const_iterator begin() const { return &bits_[0]; } + const_iterator end() const { return &bits_[size()]; } + + private: + void grow(size_t n) { + if (capacity_ < n) { + capacity_ = 2 * n; + bool* new_bits = new bool[capacity_](); + if (size_ > 0) { + memcpy(new_bits, bits_.get(), size_); + } + bits_.reset(new_bits); + } + } + + void CopyFrom(const BoolVector& other) { + bits_ = MakeUnique(other.capacity()); + memcpy(begin(), other.begin(), other.size()); + size_ = other.size(); + capacity_ = other.capacity(); + } + + std::unique_ptr bits_; + size_t size_; + size_t capacity_; +}; + // Utility class for dealing with XLA literal values. Most methods are // templated by native (host) type which corresponds to a unique XLA // PrimitiveType. See ComputationBuilder for details. Not all primitive types // defined in xla_data.proto have a corresponding native type or even have a // storage location in the Literal proto yet (for example, primitive type F16). -class LiteralUtil { +class Literal { public: - // Create new literal of a given rank. To minimize ambiguity (for users and - // the compiler) these CreateR[0-2] methods should explicitly specify the + Literal() {} + + Literal(const Literal& other) = default; + + explicit Literal(const LiteralProto& other) { CopyFromProto(other); } + + Literal& operator=(const Literal& other) = default; + + LiteralProto ToProto() const; + + bool has_shape() const { + return shape_.element_type() != PRIMITIVE_TYPE_INVALID; + } + + // Basic accessor functions. Names mirror the original protobuf + // functions for convenience. 
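The BoolVector above exists because std::vector<bool> is commonly specialized as a packed bitset with no usable bool* data(); a quick sketch of the byte-per-element behavior it guarantees:

```c++
// Sketch only: BoolVector keeps one byte per element, so data() is a real
// bool* into contiguous storage.
BoolVector preds;
preds.push_back(true);
preds.push_back(false);
preds.resize(4, true);  // now {true, false, true, true}
bool* raw = preds.data();
CHECK(raw[0] && !raw[1] && raw[2] && raw[3]);
```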
+ string DebugString() const { return ToProto().DebugString(); } + string ShortDebugString() const { return ToProto().ShortDebugString(); } + + void Clear() { + shape_.Clear(); + preds_.clear(); + u8s_.clear(); + s32s_.clear(); + s64s_.clear(); + u32s_.clear(); + u64s_.clear(); + f16s_.clear(); + f32s_.clear(); + f64s_.clear(); + tuple_literals_.clear(); + } + + int preds_size() const { return preds().size(); } + const BoolVector& preds() const { return preds_; } + BoolVector* mutable_preds() { return &preds_; } + + int s32s_size() const { return s32s().size(); } + int32 s32s(int i) const { return s32s_[i]; } + const std::vector<int32>& s32s() const { return s32s_; } + std::vector<int32>* mutable_s32s() { return &s32s_; } + + int s64s_size() const { return s64s().size(); } + void add_s64s(int64 value) { s64s_.push_back(value); } + const std::vector<int64>& s64s() const { return s64s_; } + std::vector<int64>* mutable_s64s() { return &s64s_; } + + int u32s_size() const { return u32s().size(); } + uint32 u32s(int i) const { return u32s_[i]; } + const std::vector<uint32>& u32s() const { return u32s_; } + std::vector<uint32>* mutable_u32s() { return &u32s_; } + + int u64s_size() const { return u64s().size(); } + const std::vector<uint64>& u64s() const { return u64s_; } + std::vector<uint64>* mutable_u64s() { return &u64s_; } + + int f16s_size() const { return f16s().size(); } + half f16s(int i) const { return f16s_[i]; } + const std::vector<half>& f16s() const { return f16s_; } + std::vector<half>* mutable_f16s() { return &f16s_; } + + int f32s_size() const { return f32s().size(); } + float f32s(int i) const { return f32s_[i]; } + void add_f32s(float value) { f32s_.push_back(value); } + const std::vector<float>& f32s() const { return f32s_; } + std::vector<float>& f32s() { return f32s_; } + std::vector<float>* mutable_f32s() { return &f32s_; } + + int f64s_size() const { return f64s().size(); } + const std::vector<double>& f64s() const { return f64s_; } + std::vector<double>* mutable_f64s() { return &f64s_; } + + int tuple_literals_size() const { return tuple_literals().size(); } + const Literal& tuple_literals(int i) const { return tuple_literals_[i]; } + Literal* add_tuple_literals() { + tuple_literals_.push_back(Literal()); + return &tuple_literals_.back(); + } + std::vector<Literal>* mutable_tuple_literals() { return &tuple_literals_; } + const std::vector<Literal>& tuple_literals() const { return tuple_literals_; } + + int u8s_size() const { return u8s().size(); } + const std::vector<uint8>& u8s() const { return u8s_; } + void set_u8s(const std::vector<uint8>& value) { u8s_ = value; } + void set_u8s(tensorflow::StringPiece value) { + u8s_ = std::vector<uint8>(value.size()); + u8s_.clear(); + append_u8s(value); + } + + void append_u8s(tensorflow::StringPiece value) { + u8s_.insert(u8s_.end(), value.begin(), value.end()); + } + + string u8s_string() const { return string(u8s().begin(), u8s().end()); } + + std::vector<uint8>* mutable_u8s() { return &u8s_; } + + const Shape& shape() const { return shape_; } + Shape* mutable_shape() { return &shape_; } + + void Swap(Literal* other) { + Literal temp = *this; + *this = *other; + *other = temp; + } + + // Creates a new literal of a given rank. To minimize ambiguity (for users + // and the compiler) these CreateR[0-2] methods should explicitly specify the + // native type. For example: // // CreateR1<float>({1.0, 42.0}); @@ -100,75 +297,98 @@ class LiteralUtil { values, const Layout& layout); - // Creates a new value that has the equivalent value as literal, but conforms - // to new_layout; e.g. 
a literal matrix that was in {0, 1} minor-to-major - // dimension layout can be re-layed-out as {1, 0} minor-to-major dimension - // layout and the value in the cell at any given logical index (i0, i1) will - // be the same. + // Creates a new Literal object with the shape specified as parameter. + // The content of the literal values is the default value of the primitive + // type of literal itself (0 for numeric types, and false for predicates). + static std::unique_ptr<Literal> CreateFromShape(const Shape& shape); + + // Creates a new Literal object with its values having the primitive_type + // type, and with dimensions defined by the dimensions parameter. + // The content of the literal values is the default value of the primitive + // type of literal itself (0 for numeric types, and false for predicates). + static std::unique_ptr<Literal> CreateFromDimensions( + PrimitiveType primitive_type, + tensorflow::gtl::ArraySlice<int64> dimensions); + + // Copies the values from src_literal, starting at src_base shape indexes, + // to this literal, starting at dest_base, where the copy size in each + // dimension is specified by copy_size. + // The src_literal and this literal must have the same primitive type, + // src_base+copy_size must fit the source literal dimensions, and + // dest_base+copy_size must fit the destination literal dimensions. + Status Copy(const Literal& src_literal, + tensorflow::gtl::ArraySlice<int64> src_base, + tensorflow::gtl::ArraySlice<int64> dest_base, + tensorflow::gtl::ArraySlice<int64> copy_size); + + // Creates a new value that has the equivalent value as this literal, but + // conforms to new_layout; e.g. a literal matrix that was in {0, 1} + // minor-to-major dimension layout can be re-laid-out as {1, 0} + // minor-to-major dimension layout and the value in the cell at any given + // logical index (i0, i1) will be the same. // // Note: this is useful when the client wants to ensure that a value placed in // the XLA allocation tracker has a particular layout; for efficiency // purposes or avoiding unimplemented operation/layout combinations. - static std::unique_ptr<Literal> Relayout(const Literal& literal, - const Layout& new_layout); + std::unique_ptr<Literal> Relayout(const Layout& new_layout) const; - // Reshapes literal 'input' to have 'shape'. Both the original shape and - // 'shape' must contain the same number of elements. The implementation - // currently only supports monotonic dim0-major layouts. - static StatusOr<std::unique_ptr<Literal>> Reshape( - const xla::Literal& input, tensorflow::gtl::ArraySlice<int64> shape); + // Creates a new literal by reshaping this literal to have 'shape'. Both the + // original shape and 'shape' must contain the same number of elements. The + // implementation currently only supports monotonic dim0-major layouts. + StatusOr<std::unique_ptr<Literal>> Reshape( + tensorflow::gtl::ArraySlice<int64> shape) const; - // Creates a new literal by reordering the dimensions of the original literal. + // Creates a new literal by reordering the dimensions of this literal. // The given `permutation` must be a permutation of the dimension numbers // in the original literal, and it specifies the order of the new dimensions // in the result literal (i.e., new_order[i] = old_order[permutation[i]]). // For example, a transpose call on a literal of shape [3 x 8 x 4] and // `permutation` = {2, 0, 1} returns a new literal of shape [4 x 3 x 8]. 
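A sketch of the permutation semantics documented above, using hypothetical shape values:

```c++
// Sketch only: Transpose() with permutation {2, 0, 1}.
auto lit = Literal::CreateFromDimensions(F32, {3, 8, 4});
std::unique_ptr<Literal> transposed = lit->Transpose({2, 0, 1});
// transposed->shape() is f32[4x3x8]; element (i0, i1, i2) of the result maps
// to element (i1, i2, i0) of the original.
```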
- static std::unique_ptr Transpose( - const Literal& literal, tensorflow::gtl::ArraySlice permutation); + std::unique_ptr Transpose( + tensorflow::gtl::ArraySlice permutation) const; - // Creates a sub-array from the the given literal by extracting the indices + // Creates a sub-array from this literal by extracting the indices // [start_index, limit_index) of each dimension. The result literal has the // same rank and layout as for the given literal. The number of indices in // start_indices and limit_indices must be the rank of the literal, and the // indices follow the order of the dimensions. - static std::unique_ptr Slice( - const Literal& literal, tensorflow::gtl::ArraySlice start_indices, - tensorflow::gtl::ArraySlice limit_indices); + std::unique_ptr Slice( + tensorflow::gtl::ArraySlice start_indices, + tensorflow::gtl::ArraySlice limit_indices) const; // Creates a literal with a prepended dimension with bound "times"; e.g. a - // f32[3x2] with times=4 will produce a f32[4x3x2] with the 3x2 from the input + // f32[3x2] with times=4 will produce a f32[4x3x2] with the 3x2 from this // literal replicated four times. template - static std::unique_ptr Replicate(const Literal& input, int64 times); + std::unique_ptr Replicate(int64 times) const; - // Create a literal by converting each element in an original literal to a new + // Creates a literal by converting each element in this literal to a new // type. template - static std::unique_ptr Convert(const Literal& literal); + std::unique_ptr Convert() const; - // Create a literal value zero of the given primitive type. + // Creates a literal value zero of the given primitive type. static Literal Zero(PrimitiveType primitive_type); - // Create a literal value one of the given primitive type. + // Creates a literal value one of the given primitive type. static Literal One(PrimitiveType primitive_type); // Creates a literal value containing the minimum value of the given // primitive type. For floating-point types, returns -inf. static Literal MinValue(PrimitiveType primitive_type); - // Create a literal value containing the maximum value of the given + // Creates a literal value containing the maximum value of the given // primitive type. For floating-point types, returns inf. static Literal MaxValue(PrimitiveType primitive_type); - // Create a literal of the given shape where each element is `value`. + // Creates a literal of the given shape where each element is `value`. template static std::unique_ptr CreateFullWithMonotonicDim0MajorLayout( tensorflow::gtl::ArraySlice dimensions, NativeT value); - // Create a new literal from an array. The variants not ending with WithLayout - // use the default XLA layout for the literal's linear representation in - // memory. + // Creates a new literal from an array. The variants not ending with + // WithLayout use the default XLA layout for the literal's linear + // representation in memory. template static std::unique_ptr CreateR2FromArray2D( const Array2D& values); @@ -210,28 +430,33 @@ class LiteralUtil { std::initializer_list> values, int64 projection_p, int64 projection_z); - // Clones literal into an owned unique_ptr version. - static std::unique_ptr CloneToUnique(const Literal& literal); + // Clones this literal into an owned unique_ptr version. + std::unique_ptr CloneToUnique() const; + + // Returns the linear index of the given index within this literal's + // element_type repeated field. 
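The templated Convert() above is assumed here to take a source and a destination native type as its two template parameters (the parameter list is elided in this rendering); a sketch:

```c++
// Sketch only: element-wise conversion from int32 to float.
auto s32 = Literal::CreateR1<int32>({1, 2, 3});
std::unique_ptr<Literal> f32 = s32->Convert<int32, float>();
CHECK_EQ(f32->Get<float>({2}), 3.0f);
```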
+ int64 LinearIndex(tensorflow::gtl::ArraySlice multi_index) const; // Gets or sets an element in the literal at the given index. The index is // CHECKed against the dimension sizes. template - static NativeT Get(const Literal& literal, - tensorflow::gtl::ArraySlice multi_index); + NativeT Get(tensorflow::gtl::ArraySlice multi_index) const; template - static void Set(Literal* literal, - tensorflow::gtl::ArraySlice multi_index, - NativeT value); + void Set(tensorflow::gtl::ArraySlice multi_index, NativeT value); + + // Retrieves the mutable array slice interface which can be used to manipulate + // pre-allocated literal values. + template + tensorflow::gtl::MutableArraySlice GetMutableArraySlice(); // Returns the element value at index (0, ..., 0), however many zeroes are // required for that index. template - static NativeT GetFirstElement(const Literal& literal); + NativeT GetFirstElement() const; // As Get(), but determines the correct type and converts the value // into text. - static string GetAsString(const Literal& literal, - tensorflow::gtl::ArraySlice multi_index); + string GetAsString(tensorflow::gtl::ArraySlice multi_index) const; // Returns an identity matrix (rank 2) with the given row and column count. template @@ -243,10 +468,530 @@ class LiteralUtil { // Validates that the data payload of the literal matches the literal shape; // if it does not, an appropriate status is returned. - static tensorflow::Status ValidateLiteral(const Literal& literal); + tensorflow::Status ValidateLiteral() const; // Returns a string representation of the literal value. - static string ToString(const Literal& literal); + string ToString() const; + + // Invokes the "per cell" callback for each element in the provided + // literal with the element's indices and a string representation of + // the element's value. + // + // This function is useful if you want a polymorphic representation + // of the tensor's elements (turning it to a string for something + // like representation in a protobuf). + void EachCellAsString( + const std::function indices, + const string& value)>& per_cell) const; + template + void EachCell(std::function indices, + NativeT value)> + per_cell) const; + + // Templated methods which populate the given repeated field in this literal + // with the given value(s). The Shape field of this literal is set + // to match the array dimensions and type. Examples: + // + // // Populate with floats. + // Array2D float_values = ... + // literal.PopulateR2FromArray2D(values); + // + // // Populate with int32s. + // literal.PopulateR2({{1, 2}, {3, 4}}); + // + template + void PopulateR0(NativeT values); + template + void PopulateR1(tensorflow::gtl::ArraySlice values); + void PopulateR1(const tensorflow::core::Bitmap& values); + template + void PopulateR2(std::initializer_list> values); + template + void PopulateR2WithLayout( + std::initializer_list> values, + const Layout& layout); + template + void PopulateR2FromArray2D(const Array2D& values); + template + void PopulateR2FromArray2DWithLayout(const Array2D& values, + const Layout& layout); + template + void PopulateR3FromArray3D(const Array3D& values); + template + void PopulateR3FromArray3DWithLayout(const Array3D& values, + const Layout& layout); + template + void PopulateR4FromArray4D(const Array4D& values); + template + void PopulateR4FromArray4DWithLayout(const Array4D& values, + const Layout& layout); + + // Populates literal values by calling the generator function for every cell + // in this literal object. 
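A sketch of the generator-based Populate() declared just below, assuming it returns a Status and hands each cell its multidimensional index:

```c++
// Sketch only: filling a 2x2 F32 literal from its indices.
auto lit = Literal::CreateFromDimensions(F32, {2, 2});
TF_CHECK_OK(lit->Populate<float>(
    [](tensorflow::gtl::ArraySlice<int64> indexes) {
      return indexes[0] * 10.0f + indexes[1];  // hypothetical values
    }));
```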
+ template <typename NativeT> + Status Populate( + const std::function<NativeT(tensorflow::gtl::ArraySlice<int64> indexes)>& + generator); + + // Creates a Literal of the given dimensions with all elements set to the + // given value. + template <typename NativeT> + void PopulateWithValue(NativeT value, + tensorflow::gtl::ArraySlice<int64> dimensions); + + // Returns a pointer to the underlying vector corresponding to the Literal's + // shape. + const void* InternalData() const; + void* MutableInternalData(); + + // Allocates space in the underlying vector of this literal sufficient to hold + // num_elements of this literal's primitive type. Values in the vector are set + // to zero. num_elements must equal the number of elements in the literal's + // shape. + void Reserve(int64 num_elements); + + // Allocates space in the underlying vector of this literal sufficient to hold + // num_elements of this literal's primitive type and sets each element in this + // literal to the given value. num_elements must equal the number of elements + // in this literal's shape. + template <typename NativeT> + void Resize(int64 num_elements, NativeT value); + + // Returns true if this literal has the same shape and value as the given + // literal. Layout is not considered in the comparison. + bool Equal(const Literal& literal2) const; + + // Returns whether every element in this literal is equal to value. + // + // value is an int8 because we expect this to be called with small + // compile-time constants (0, -1, etc.) and so that whatever value you pass + // can be represented exactly by floating-point types as small as 16 bits. + // + // If value doesn't fit in this literal's type, returns false. Values of 1/0 + // are considered equal to true/false; other values are not considered equal + // to true. + bool IsAll(int8 value) const; + + // Like IsAll(int8), except we check whether the literal is + // equal to a particular floating-point number. + // + // If the literal is not a floating-point value, this always returns false. + // + // This casts value to the type of literal, then compares using ==. The usual + // admonishments about floating-point equality checks apply. We expect you to + // use this to check for values that can be expressed precisely as a float, + // e.g. -0.5. + bool IsAllFloat(float value) const; + + // Returns whether this literal is zero at the specified index. This literal + // must be an array. + bool IsZero(tensorflow::gtl::ArraySlice<int64> indices) const; + + private: + // Returns an ArraySlice view of the array for this literal for the given + // NativeT (e.g., float). These functions map native type to XLA PrimitiveType + // via template specialization. The unspecialized form below aborts to handle + // the error case where the given native type does not map to an XLA primitive + // type. + template <typename NativeT> + tensorflow::gtl::ArraySlice<NativeT> GetArraySlice() const { + static_assert(!std::is_same<NativeT, NativeT>::value, + "Cannot map native type to primitive type."); + } + + // Copy from a LiteralProto instance. + void CopyFromProto(const LiteralProto& literal_proto); + + // Internal template helper for the Copy() API, matching its arguments one by + // one. + template <typename NativeT> + Status CopyRange(const Literal& src_literal, + tensorflow::gtl::ArraySlice<int64> src_base, + tensorflow::gtl::ArraySlice<int64> dest_base, + tensorflow::gtl::ArraySlice<int64> copy_size); + + // Utility structure which is used to create the optimal configuration for + // a ShapeUtil::ForEachIndex() scan across two literals. 
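Before the private implementation details that follow, a sketch of the value predicates documented above (hypothetical shapes and values):

```c++
// Sketch only: PopulateWithValue() plus the IsAll()/IsZero() predicates.
Literal lit;
lit.PopulateWithValue<float>(0.0f, {2, 3});
CHECK(lit.IsAll(0));          // 0 is representable exactly as a float
CHECK(lit.IsAllFloat(0.0f));
CHECK(lit.IsZero({1, 2}));    // element at index (1, 2)
```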
+ struct StrideConfig { + StrideConfig(const Shape& source_shape, const Shape& dest_shape, + tensorflow::gtl::ArraySlice dimensions); + + // The dimensions of the stride operation. Essentially every dimension + // will be iterated from base[i] to base[i]+dimensions[i], in step[i] + // steps. + tensorflow::gtl::ArraySlice dimensions; + DimensionVector base; + DimensionVector step; + int64 minor_dimension = 0; + // The size of the strides for source and destination. One of the two + // (the one looping through its most minor dimension) will be 1, while + // the other will be the stride size at the dimension matching the other + // shape most minor dimension being scanned. + int64 dest_stride = 1; + int64 source_stride = 1; + // The size of the inner loop on the most minor dimension. + int64 minor_loop_size = 1; + }; + + Shape shape_; + BoolVector preds_; + std::vector u8s_; + std::vector s32s_; + std::vector s64s_; + std::vector u32s_; + std::vector u64s_; + std::vector f16s_; + std::vector f32s_; + std::vector f64s_; + std::vector tuple_literals_; +}; + +// Utility class for dealing with XLA literal values. Most methods are +// templated by native (host) type which corresponds to a unique XLA +// PrimitiveType. See ComputationBuilder for details. Not all primitive types +// defined in xla_data.proto have a corresponding native type or even have a +// storage location in the Literal proto yet (for example, primitive type F16). +// +// TODO(dnovillo) - All functions in this class simply redirect to the +// corresponding function in class Literal. Remove this class after converting +// all user code to use Literal directly. +class LiteralUtil { + public: + // Creates new literal of a given rank. To minimize ambiguity (for users and + // the compiler) these CreateR[0-2] methods should explicitly specify the + // native type. For example: + // + // CreateR1({1.0, 42.0}); + // CreateR2({{1, 2}, {3, 4}}); + // + // The variants not ending with WithLayout use the default XLA layout for the + // literal's linear representation in memory. + template + static std::unique_ptr CreateR0(NativeT value) { + return Literal::CreateR0(value); + } + + template + static std::unique_ptr CreateR1( + tensorflow::gtl::ArraySlice values) { + return Literal::CreateR1(values); + } + + static std::unique_ptr CreateR1( + const tensorflow::core::Bitmap& values) { + return Literal::CreateR1(values); + } + + template + static std::unique_ptr CreateR2( + std::initializer_list> values) { + return Literal::CreateR2(values); + } + + template + static std::unique_ptr CreateR2WithLayout( + std::initializer_list> values, + const Layout& layout) { + return Literal::CreateR2WithLayout(values, layout); + } + + template + static std::unique_ptr CreateR3( + std::initializer_list< + std::initializer_list>> + values) { + return Literal::CreateR3(values); + } + + template + static std::unique_ptr CreateR3WithLayout( + std::initializer_list< + std::initializer_list>> + values, + const Layout& layout) { + return Literal::CreateR3WithLayout(values, layout); + } + + template + static std::unique_ptr CreateR4( + std::initializer_list>>> + values) { + return Literal::CreateR4(values); + } + + template + static std::unique_ptr CreateR4WithLayout( + std::initializer_list>>> + values, + const Layout& layout) { + return Literal::CreateR4WithLayout(values, layout); + } + + // Creates a new Literal object with the shape specified as parameter. 
+ + // Creates a new Literal object with the shape specified as parameter. + // The content of the literal values is the default value of the primitive + // type of literal itself (0 for numeric types, and false for predicates). + static std::unique_ptr CreateFromShape(const Shape& shape) { + return Literal::CreateFromShape(shape); + } + + // Creates a new Literal object with its values having the primitive_type + // type, and with dimensions defined by the dimensions parameter. + // The content of the literal values is the default value of the primitive + // type of literal itself (0 for numeric types, and false for predicates). + static std::unique_ptr CreateFromDimensions( + PrimitiveType primitive_type, + tensorflow::gtl::ArraySlice dimensions) { + return Literal::CreateFromDimensions(primitive_type, dimensions); + } + + // Copies the values from src_literal, starting at src_base shape indexes, + // to dest_literal, starting at dest_base, where the copy size in each + // dimension is specified by copy_size. + // + // The src_literal and dest_literal must have the same primitive type, + // src_base+copy_size must fit within the source literal dimensions, and + // dest_base+copy_size must fit within the destination literal dimensions. + static Status Copy(const Literal& src_literal, + tensorflow::gtl::ArraySlice src_base, + Literal* dest_literal, + tensorflow::gtl::ArraySlice dest_base, + tensorflow::gtl::ArraySlice copy_size) { + return dest_literal->Copy(src_literal, src_base, dest_base, copy_size); + } + + // Creates a new value that has the same value as literal, but conforms + // to new_layout; e.g. a literal matrix that was in {0, 1} minor-to-major + // dimension layout can be re-laid-out as {1, 0} minor-to-major dimension + // layout and the value in the cell at any given logical index (i0, i1) will + // be the same. + // + // Note: this is useful when the client wants to ensure that a value placed in + // the XLA allocation tracker has a particular layout; for efficiency + // purposes or avoiding unimplemented operation/layout combinations. + static std::unique_ptr Relayout(const Literal& literal, + const Layout& new_layout) { + return literal.Relayout(new_layout); + } + + // Reshapes literal 'input' to have 'shape'. Both the original shape and + // 'shape' must contain the same number of elements. The implementation + // currently only supports monotonic dim0-major layouts. + static StatusOr> Reshape( + const xla::Literal& input, tensorflow::gtl::ArraySlice shape) { + return input.Reshape(shape); + } + + // Creates a new literal by reordering the dimensions of the original literal. + // The given `permutation` must be a permutation of the dimension numbers + // in the original literal, and it specifies the order of the new dimensions + // in the result literal (i.e., new_order[i] = old_order[permutation[i]]). + // For example, a transpose call on a literal of shape [3 x 8 x 4] and + // `permutation` = {2, 0, 1} returns a new literal of shape [4 x 3 x 8]. + static std::unique_ptr Transpose( + const Literal& literal, tensorflow::gtl::ArraySlice permutation) { + return literal.Transpose(permutation); + }
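To make the Transpose contract concrete, a hypothetical sketch of the shape arithmetic described in the comment above (it assumes ShapeUtil::Equal and ShapeUtil::MakeShape from shape_util.h; the function name is illustrative):

```c++
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/shape_util.h"

// Sketch only: permutation {2, 0, 1} applied to f32[3x8x4] yields f32[4x3x8],
// since new_order[i] = old_order[permutation[i]].
void TransposeSketch() {
  auto cube = xla::LiteralUtil::CreateFromDimensions(xla::F32, {3, 8, 4});
  auto transposed = xla::LiteralUtil::Transpose(*cube, {2, 0, 1});
  CHECK(xla::ShapeUtil::Equal(transposed->shape(),
                              xla::ShapeUtil::MakeShape(xla::F32, {4, 3, 8})));
}
```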
+ + // Creates a sub-array from the given literal by extracting the indices + // [start_index, limit_index) of each dimension. The result literal has the + // same rank and layout as for the given literal. The number of indices in + // start_indices and limit_indices must be the rank of the literal, and the + // indices follow the order of the dimensions. + static std::unique_ptr Slice( + const Literal& literal, tensorflow::gtl::ArraySlice start_indices, + tensorflow::gtl::ArraySlice limit_indices) { + return literal.Slice(start_indices, limit_indices); + } + + // Creates a literal with a prepended dimension with bound "times"; e.g. a + // f32[3x2] with times=4 will produce a f32[4x3x2] with the 3x2 from the input + // literal replicated four times. + template + static std::unique_ptr Replicate(const Literal& input, int64 times) { + return input.Replicate(times); + } + + // Creates a literal by converting each element in an original literal to a + // new type. + template + static std::unique_ptr Convert(const Literal& literal) { + return literal.Convert(); + } + + // Converts a literal to another primitive type, but only if the literal + // type is convertible to the destination type. + static StatusOr> ConvertIfSrcTypeMatches( + const Literal& src_literal, PrimitiveType primitive_dest_type); + + // Creates a literal value zero of the given primitive type. + static Literal Zero(PrimitiveType primitive_type) { + return Literal::Zero(primitive_type); + } + + // Creates a literal value one of the given primitive type. + static Literal One(PrimitiveType primitive_type) { + return Literal::One(primitive_type); + } + + // Creates a literal value containing the minimum value of the given + // primitive type. For floating-point types, returns -inf. + static Literal MinValue(PrimitiveType primitive_type) { + return Literal::MinValue(primitive_type); + } + + // Creates a literal value containing the maximum value of the given + // primitive type. For floating-point types, returns inf. + static Literal MaxValue(PrimitiveType primitive_type) { + return Literal::MaxValue(primitive_type); + } + + // Creates a literal of the given shape where each element is `value`. + template + static std::unique_ptr CreateFullWithMonotonicDim0MajorLayout( + tensorflow::gtl::ArraySlice dimensions, NativeT value) { + return Literal::CreateFullWithMonotonicDim0MajorLayout(dimensions, value); + } + + // Creates a new literal from an array. The variants not ending with + // WithLayout use the default XLA layout for the literal's linear + // representation in memory. + template + static std::unique_ptr CreateR2FromArray2D( + const Array2D& values) { + return Literal::CreateR2FromArray2D(values); + } + + template + static std::unique_ptr CreateR2FromArray2DWithLayout( + const Array2D& values, const Layout& layout) { + return Literal::CreateR2FromArray2DWithLayout(values, layout); + } + + template + static std::unique_ptr CreateR3FromArray3D( + const Array3D& values) { + return Literal::CreateR3FromArray3D(values); + } + + template + static std::unique_ptr CreateR3FromArray3DWithLayout( + const Array3D& values, const Layout& layout) { + return Literal::CreateR3FromArray3DWithLayout(values, layout); + } + + template + static std::unique_ptr CreateR4FromArray4D( + const Array4D& values) { + return Literal::CreateR4FromArray4D(values); + } + + template + static std::unique_ptr CreateR4FromArray4DWithLayout( + const Array4D& values, const Layout& layout) { + return Literal::CreateR4FromArray4DWithLayout(values, layout); + } + + // Creates a new vector of U8s literal value from a string. + static std::unique_ptr CreateR1U8(tensorflow::StringPiece value) { + return Literal::CreateR1U8(value); + } + + // Creates a linspace-populated literal with the given number of rows and + // columns.
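A hypothetical sketch of Slice and Replicate from above; the linspace factory that the preceding comment introduces continues right after this aside. The stripped <NativeT> arguments are assumed:

```c++
#include "tensorflow/compiler/xla/literal_util.h"

// Sketch only; assumes the stripped <NativeT> template arguments.
void SliceReplicateSketch() {
  auto vec = xla::LiteralUtil::CreateR1<float>({3.0f, 4.0f, 5.0f, 6.0f});
  // Keeps the index range [1, 3) of the single dimension: f32[2] {4, 5}.
  auto middle = xla::LiteralUtil::Slice(*vec, {1}, {3});
  // Prepends a dimension of bound 2: f32[2x2], each row equal to {4, 5}.
  auto stacked = xla::LiteralUtil::Replicate<float>(*middle, 2);
}
```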
+ static std::unique_ptr CreateR2F32Linspace(float from, float to, + int64 rows, int64 cols) { + return Literal::CreateR2F32Linspace(from, to, rows, cols); + } + + // Creates a literal that projects the (x, y) dimensions given in values into + // the z dimension given by "projection". + template + static std::unique_ptr CreateR3Projected( + std::initializer_list> values, + int64 projection) { + return Literal::CreateR3Projected(values, projection); + } + + // Creates a literal that projects the (x, y) dimensions given in values into + // the z and p dimensions given. + template + static std::unique_ptr CreateR4Projected( + std::initializer_list> values, + int64 projection_p, int64 projection_z) { + return Literal::CreateR4Projected(values, projection_p, projection_z); + } + + // Clones literal into an owned unique_ptr version. + static std::unique_ptr CloneToUnique(const Literal& literal) { + return literal.CloneToUnique(); + } + + // Returns the linear index of the given index within the literal's + // element_type repeated field. + static int64 LinearIndex(const Literal& literal, + tensorflow::gtl::ArraySlice multi_index) { + return literal.LinearIndex(multi_index); + } + + // Gets or sets an element in the literal at the given index. The index is + // CHECKed against the dimension sizes. + template + static NativeT Get(const Literal& literal, + tensorflow::gtl::ArraySlice multi_index) { + return literal.Get(multi_index); + } + + template + static void Set(Literal* literal, + tensorflow::gtl::ArraySlice multi_index, + NativeT value) { + literal->Set(multi_index, value); + } + + // Retrieves the mutable array slice interface which can be used to manipulate + // pre-allocated literal values. + template + static tensorflow::gtl::MutableArraySlice GetMutableArraySlice( + Literal* literal) { + return literal->GetMutableArraySlice(); + } + + // Returns the element value at index (0, ..., 0), however many zeroes are + // required for that index. + template + static NativeT GetFirstElement(const Literal& literal) { + return literal.GetFirstElement(); + } + + // As Get(), but determines the correct type and converts the value + // into text. + static string GetAsString(const Literal& literal, + tensorflow::gtl::ArraySlice multi_index) { + return literal.GetAsString(multi_index); + } + + // Returns an identity matrix (rank 2) with the given row and column count. + template + static std::unique_ptr MakeIdentityR2(int64 size) { + return Literal::MakeIdentityR2(size); + } + + // Returns a tuple literal composed of given literals. + static std::unique_ptr MakeTuple( + tensorflow::gtl::ArraySlice elements) { + return Literal::MakeTuple(elements); + } + + // Validates that the data payload of the literal matches the literal shape; + // if it does not, an appropriate status is returned. + static tensorflow::Status ValidateLiteral(const Literal& literal) { + return literal.ValidateLiteral(); + } + + // Returns a string representation of the literal value. + static string ToString(const Literal& literal) { return literal.ToString(); } // Invokes the "per cell" callback for each element in the provided // literal with the element's indices and a string representation of @@ -257,15 +1002,19 @@ class LiteralUtil { // like representation in a protobuf). 
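Before the per-cell iteration hooks that the preceding comment introduces, a hypothetical sketch of the element accessors above (Get, Set, GetAsString, MakeIdentityR2); it assumes the stripped <NativeT> arguments and TensorFlow's CHECK/LOG macros:

```c++
#include "tensorflow/compiler/xla/literal_util.h"

// Sketch only; assumes the stripped <NativeT> template arguments.
void ElementAccessSketch() {
  auto m = xla::LiteralUtil::MakeIdentityR2<float>(2);  // f32[2,2] identity.
  CHECK_EQ(xla::LiteralUtil::Get<float>(*m, {0, 0}), 1.0f);
  xla::LiteralUtil::Set<float>(m.get(), {0, 1}, 7.0f);  // Indices are CHECKed.
  // GetAsString determines the element type and renders the cell as text.
  LOG(INFO) << xla::LiteralUtil::GetAsString(*m, {0, 1});
}
```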
static void EachCellAsString( const Literal& literal, - std::function indices, - const string& value)> - per_cell); + const std::function indices, + const string& value)>& per_cell) { + literal.EachCellAsString(per_cell); + } + template static void EachCell( const Literal& literal, std::function indices, NativeT value)> - per_cell); + per_cell) { + literal.EachCell(per_cell); + } // Templated methods which populate the given repeated field in the Literal // proto with the given value(s). The Shape field of the Literal proto is set @@ -279,70 +1028,125 @@ class LiteralUtil { // PopulateR2({{1, 2}, {3, 4}}, literal); // template - static void PopulateR0(NativeT values, Literal* literal); + static void PopulateR0(NativeT values, Literal* literal) { + literal->PopulateR0(values); + } + template static void PopulateR1(tensorflow::gtl::ArraySlice values, - Literal* literal); + Literal* literal) { + literal->PopulateR1(values); + } + static void PopulateR1(const tensorflow::core::Bitmap& values, - Literal* literal); + Literal* literal) { + literal->PopulateR1(values); + } + template static void PopulateR2( std::initializer_list> values, - Literal* literal); + Literal* literal) { + literal->PopulateR2(values); + } + template static void PopulateR2WithLayout( std::initializer_list> values, - const Layout& layout, Literal* literal); + const Layout& layout, Literal* literal) { + literal->PopulateR2WithLayout(values, layout); + } + template static void PopulateR2FromArray2D(const Array2D& values, - Literal* literal); + Literal* literal) { + literal->PopulateR2FromArray2D(values); + } + template static void PopulateR2FromArray2DWithLayout(const Array2D& values, const Layout& layout, - Literal* literal); + Literal* literal) { + literal->PopulateR2FromArray2DWithLayout(values, layout); + } + template static void PopulateR3FromArray3D(const Array3D& values, - Literal* literal); + Literal* literal) { + literal->PopulateR3FromArray3D(values); + } + template static void PopulateR3FromArray3DWithLayout(const Array3D& values, const Layout& layout, - Literal* literal); + Literal* literal) { + literal->PopulateR3FromArray3DWithLayout(values, layout); + } + template static void PopulateR4FromArray4D(const Array4D& values, - Literal* literal); + Literal* literal) { + literal->PopulateR4FromArray4D(values); + } + template static void PopulateR4FromArray4DWithLayout(const Array4D& values, const Layout& layout, - Literal* literal); + Literal* literal) { + literal->PopulateR4FromArray4DWithLayout(values, layout); + } + + // Populates literal values by calling the generator function for every cell + // in the literal object. + template + static Status Populate( + Literal* literal, + const std::function indexes)>& + generator) { + return literal->Populate(generator); + } // Creates a Literal of the given dimensions with all elements set to the // given value. template static void PopulateWithValue(NativeT value, tensorflow::gtl::ArraySlice dimensions, - Literal* literal); + Literal* literal) { + return literal->PopulateWithValue(value, dimensions); + } - // Returns a pointer to the underlying buffer in the protobuf containing the - // array data. Use with care. - static const void* InternalData(const Literal& literal); - static void* MutableInternalData(Literal* literal); + // Returns a pointer to the underlying vector containing the array data. Use + // with care. 
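The Populate() wrapper above drives a caller-supplied generator over every cell. A minimal sketch, modeled on the Populate test later in this change (names and the stripped template arguments are assumptions); the raw-data accessors that the preceding comment introduces follow:

```c++
#include "tensorflow/compiler/xla/literal_util.h"

// Sketch only, modeled on the Populate test below in this patch.
void PopulateSketch() {
  auto literal = xla::Literal::CreateFromShape(
      xla::ShapeUtil::MakeShape(xla::U32, {2, 3}));
  TF_CHECK_OK(xla::LiteralUtil::Populate<xla::uint32>(
      literal.get(),
      [](tensorflow::gtl::ArraySlice<xla::int64> indexes) -> xla::uint32 {
        // Any per-cell rule works; here the value encodes its own index.
        return static_cast<xla::uint32>(indexes[0] * 10 + indexes[1]);
      }));
}
```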
+ static const void* InternalData(const Literal& literal) { + return literal.InternalData(); + } - // Allocates space in the repeated_field of the literal sufficient to hold - // num_elements of the literal's primitive type. Values in the buffer are set + static void* MutableInternalData(Literal* literal) { + return literal->MutableInternalData(); + } + + // Allocates space in the underlying vector of the literal sufficient to hold + // num_elements of the literal's primitive type. Values in the vector are set // to zero. num_elements must equal the number of elements in the literal's // shape. - static void Reserve(int64 num_elements, Literal* literal); + static void Reserve(int64 num_elements, Literal* literal) { + literal->Reserve(num_elements); + } - // Allocates space in the repeated_field of the literal sufficient to hold + // Allocates space in the underlying vector of the literal sufficient to hold // num_elements of the literal's primitive type and sets each element in the // literal to the given value. num_elements must equal the number of elements // in the literal's shape. template - static void Resize(int64 num_elements, NativeT value, Literal* literal); + static void Resize(int64 num_elements, NativeT value, Literal* literal) { + literal->Resize(num_elements, value); + } // Returns true if the two given literals have the same shape and // values. Layout is not considered in the comparison. - static bool Equal(const Literal& literal1, const Literal& literal2); + static bool Equal(const Literal& literal1, const Literal& literal2) { + return literal1.Equal(literal2); + } // Returns whether every element in the given literal is equal to value. // @@ -353,7 +1157,9 @@ class LiteralUtil { // If value doesn't fit in literal's type, returns false. Values of 1/0 are // considered equal to true/false; other values are not considered equal to // true. - static bool IsAll(const Literal& literal, int8 value); + static bool IsAll(const Literal& literal, int8 value) { + return literal.IsAll(value); + } // Like IsAll(const Literal&, int8), except we check whether the literal is // equal to a particular floating-point number. @@ -364,137 +1170,149 @@ class LiteralUtil { // admonishments about floating-point equality checks apply. We expect you to // use this to check for values that can be expressed precisely as a float, // e.g. -0.5. - static bool IsAllFloat(const Literal& literal, float value); + static bool IsAllFloat(const Literal& literal, float value) { + return literal.IsAllFloat(value); + } // Returns whether the literal is zero at the specified index. The literal // must be an array. static bool IsZero(const Literal& literal, - tensorflow::gtl::ArraySlice indices); - - private: - // Returns an ArraySlice view of the array for the given literal for the - // given NativeT (e.g., float). These - // functions map native type to XLA PrimitiveType via template - // specialization. The unspecialized forms below aborts to handle the error - // case where the given native type does not map to an XLA primitive type.
- template - static tensorflow::gtl::ArraySlice GetArraySlice( - const Literal& literal) { - static_assert(!std::is_same::value, - "Cannot map native type to primitive type."); + tensorflow::gtl::ArraySlice indices) { + return literal.IsZero(indices); } - template - static tensorflow::protobuf::RepeatedField* GetMutableRepeatedField( - Literal* literal) { - // Make the expression depend on the template parameter NativeT so - // that this compile-time error only apperas if this function is - // instantiated with some concrete type that is not specialized - // below. - static_assert(!std::is_same::value, - "Cannot map native type to primitive type."); - } - - // Returns the linear index of the given index within the literal's - // element_type repeated field. - static int64 LinearIndex(const Literal& literal, - tensorflow::gtl::ArraySlice multi_index); TF_DISALLOW_COPY_AND_ASSIGN(LiteralUtil); }; // Declarations of template specializations for GetArraySlice and -// GetMutableRepeatedField. The specializations map native type to XLA primitive +// GetMutableArraySlice. The specializations map native type to XLA primitive // type. template <> -/* static */ tensorflow::gtl::ArraySlice LiteralUtil::GetArraySlice( - const Literal& literal); +tensorflow::gtl::ArraySlice Literal::GetArraySlice() const; template <> -/* static */ tensorflow::protobuf::RepeatedField* -LiteralUtil::GetMutableRepeatedField(Literal* literal); +tensorflow::gtl::ArraySlice Literal::GetArraySlice() const; template <> -/* static */ tensorflow::gtl::ArraySlice -LiteralUtil::GetArraySlice(const Literal& literal); +tensorflow::gtl::ArraySlice Literal::GetArraySlice() const; template <> -/* static */ tensorflow::protobuf::RepeatedField* -LiteralUtil::GetMutableRepeatedField(Literal* literal); +tensorflow::gtl::ArraySlice Literal::GetArraySlice() const; template <> -/* static */ tensorflow::gtl::ArraySlice -LiteralUtil::GetArraySlice(const Literal& literal); +tensorflow::gtl::ArraySlice Literal::GetArraySlice() const; template <> -/* static */ tensorflow::protobuf::RepeatedField* -LiteralUtil::GetMutableRepeatedField( - Literal* literal); +tensorflow::gtl::ArraySlice Literal::GetArraySlice() const; template <> -/* static */ tensorflow::gtl::ArraySlice -LiteralUtil::GetArraySlice(const Literal& literal); +tensorflow::gtl::ArraySlice Literal::GetArraySlice() const; template <> -/* static */ tensorflow::protobuf::RepeatedField* -LiteralUtil::GetMutableRepeatedField(Literal* literal); +inline tensorflow::gtl::ArraySlice Literal::GetArraySlice() + const { + DCHECK(shape().element_type() == F32); + return f32s(); +} template <> -/* static */ tensorflow::gtl::ArraySlice -LiteralUtil::GetArraySlice(const Literal& literal); +tensorflow::gtl::ArraySlice Literal::GetArraySlice() const; template <> -/* static */ tensorflow::protobuf::RepeatedField* -LiteralUtil::GetMutableRepeatedField( - Literal* literal); +tensorflow::gtl::ArraySlice Literal::GetArraySlice() const; template <> -/* static */ tensorflow::gtl::ArraySlice -LiteralUtil::GetArraySlice(const Literal& literal); +tensorflow::gtl::MutableArraySlice Literal::GetMutableArraySlice(); template <> -/* static */ tensorflow::protobuf::RepeatedField* -LiteralUtil::GetMutableRepeatedField(Literal* literal); +tensorflow::gtl::MutableArraySlice Literal::GetMutableArraySlice(); template <> -/* static */ tensorflow::gtl::ArraySlice -LiteralUtil::GetArraySlice(const Literal& literal); +tensorflow::gtl::MutableArraySlice Literal::GetMutableArraySlice(); template <> -/* static */ 
tensorflow::protobuf::RepeatedField* -LiteralUtil::GetMutableRepeatedField(Literal* literal); +tensorflow::gtl::MutableArraySlice Literal::GetMutableArraySlice(); + +template <> +tensorflow::gtl::MutableArraySlice Literal::GetMutableArraySlice(); + +template <> +tensorflow::gtl::MutableArraySlice Literal::GetMutableArraySlice(); + +template <> +tensorflow::gtl::MutableArraySlice Literal::GetMutableArraySlice(); + +template <> +tensorflow::gtl::MutableArraySlice Literal::GetMutableArraySlice(); + +template <> +tensorflow::gtl::MutableArraySlice Literal::GetMutableArraySlice(); + +template <> +tensorflow::gtl::MutableArraySlice Literal::GetMutableArraySlice(); + +template <> +void Literal::Resize(int64 num_elements, bool value); + +template <> +void Literal::Resize(int64 num_elements, int8 value); + +template <> +void Literal::Resize(int64 num_elements, uint8 value); + +template <> +void Literal::Resize(int64 num_elements, int32 value); + +template <> +void Literal::Resize(int64 num_elements, uint32 value); + +template <> +void Literal::Resize(int64 num_elements, int64 value); + +template <> +void Literal::Resize(int64 num_elements, uint64 value); + +template <> +void Literal::Resize(int64 num_elements, float value); + +template <> +void Literal::Resize(int64 num_elements, double value); + +template <> +void Literal::Resize(int64 num_elements, half value); template -/* static */ std::unique_ptr LiteralUtil::CreateR0(NativeT value) { +/* static */ std::unique_ptr Literal::CreateR0(NativeT value) { auto literal = MakeUnique(); - PopulateR0(value, literal.get()); + literal->PopulateR0(value); return literal; } template -/* static */ std::unique_ptr LiteralUtil::CreateR1( +/* static */ std::unique_ptr Literal::CreateR1( tensorflow::gtl::ArraySlice values) { auto literal = MakeUnique(); - PopulateR1(values, literal.get()); + literal->PopulateR1(values); return literal; } template -/* static */ std::unique_ptr LiteralUtil::CreateR2WithLayout( +/* static */ std::unique_ptr Literal::CreateR2WithLayout( std::initializer_list> values, const Layout& layout) { auto literal = MakeUnique(); - PopulateR2WithLayout(values, layout, literal.get()); + literal->PopulateR2WithLayout(values, layout); return literal; } template -/* static */ std::unique_ptr LiteralUtil::CreateR2( +/* static */ std::unique_ptr Literal::CreateR2( std::initializer_list> values) { return CreateR2WithLayout(values, LayoutUtil::GetDefaultLayoutForR2()); } template -/* static */ std::unique_ptr LiteralUtil::CreateR3WithLayout( +/* static */ std::unique_ptr Literal::CreateR3WithLayout( std::initializer_list>> values, const Layout& layout) { @@ -519,14 +1337,14 @@ template } template -/* static */ std::unique_ptr LiteralUtil::CreateR3( +/* static */ std::unique_ptr Literal::CreateR3( std::initializer_list>> values) { return CreateR3WithLayout(values, LayoutUtil::GetDefaultLayoutForR3()); } template -/* static */ std::unique_ptr LiteralUtil::CreateR4WithLayout( +/* static */ std::unique_ptr Literal::CreateR4WithLayout( std::initializer_list>>> values, @@ -557,7 +1375,7 @@ template } template -/* static */ std::unique_ptr LiteralUtil::CreateR4( +/* static */ std::unique_ptr Literal::CreateR4( std::initializer_list>>> values) { @@ -565,38 +1383,37 @@ template } template -/* static */ std::unique_ptr -LiteralUtil::CreateR2FromArray2DWithLayout(const Array2D& values, - const Layout& layout) { +/* static */ std::unique_ptr Literal::CreateR2FromArray2DWithLayout( + const Array2D& values, const Layout& layout) { auto literal = MakeUnique(); - 
PopulateR2FromArray2DWithLayout(values, layout, literal.get()); + literal->PopulateR2FromArray2DWithLayout(values, layout); return literal; } template -/* static */ std::unique_ptr LiteralUtil::CreateR2FromArray2D( +/* static */ std::unique_ptr Literal::CreateR2FromArray2D( const Array2D& values) { return CreateR2FromArray2DWithLayout(values, LayoutUtil::GetDefaultLayoutForR2()); } + template -/* static */ std::unique_ptr -LiteralUtil::CreateR3FromArray3DWithLayout(const Array3D& values, - const Layout& layout) { +/* static */ std::unique_ptr Literal::CreateR3FromArray3DWithLayout( + const Array3D& values, const Layout& layout) { auto literal = MakeUnique(); - PopulateR3FromArray3DWithLayout(values, layout, literal.get()); + literal->PopulateR3FromArray3DWithLayout(values, layout); return literal; } template -/* static */ std::unique_ptr LiteralUtil::CreateR3FromArray3D( +/* static */ std::unique_ptr Literal::CreateR3FromArray3D( const Array3D& values) { return CreateR3FromArray3DWithLayout(values, LayoutUtil::GetDefaultLayoutForR3()); } template -/* static */ std::unique_ptr LiteralUtil::CreateR3Projected( +/* static */ std::unique_ptr Literal::CreateR3Projected( std::initializer_list> values, int64 projection) { int64 dim0_size = projection; @@ -621,7 +1438,7 @@ template } template -/* static */ std::unique_ptr LiteralUtil::CreateR4Projected( +/* static */ std::unique_ptr Literal::CreateR4Projected( std::initializer_list> values, int64 projection_p, int64 projection_z) { int64 dim0_size = projection_p; @@ -649,91 +1466,92 @@ template } template -/* static */ std::unique_ptr LiteralUtil::CreateR4FromArray4D( +/* static */ std::unique_ptr Literal::CreateR4FromArray4D( const Array4D& values) { return CreateR4FromArray4DWithLayout(values, LayoutUtil::GetDefaultLayoutForR4()); } template -/* static */ std::unique_ptr -LiteralUtil::CreateR4FromArray4DWithLayout(const Array4D& values, - const Layout& layout) { +/* static */ std::unique_ptr Literal::CreateR4FromArray4DWithLayout( + const Array4D& values, const Layout& layout) { auto literal = MakeUnique(); - PopulateR4FromArray4DWithLayout(values, layout, literal.get()); + literal->PopulateR4FromArray4DWithLayout(values, layout); return literal; } template -/* static */ NativeT LiteralUtil::Get( - const Literal& literal, tensorflow::gtl::ArraySlice multi_index) { - int64 linear_index = LinearIndex(literal, multi_index); - return GetArraySlice(literal).at(linear_index); +NativeT Literal::Get(tensorflow::gtl::ArraySlice multi_index) const { + int64 linear_index = LinearIndex(multi_index); + return GetArraySlice().at(linear_index); } template -/* static */ NativeT LiteralUtil::GetFirstElement(const Literal& literal) { - return GetArraySlice(literal).at(0); +NativeT Literal::GetFirstElement() const { + return GetArraySlice().at(0); } template <> -/* static */ inline uint8 LiteralUtil::Get( - const Literal& literal, tensorflow::gtl::ArraySlice multi_index) { - CHECK(literal.shape().element_type() == U8); - int64 linear_index = LinearIndex(literal, multi_index); - return literal.u8s()[linear_index]; +inline uint8 Literal::Get( + tensorflow::gtl::ArraySlice multi_index) const { + CHECK(shape().element_type() == U8); + int64 linear_index = LinearIndex(multi_index); + return u8s()[linear_index]; } template <> -/* static */ inline int8 LiteralUtil::Get( - const Literal& literal, tensorflow::gtl::ArraySlice multi_index) { - CHECK(literal.shape().element_type() == S8); - int64 linear_index = LinearIndex(literal, multi_index); - return 
literal.u8s()[linear_index]; +inline int8 Literal::Get( + tensorflow::gtl::ArraySlice multi_index) const { + CHECK(shape().element_type() == S8); + int64 linear_index = LinearIndex(multi_index); + return u8s()[linear_index]; +} + +template <> +inline half Literal::Get( + tensorflow::gtl::ArraySlice multi_index) const { + CHECK(shape().element_type() == F16); + int64 linear_index = LinearIndex(multi_index); + return GetArraySlice()[linear_index]; } template -/* static */ void LiteralUtil::Set( - Literal* literal, tensorflow::gtl::ArraySlice multi_index, - NativeT value) { - int64 linear_index = LinearIndex(*literal, multi_index); - GetMutableRepeatedField(literal)->Set(linear_index, value); +void Literal::Set(tensorflow::gtl::ArraySlice multi_index, + NativeT value) { + int64 linear_index = LinearIndex(multi_index); + GetMutableArraySlice().at(linear_index) = value; } template <> -/* static */ inline void LiteralUtil::Set( - Literal* literal, tensorflow::gtl::ArraySlice multi_index, - uint8 value) { - int64 linear_index = LinearIndex(*literal, multi_index); - (*literal->mutable_u8s())[linear_index] = value; +inline void Literal::Set(tensorflow::gtl::ArraySlice multi_index, + uint8 value) { + int64 linear_index = LinearIndex(multi_index); + (*mutable_u8s())[linear_index] = value; } template <> -/* static */ inline void LiteralUtil::Set( - Literal* literal, tensorflow::gtl::ArraySlice multi_index, - int8 value) { - return Set(literal, multi_index, value); +inline void Literal::Set(tensorflow::gtl::ArraySlice multi_index, + int8 value) { + return Set(multi_index, value); } template <> -/* static */ inline void LiteralUtil::Set( - Literal* literal, tensorflow::gtl::ArraySlice multi_index, - int64 value) { - int64 linear_index = LinearIndex(*literal, multi_index); - (*literal->mutable_s64s())[linear_index] = value; +inline void Literal::Set(tensorflow::gtl::ArraySlice multi_index, + int64 value) { + int64 linear_index = LinearIndex(multi_index); + (*mutable_s64s())[linear_index] = value; } template <> -/* static */ inline void LiteralUtil::Set( - Literal* literal, tensorflow::gtl::ArraySlice multi_index, - uint64 value) { - int64 linear_index = LinearIndex(*literal, multi_index); - (*literal->mutable_u64s())[linear_index] = value; +/* static */ inline void Literal::Set( + tensorflow::gtl::ArraySlice multi_index, uint64 value) { + int64 linear_index = LinearIndex(multi_index); + (*mutable_u64s())[linear_index] = value; } // Returns an identity matrix (rank 2) with the given row and column count. 
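A small sketch of the F16 path added above, reading an element through the new Get<half> specialization; it assumes xla::half (the Eigen::half alias from xla/types.h) and the stripped template arguments. The identity-matrix helper that the preceding comment introduces follows:

```c++
#include "tensorflow/compiler/xla/literal_util.h"

// Sketch only; exercises the new F16 Get<half> specialization above.
void HalfAccessSketch() {
  xla::half h1(1.0f);
  xla::half h2(2.0f);
  auto m = xla::Literal::CreateR2<xla::half>({{h1, h2}, {h2, h1}});
  CHECK(m->Get<xla::half>({0, 1}) == h2);  // Reads via GetArraySlice<half>().
}
```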
template -/* static */ std::unique_ptr LiteralUtil::MakeIdentityR2(int64 size) { +/* static */ std::unique_ptr Literal::MakeIdentityR2(int64 size) { Array2D array(size, size, 0); for (int64 i = 0; i < size; ++i) { array(i, i) = 1; @@ -742,88 +1560,51 @@ template } template -/* static */ void LiteralUtil::EachCell( - const Literal& literal, +void Literal::EachCell( std::function indices, NativeT value)> - per_cell) { - if (ShapeUtil::HasZeroElements(literal.shape())) { + per_cell) const { + if (ShapeUtil::HasZeroElements(shape())) { return; } - std::vector indices(ShapeUtil::Rank(literal.shape()), 0); + std::vector indices(ShapeUtil::Rank(shape()), 0); do { - per_cell(indices, Get(literal, indices)); - } while (IndexUtil::BumpIndices(literal.shape(), &indices)); + per_cell(indices, Get(indices)); + } while (IndexUtil::BumpIndices(shape(), &indices)); } template -/* static */ void LiteralUtil::PopulateR0(NativeT value, Literal* literal) { - *literal->mutable_shape() = ShapeUtil::MakeShape( +inline void Literal::PopulateR0(NativeT value) { + *mutable_shape() = ShapeUtil::MakeShape( primitive_util::NativeToPrimitiveType(), {}); - tensorflow::protobuf::RepeatedField* repeated_field = - GetMutableRepeatedField(literal); - repeated_field->Add(value); -} - -template <> -/* static */ inline void LiteralUtil::PopulateR0(uint8 value, - Literal* literal) { - *literal->mutable_shape() = - ShapeUtil::MakeShape(primitive_util::NativeToPrimitiveType(), {}); - literal->mutable_u8s()->push_back(value); -} - -template <> -/* static */ inline void LiteralUtil::PopulateR0(int8 value, - Literal* literal) { - *literal->mutable_shape() = - ShapeUtil::MakeShape(primitive_util::NativeToPrimitiveType(), {}); - literal->mutable_u8s()->push_back(value); -} - -template <> -/* static */ inline void LiteralUtil::PopulateR0(uint64 value, - Literal* literal) { - *literal->mutable_shape() = - ShapeUtil::MakeShape(primitive_util::NativeToPrimitiveType(), {}); - literal->mutable_u64s()->Add(value); -} - -template <> -/* static */ inline void LiteralUtil::PopulateR0(int64 value, - Literal* literal) { - *literal->mutable_shape() = - ShapeUtil::MakeShape(primitive_util::NativeToPrimitiveType(), {}); - literal->mutable_s64s()->Add(value); + Resize(1, value); } template -/* static */ void LiteralUtil::PopulateR1( - tensorflow::gtl::ArraySlice values, Literal* literal) { - *literal->mutable_shape() = +inline void Literal::PopulateR1(tensorflow::gtl::ArraySlice values) { + *mutable_shape() = ShapeUtil::MakeShape(primitive_util::NativeToPrimitiveType(), {static_cast(values.size())}); - Reserve(values.size(), literal); + Reserve(values.size()); for (int64 i = 0; i < values.size(); ++i) { - Set(literal, {i}, values[i]); + Set({i}, values[i]); } } -/* static */ inline void LiteralUtil::PopulateR1( - const tensorflow::core::Bitmap& values, Literal* literal) { - *literal->mutable_shape() = +inline void Literal::PopulateR1(const tensorflow::core::Bitmap& values) { + *mutable_shape() = ShapeUtil::MakeShape(PRED, {static_cast(values.bits())}); - Reserve(values.bits(), literal); - for (int64 i = 0; i < values.bits(); ++i) { - Set(literal, {i}, values.get(i)); + Reserve(values.bits()); + for (int64 i = 0; i < static_cast(values.bits()); ++i) { + Set({i}, values.get(i)); } } template -/* static */ void LiteralUtil::PopulateR2WithLayout( +void Literal::PopulateR2WithLayout( std::initializer_list> values, - const Layout& layout, Literal* literal) { - *literal->mutable_shape() = ShapeUtil::MakeShapeWithLayout( + const Layout& layout) { + 
*mutable_shape() = ShapeUtil::MakeShapeWithLayout( primitive_util::NativeToPrimitiveType(), {static_cast(values.size()), static_cast(values.begin()->size())}, @@ -831,17 +1612,17 @@ template const int64 dim0_size = values.size(); const int64 dim1_size = values.begin()->size(); - CHECK_EQ(dim0_size, literal->shape().dimensions(0)); - CHECK_EQ(dim1_size, literal->shape().dimensions(1)); + CHECK_EQ(dim0_size, shape().dimensions(0)); + CHECK_EQ(dim1_size, shape().dimensions(1)); const int64 num_elements = dim1_size * dim0_size; - Reserve(num_elements, literal); + Reserve(num_elements); int64 dim0 = 0; for (auto inner_list : values) { int64 dim1 = 0; for (auto value : inner_list) { - Set(literal, {dim0, dim1}, value); + Set({dim0, dim1}, value); ++dim1; } CHECK_EQ(dim1_size, dim1); @@ -850,84 +1631,79 @@ template } template -/* static */ void LiteralUtil::PopulateR2( - std::initializer_list> values, - Literal* literal) { - PopulateR2WithLayout(values, LayoutUtil::GetDefaultLayoutForR2(), literal); +void Literal::PopulateR2( + std::initializer_list> values) { + PopulateR2WithLayout(values, LayoutUtil::GetDefaultLayoutForR2()); } template -/* static */ void LiteralUtil::PopulateR2FromArray2DWithLayout( - const Array2D& values, const Layout& layout, Literal* literal) { - *literal->mutable_shape() = ShapeUtil::MakeShapeWithLayout( +void Literal::PopulateR2FromArray2DWithLayout(const Array2D& values, + const Layout& layout) { + *mutable_shape() = ShapeUtil::MakeShapeWithLayout( primitive_util::NativeToPrimitiveType(), {values.height(), values.width()}, AsInt64Slice(layout.minor_to_major())); const int64 dim1_size = values.width(); const int64 dim0_size = values.height(); - CHECK_EQ(dim0_size, literal->shape().dimensions(0)); - CHECK_EQ(dim1_size, literal->shape().dimensions(1)); - Reserve(dim1_size * dim0_size, literal); + CHECK_EQ(dim0_size, shape().dimensions(0)); + CHECK_EQ(dim1_size, shape().dimensions(1)); + Reserve(dim1_size * dim0_size); for (int64 dim0 = 0; dim0 < dim0_size; ++dim0) { for (int64 dim1 = 0; dim1 < dim1_size; ++dim1) { - Set(literal, {dim0, dim1}, values(dim0, dim1)); + Set({dim0, dim1}, values(dim0, dim1)); } } } template -/* static */ void LiteralUtil::PopulateR2FromArray2D( - const Array2D& values, Literal* literal) { - PopulateR2FromArray2DWithLayout(values, LayoutUtil::GetDefaultLayoutForR2(), - literal); +void Literal::PopulateR2FromArray2D(const Array2D& values) { + PopulateR2FromArray2DWithLayout(values, LayoutUtil::GetDefaultLayoutForR2()); } + template -/* static */ void LiteralUtil::PopulateR3FromArray3DWithLayout( - const Array3D& values, const Layout& layout, Literal* literal) { - *literal->mutable_shape() = ShapeUtil::MakeShapeWithLayout( +void Literal::PopulateR3FromArray3DWithLayout(const Array3D& values, + const Layout& layout) { + *mutable_shape() = ShapeUtil::MakeShapeWithLayout( primitive_util::NativeToPrimitiveType(), {values.n1(), values.n2(), values.n3()}, AsInt64Slice(layout.minor_to_major())); - CHECK_EQ(values.n1(), literal->shape().dimensions(0)); - CHECK_EQ(values.n2(), literal->shape().dimensions(1)); - CHECK_EQ(values.n3(), literal->shape().dimensions(2)); - Reserve(values.n1() * values.n2() * values.n3(), literal); + CHECK_EQ(values.n1(), shape().dimensions(0)); + CHECK_EQ(values.n2(), shape().dimensions(1)); + CHECK_EQ(values.n3(), shape().dimensions(2)); + Reserve(values.n1() * values.n2() * values.n3()); for (int64 dim0 = 0; dim0 < values.n1(); ++dim0) { for (int64 dim1 = 0; dim1 < values.n2(); ++dim1) { for (int64 dim2 = 0; dim2 < values.n3(); 
++dim2) { - Set(literal, {dim0, dim1, dim2}, values(dim0, dim1, dim2)); + Set({dim0, dim1, dim2}, values(dim0, dim1, dim2)); } } } } template -/* static */ void LiteralUtil::PopulateR3FromArray3D( - const Array3D& values, Literal* literal) { - PopulateR3FromArray3DWithLayout(values, LayoutUtil::GetDefaultLayoutForR3(), - literal); +void Literal::PopulateR3FromArray3D(const Array3D& values) { + PopulateR3FromArray3DWithLayout(values, LayoutUtil::GetDefaultLayoutForR3()); } template -/* static */ void LiteralUtil::PopulateR4FromArray4DWithLayout( - const Array4D& values, const Layout& layout, Literal* literal) { - *literal->mutable_shape() = ShapeUtil::MakeShapeWithLayout( +void Literal::PopulateR4FromArray4DWithLayout(const Array4D& values, + const Layout& layout) { + *mutable_shape() = ShapeUtil::MakeShapeWithLayout( primitive_util::NativeToPrimitiveType(), {values.planes(), values.depth(), values.height(), values.width()}, AsInt64Slice(layout.minor_to_major())); - CHECK_EQ(values.n1(), literal->shape().dimensions(0)); - CHECK_EQ(values.n2(), literal->shape().dimensions(1)); - CHECK_EQ(values.n3(), literal->shape().dimensions(2)); - CHECK_EQ(values.n4(), literal->shape().dimensions(3)); - Reserve(values.n1() * values.n2() * values.n3() * values.n4(), literal); + CHECK_EQ(values.n1(), shape().dimensions(0)); + CHECK_EQ(values.n2(), shape().dimensions(1)); + CHECK_EQ(values.n3(), shape().dimensions(2)); + CHECK_EQ(values.n4(), shape().dimensions(3)); + Reserve(values.n1() * values.n2() * values.n3() * values.n4()); for (int64 dim0 = 0; dim0 < values.n1(); ++dim0) { for (int64 dim1 = 0; dim1 < values.n2(); ++dim1) { for (int64 dim2 = 0; dim2 < values.n3(); ++dim2) { for (int64 dim3 = 0; dim3 < values.n4(); ++dim3) { - Set(literal, {dim0, dim1, dim2, dim3}, - values(dim0, dim1, dim2, dim3)); + Set({dim0, dim1, dim2, dim3}, values(dim0, dim1, dim2, dim3)); } } } @@ -935,106 +1711,124 @@ template } template -/* static */ void LiteralUtil::PopulateR4FromArray4D( - const Array4D& values, Literal* literal) { - PopulateR4FromArray4DWithLayout(values, LayoutUtil::GetDefaultLayoutForR4(), - literal); +void Literal::PopulateR4FromArray4D(const Array4D& values) { + PopulateR4FromArray4DWithLayout(values, LayoutUtil::GetDefaultLayoutForR4()); } template -/* static */ void LiteralUtil::PopulateWithValue( - NativeT value, tensorflow::gtl::ArraySlice dimensions, - Literal* literal) { - *literal->mutable_shape() = ShapeUtil::MakeShape( - primitive_util::NativeToPrimitiveType(), dimensions); - tensorflow::protobuf::RepeatedField* repeated_field = - GetMutableRepeatedField(literal); - for (int64 i = 0; i < ShapeUtil::ElementsIn(literal->shape()); ++i) { - repeated_field->Add(value); +Status Literal::Populate( + const std::function indexes)>& + generator) { + const Shape& this_shape = shape(); + int64 rank = ShapeUtil::Rank(this_shape); + TF_RET_CHECK(this_shape.element_type() == + primitive_util::NativeToPrimitiveType()); + tensorflow::gtl::MutableArraySlice data = + GetMutableArraySlice(); + if (rank > 0) { + StrideConfig stride_config(this_shape, this_shape, + AsInt64Slice(this_shape.dimensions())); + DimensionVector minor_scan_indexes(rank, 0); + int64 minor_dimension_size = + ShapeUtil::GetDimension(this_shape, stride_config.minor_dimension); + + auto init_function = [&](const std::vector& indexes) { + int64 index = LinearIndex(indexes); + std::copy(indexes.begin(), indexes.end(), minor_scan_indexes.begin()); + for (int64 i = 0; i < minor_dimension_size; ++i) { + 
minor_scan_indexes[stride_config.minor_dimension] = i; + data.at(index + i) = generator(minor_scan_indexes); + } + return true; + }; + ShapeUtil::ForEachIndex(this_shape, stride_config.base, + stride_config.dimensions, stride_config.step, + init_function); + } else { + // For scalars. + data.at(0) = generator({}); } + return Status::OK(); } -template <> -/* static */ void LiteralUtil::PopulateWithValue( - int64 value, tensorflow::gtl::ArraySlice dimensions, - Literal* literal); - -template <> -/* static */ void LiteralUtil::PopulateWithValue( - uint64 value, tensorflow::gtl::ArraySlice dimensions, - Literal* literal); +template +void Literal::PopulateWithValue(NativeT value, + tensorflow::gtl::ArraySlice dimensions) { + *mutable_shape() = ShapeUtil::MakeShape( + primitive_util::NativeToPrimitiveType(), dimensions); + Resize(ShapeUtil::ElementsIn(shape()), value); +} template -/* static */ std::unique_ptr LiteralUtil::Convert( - const Literal& literal) { +std::unique_ptr Literal::Convert() const { + const Shape& this_shape = shape(); auto result_literal = MakeUnique(); - Shape result_shape = literal.shape(); - result_shape.set_element_type( + Shape* result_shape = result_literal->mutable_shape(); + *result_shape = this_shape; + result_shape->set_element_type( primitive_util::NativeToPrimitiveType()); - *result_literal->mutable_shape() = result_shape; - LiteralUtil::Reserve(ShapeUtil::ElementsIn(result_shape), - result_literal.get()); - LiteralUtil::EachCell( - literal, - [&](tensorflow::gtl::ArraySlice indices, NativeSrcT value) { - LiteralUtil::Set(result_literal.get(), indices, - static_cast(value)); - }); + result_literal->Reserve(ShapeUtil::ElementsIn(*result_shape)); + tensorflow::gtl::ArraySlice src_data = + GetArraySlice(); + tensorflow::gtl::MutableArraySlice dest_data = + result_literal->GetMutableArraySlice(); + int64 num_elements = ShapeUtil::ElementsIn(this_shape); + + for (int64 i = 0; i < num_elements; ++i) { + dest_data[i] = static_cast(src_data[i]); + } return result_literal; } -template -/* static */ void LiteralUtil::Resize(int64 num_elements, NativeT value, - Literal* literal) { - CHECK_EQ(ShapeUtil::ElementsIn(literal->shape()), num_elements); - tensorflow::protobuf::RepeatedField* repeated_field = - GetMutableRepeatedField(literal); - repeated_field->Resize(num_elements, value); -} - -template <> -/* static */ void LiteralUtil::Resize(int64 num_elements, int64 value, - Literal* literal); - -template <> -/* static */ void LiteralUtil::Resize(int64 num_elements, uint64 value, - Literal* literal); - template /* static */ std::unique_ptr -LiteralUtil::CreateFullWithMonotonicDim0MajorLayout( +Literal::CreateFullWithMonotonicDim0MajorLayout( tensorflow::gtl::ArraySlice dimensions, NativeT value) { - Shape shape = ShapeUtil::MakeShapeWithMonotonicDim0MajorLayout( + Shape this_shape = ShapeUtil::MakeShapeWithMonotonicDim0MajorLayout( primitive_util::NativeToPrimitiveType(), dimensions); auto literal = MakeUnique(); - *literal->mutable_shape() = shape; - Reserve(ShapeUtil::ElementsIn(shape), literal.get()); + *literal->mutable_shape() = this_shape; + literal->Reserve(ShapeUtil::ElementsIn(this_shape)); std::vector index(dimensions.size(), 0); do { - Set(literal.get(), index, value); - } while (IndexUtil::BumpIndices(shape, &index)); + literal->Set(index, value); + } while (IndexUtil::BumpIndices(this_shape, &index)); return literal; } template -/* static */ std::unique_ptr LiteralUtil::Replicate( - const Literal& input, int64 times) { - std::vector bounds = {times}; - 
bounds.insert(bounds.end(), input.shape().dimensions().begin(), - input.shape().dimensions().end()); +std::unique_ptr Literal::Replicate(int64 times) const { + DimensionVector bounds = {times}; + bounds.reserve(shape().dimensions_size() + 1); + for (int64 bound : shape().dimensions()) { + bounds.push_back(bound); + } auto literal = MakeUnique(); *literal->mutable_shape() = - ShapeUtil::MakeShape(input.shape().element_type(), bounds); - Reserve(ShapeUtil::ElementsIn(literal->shape()), literal.get()); - for (int64 index = 0; index < ShapeUtil::ElementsIn(input.shape()); ++index) { - const std::vector element_indices = - IndexUtil::LinearIndexToMultidimensionalIndex(input.shape(), index); - const auto element = Get(input, element_indices); - for (int64 sample = 0; sample < times; ++sample) { - std::vector output_indices = {sample}; - output_indices.insert(output_indices.end(), element_indices.begin(), - element_indices.end()); - Set(literal.get(), output_indices, element); + ShapeUtil::MakeShape(shape().element_type(), bounds); + int64 elements = ShapeUtil::ElementsIn(literal->shape()); + if (elements == 0) { + return literal; + } + literal->Reserve(elements); + + DimensionVector output_indices(bounds.size(), 0); + tensorflow::gtl::ArraySlice input_indices = output_indices; + input_indices.remove_prefix(1); + + bool done = false; + while (!done) { + const auto element = Get(input_indices); + literal->Set(output_indices, element); + + done = true; + for (int n = 0; n < output_indices.size(); ++n) { + ++output_indices[n]; + if (output_indices[n] < bounds[n]) { + done = false; + break; + } + output_indices[n] = 0; } } return literal; diff --git a/tensorflow/compiler/xla/literal_util_test.cc b/tensorflow/compiler/xla/literal_util_test.cc index e4adb5df56a..8d4a75d7aff 100644 --- a/tensorflow/compiler/xla/literal_util_test.cc +++ b/tensorflow/compiler/xla/literal_util_test.cc @@ -21,14 +21,17 @@ limitations under the License. 
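A hypothetical sketch of the observable effect of the Replicate() implementation above, which walks the output indices odometer-style; the names and stripped template arguments are assumptions, and the test changes below exercise the R2 case:

```c++
#include "tensorflow/compiler/xla/literal_util.h"

// Sketch only: Replicate prepends a dimension and repeats the input.
void ReplicateSketch() {
  auto input = xla::LiteralUtil::CreateR1<xla::uint32>({1, 2, 3});
  auto output = xla::LiteralUtil::Replicate<xla::uint32>(*input, 2);
  auto expected =
      xla::LiteralUtil::CreateR2<xla::uint32>({{1, 2, 3}, {1, 2, 3}});
  CHECK(xla::LiteralUtil::Equal(*output, *expected));
}
```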
#include "tensorflow/compiler/xla/array4d.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/shape_util.h" -#include "tensorflow/compiler/xla/test_helpers.h" +#include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" namespace xla { namespace { +using ::testing::ElementsAre; + class LiteralUtilTest : public ::testing::Test { protected: LiteralUtilTest() { @@ -101,6 +104,9 @@ TEST_F(LiteralUtilTest, LiteralScalarToString) { auto f32_lit = LiteralUtil::CreateR0(3.14f); ASSERT_EQ("3.14", LiteralUtil::ToString(*f32_lit)); + + auto f16_lit = LiteralUtil::CreateR0(static_cast(0.5f)); + ASSERT_EQ("0.5", LiteralUtil::ToString(*f16_lit)); } TEST_F(LiteralUtilTest, LiteralVectorToString) { @@ -159,9 +165,7 @@ TEST_F(LiteralUtilTest, CreateR3FromArray3d) { // clang-format on auto literal = LiteralUtil::CreateR3FromArray3D(array_3d); - EXPECT_MATCH(testing::PBToVec( - literal->shape().dimensions()), - testing::VectorMatcher({2, 3, 2})); + EXPECT_THAT(literal->shape().dimensions(), ElementsAre(2, 3, 2)); string result = LiteralUtil::ToString(*literal); const string expected = R"(f32[2,3,2] { { { 1, 2 }, @@ -182,9 +186,7 @@ TEST_F(LiteralUtilTest, LiteralR4F32ProjectedStringifies) { {2001, 2002}, }, /*projection_p=*/1, /*projection_z=*/2); // clang-format on - EXPECT_MATCH( - testing::PBToVec(literal->shape().dimensions()), - testing::VectorMatcher({1, 2, 3, 2})); + EXPECT_THAT(literal->shape().dimensions(), ElementsAre(1, 2, 3, 2)); string result = LiteralUtil::ToString(*literal); const string expected = R"(f32[1,2,3,2] { { // i0=0 @@ -204,10 +206,8 @@ TEST_F(LiteralUtilTest, LiteralR4F32ProjectedStringifies) { } TEST_F(LiteralUtilTest, LiteralR4F32Stringifies) { - EXPECT_MATCH( - testing::PBToVec( - literal_r4_2x2x3x3_dim0major_->shape().dimensions()), - testing::VectorMatcher({2, 2, 3, 3})); + EXPECT_THAT(literal_r4_2x2x3x3_dim0major_->shape().dimensions(), + ElementsAre(2, 2, 3, 3)); string result = LiteralUtil::ToString(*literal_r4_2x2x3x3_dim0major_); const string expected = R"(f32[2,2,3,3] { { // i0=0 @@ -375,6 +375,15 @@ TEST_F(LiteralUtilTest, IsAll) { EXPECT_FALSE( LiteralUtil::IsAll(*LiteralUtil::CreateR2({{9, 8}, {8, 8}}), 8)); + half h8(8.0f); + half h9(9.0f); + EXPECT_TRUE( + LiteralUtil::IsAll(*LiteralUtil::CreateR2({{h8}, {h8}}), 8)); + EXPECT_FALSE( + LiteralUtil::IsAll(*LiteralUtil::CreateR2({{h8}, {h9}}), 8)); + EXPECT_FALSE( + LiteralUtil::IsAll(*LiteralUtil::CreateR2({{h9}, {h8}}), 8)); + auto uint64_max = std::numeric_limits::max(); EXPECT_FALSE(LiteralUtil::IsAll( *LiteralUtil::CreateR2( @@ -471,6 +480,26 @@ TEST_F(LiteralUtilTest, ReshapeR4) { EXPECT_TRUE(LiteralUtil::Equal(*expected, *reshape)); } +TEST_F(LiteralUtilTest, ReshapeR4Dim0Minor) { + // clang-format off + // F32[1x3x2x4] + auto original = LiteralUtil::CreateR4WithLayout({{ + {{10, 11, 12, 13}, {14, 15, 16, 17}}, + {{18, 19, 20, 21}, {22, 23, 24, 25}}, + {{26, 27, 28, 29}, {30, 31, 32, 33}}, + }}, layout_r4_dim0minor_); + // F32[1x3x4x2] + auto expected = LiteralUtil::CreateR3WithLayout({ + {{10, 11}, {12, 13}, {14, 15}, {16, 17}}, + {{18, 19}, {20, 21}, {22, 23}, {24, 25}}, + {{26, 27}, {28, 29}, {30, 31}, {32, 33}}, + }, layout_r3_dim0major_); + // clang-format on + auto reshape = LiteralUtil::Reshape(*original, {3, 4, 2}).ConsumeValueOrDie(); + + 
EXPECT_TRUE(LiteralUtil::Equal(*expected, *reshape)); +} + TEST_F(LiteralUtilTest, TransposeR0) { auto original = LiteralUtil::CreateR0(1.7f); auto reshape = LiteralUtil::Transpose(*original, /*permutation=*/{}); @@ -516,27 +545,23 @@ TEST_F(LiteralUtilTest, TestR2LinearLayout) { auto mat_dim0minor = LiteralUtil::CreateR2WithLayout( {{1, 2, 3}, {4, 5, 6}}, layout_r2_dim0minor_); EXPECT_EQ(mat_dim0minor->s32s_size(), 6); - EXPECT_MATCH(testing::PBToVec(mat_dim0minor->s32s()), - testing::VectorMatcher({1, 4, 2, 5, 3, 6})); + EXPECT_THAT(mat_dim0minor->s32s(), ElementsAre(1, 4, 2, 5, 3, 6)); // Test expected memory layout when using Relayout to row major. auto relaid_mat_to_dim0major = LiteralUtil::Relayout(*mat_dim0minor, layout_r2_dim0major_); - EXPECT_MATCH(testing::PBToVec(relaid_mat_to_dim0major->s32s()), - testing::VectorMatcher({1, 2, 3, 4, 5, 6})); + EXPECT_THAT(relaid_mat_to_dim0major->s32s(), ElementsAre(1, 2, 3, 4, 5, 6)); // Test expected memory layout of R2 created with dim0-major (row-major). auto mat_dim0major = LiteralUtil::CreateR2WithLayout( {{1, 2, 3}, {4, 5, 6}}, layout_r2_dim0major_); EXPECT_EQ(mat_dim0major->s32s_size(), 6); - EXPECT_MATCH(testing::PBToVec(mat_dim0major->s32s()), - testing::VectorMatcher({1, 2, 3, 4, 5, 6})); + EXPECT_THAT(mat_dim0major->s32s(), ElementsAre(1, 2, 3, 4, 5, 6)); // Test expected memory layout when using Relayout to column major. auto relaid_mat_to_dim0minor = LiteralUtil::Relayout(*mat_dim0major, layout_r2_dim0minor_); - EXPECT_MATCH(testing::PBToVec(relaid_mat_to_dim0minor->s32s()), - testing::VectorMatcher({1, 4, 2, 5, 3, 6})); + EXPECT_THAT(relaid_mat_to_dim0minor->s32s(), ElementsAre(1, 4, 2, 5, 3, 6)); } TEST_F(LiteralUtilTest, TestR3LinearLayout) { @@ -558,28 +583,28 @@ TEST_F(LiteralUtilTest, TestR3LinearLayout) { EXPECT_EQ(lit_dim0minor->s32s_size(), 12); std::vector expected_dim0minor{1, 7, 4, 10, 2, 8, 5, 11, 3, 9, 6, 12}; - EXPECT_MATCH(testing::PBToVec(lit_dim0minor->s32s()), - testing::VectorMatcher(expected_dim0minor)); + EXPECT_THAT(lit_dim0minor->s32s(), + testing::ElementsAreArray(expected_dim0minor)); // Test expected memory layout when using Relayout to row major. auto relaid_lit_to_dim0major = LiteralUtil::Relayout(*lit_dim0minor, layout_r3_dim0major_); std::vector expected_dim0major{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; - EXPECT_MATCH(testing::PBToVec(relaid_lit_to_dim0major->s32s()), - testing::VectorMatcher(expected_dim0major)); + EXPECT_THAT(relaid_lit_to_dim0major->s32s(), + testing::ElementsAreArray(expected_dim0major)); // Test expected memory layout of R3 created with dim0-major (row-major). auto lit_dim0major = LiteralUtil::CreateR3FromArray3DWithLayout( arr3d, layout_r3_dim0major_); EXPECT_EQ(lit_dim0major->s32s_size(), 12); - EXPECT_MATCH(testing::PBToVec(lit_dim0major->s32s()), - testing::VectorMatcher(expected_dim0major)); + EXPECT_THAT(lit_dim0major->s32s(), + testing::ElementsAreArray(expected_dim0major)); // Test expected memory layout when using Relayout to column major. 
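For reference while reading these linear-layout tests, a hypothetical sketch of how minor-to-major order fixes the element order reported by s32s(); it assumes LayoutUtil::MakeLayout and the stripped template arguments:

```c++
#include "tensorflow/compiler/xla/layout_util.h"
#include "tensorflow/compiler/xla/literal_util.h"

// Sketch only: {{1, 2, 3}, {4, 5, 6}} stored dim0-minor (minor_to_major
// {0, 1}) linearizes column by column as 1, 4, 2, 5, 3, 6.
void LayoutSketch() {
  auto mat = xla::LiteralUtil::CreateR2WithLayout<xla::int32>(
      {{1, 2, 3}, {4, 5, 6}}, xla::LayoutUtil::MakeLayout({0, 1}));
  CHECK_EQ(mat->s32s()[0], 1);
  CHECK_EQ(mat->s32s()[1], 4);
}
```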
auto relaid_lit_to_dim0minor = LiteralUtil::Relayout(*lit_dim0major, layout_r3_dim0minor_); - EXPECT_MATCH(testing::PBToVec(relaid_lit_to_dim0minor->s32s()), - testing::VectorMatcher(expected_dim0minor)); + EXPECT_THAT(relaid_lit_to_dim0minor->s32s(), + testing::ElementsAreArray(expected_dim0minor)); } TEST_F(LiteralUtilTest, SliceR0S32) { @@ -645,5 +670,358 @@ TEST_F(LiteralUtilTest, PopulateWithValueR2U64) { EXPECT_TRUE(LiteralUtil::Equal(output, *expected)); } +TEST_F(LiteralUtilTest, PopulateWithValueR0F16) { + Literal output; + half h(0.25f); + LiteralUtil::PopulateWithValue(h, {}, &output); + auto expected = LiteralUtil::CreateR0(h); + EXPECT_TRUE(LiteralUtil::Equal(output, *expected)); +} + +TEST_F(LiteralUtilTest, PopulateWithValueR1F16) { + Literal output; + half h(0.5f); + LiteralUtil::PopulateWithValue(h, {3}, &output); + auto expected = LiteralUtil::CreateR1({h, h, h}); + EXPECT_TRUE(LiteralUtil::Equal(output, *expected)); +} + +TEST_F(LiteralUtilTest, PopulateWithValueR2F16) { + Literal output; + half h(2.0f); + LiteralUtil::PopulateWithValue(h, {2, 2}, &output); + auto expected = LiteralUtil::CreateR2({{h, h}, {h, h}}); + EXPECT_TRUE(LiteralUtil::Equal(output, *expected)); +} + +TEST_F(LiteralUtilTest, ReplicateR2U32) { + auto input = LiteralUtil::CreateR2( + {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}}); + auto output = LiteralUtil::Replicate(*input, 3); + auto expected = LiteralUtil::CreateR3( + {{{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}}, + {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}}, + {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}}}); + EXPECT_TRUE(LiteralUtil::Equal(*output, *expected)); +} + +TEST_F(LiteralUtilTest, Copy) { + const int64 dimensions[] = {17, 15, 34, 21}; + const int64 layouts[][4] = { + {3, 2, 1, 0}, {0, 2, 1, 3}, {0, 1, 2, 3}, {2, 0, 3, 1}, {1, 3, 0, 2}}; + for (const auto& layout : layouts) { + Shape shape = ShapeUtil::MakeShapeWithLayout( + primitive_util::NativeToPrimitiveType(), dimensions, layout); + auto blank = LiteralUtil::CreateFromShape(shape); + auto source = LiteralUtil::CreateFromShape(shape); + const int64 zero_base[] = {0, 0, 0, 0}; + const int64 step[] = {1, 1, 1, 1}; + uint32 seqnr = 0; + auto init_proc = [&](const std::vector& indexes) { + LiteralUtil::Set(source.get(), indexes, ++seqnr); + return true; + }; + + ShapeUtil::ForEachIndex(source->shape(), zero_base, dimensions, step, + init_proc); + + const int64 src_base[] = {3, 1, 5, 7}; + const int64 dest_base[] = {6, 4, 12, 2}; + const int64 copy_size[] = {7, 8, 11, 9}; + + TF_EXPECT_OK(LiteralUtil::Copy(*source, src_base, blank.get(), dest_base, + copy_size)); + std::vector source_indexes(TF_ARRAYSIZE(dimensions), 0); + std::vector blank_indexes(TF_ARRAYSIZE(dimensions), 0); + bool matched = true; + auto check_proc = [&](const std::vector& indexes) { + std::copy(indexes.begin(), indexes.end(), source_indexes.begin()); + std::transform(source_indexes.begin(), source_indexes.end(), src_base, + source_indexes.begin(), std::plus()); + std::copy(indexes.begin(), indexes.end(), blank_indexes.begin()); + std::transform(blank_indexes.begin(), blank_indexes.end(), dest_base, + blank_indexes.begin(), std::plus()); + auto bval = LiteralUtil::Get(*blank, blank_indexes); + matched = (bval != 0 && + bval == LiteralUtil::Get(*source, source_indexes)); + return matched; + }; + ShapeUtil::ForEachIndex(source->shape(), zero_base, copy_size, step, + check_proc); + EXPECT_TRUE(matched); + } +} + +TEST_F(LiteralUtilTest, CopyScalars) { + auto zero = LiteralUtil::CreateR0(0); + auto nine = 
LiteralUtil::CreateR0(9); + TF_EXPECT_OK(LiteralUtil::Copy(*nine, {}, zero.get(), {}, {})); + EXPECT_TRUE(LiteralUtil::Equal(*zero, *nine)); + + auto vect = LiteralUtil::CreateR1({3, 4, 9, 12, 5, 17, 21}); + TF_EXPECT_OK(LiteralUtil::Copy(*vect, {5}, zero.get(), {}, {})); + EXPECT_EQ(LiteralUtil::Get(*zero, {}), 17); + TF_EXPECT_OK(LiteralUtil::Copy(*zero, {}, vect.get(), {4}, {})); + EXPECT_EQ(LiteralUtil::Get(*vect, {4}), 17); +} + +TEST_F(LiteralUtilTest, F16) { + // Verify that the internal data views are consistent and that they + // are in little endian format + // TODO - modify if we make the data format machine endianness dependent + auto m1 = LiteralUtil::CreateFromShape(ShapeUtil::MakeShape(F16, {2, 2})); + Literal* l1 = m1.get(); + const char* d1 = static_cast(LiteralUtil::InternalData(*l1)); + EXPECT_EQ(d1[0], 0); + EXPECT_EQ(d1[1], 0); + EXPECT_EQ(d1[2], 0); + EXPECT_EQ(d1[3], 0); + EXPECT_EQ(d1[4], 0); + EXPECT_EQ(d1[5], 0); + EXPECT_EQ(d1[6], 0); + EXPECT_EQ(d1[7], 0); + EXPECT_EQ(LiteralUtil::InternalData(*l1), + LiteralUtil::MutableInternalData(l1)); + + half h1(1.0f); + half h2(2.0f); + auto m2 = LiteralUtil::CreateR2({{h1, h2}, {h2, h1}}); + Literal* l2 = m2.get(); + const char* d2 = static_cast(LiteralUtil::InternalData(*l2)); + EXPECT_EQ(d2[0], 0); + EXPECT_EQ(d2[1], 0x3C); + EXPECT_EQ(d2[2], 0); + EXPECT_EQ(d2[3], 0x40); + EXPECT_EQ(d2[4], 0); + EXPECT_EQ(d2[5], 0x40); + EXPECT_EQ(d2[6], 0); + EXPECT_EQ(d2[7], 0x3C); + EXPECT_EQ(LiteralUtil::InternalData(*l2), + LiteralUtil::MutableInternalData(l2)); +} + +TEST_F(LiteralUtilTest, Populate) { + struct PopulateData { + std::vector dimensions; + std::vector layout; + } populate_data[] = { + {{}, {}}, + {{0}, {0}}, + {{16}, {0}}, + {{2, 0}, {1, 0}}, + {{4, 16}, {1, 0}}, + {{21, 12}, {0, 1}}, + {{6, 11, 17}, {2, 0, 1}}, + {{6, 11, 5, 17}, {3, 2, 0, 1}}, + }; + for (const auto& data : populate_data) { + Shape shape = ShapeUtil::MakeShapeWithLayout( + primitive_util::NativeToPrimitiveType(), data.dimensions, + data.layout); + auto literal = LiteralUtil::CreateFromShape(shape); + auto generator = [&](tensorflow::gtl::ArraySlice indexes) -> uint32 { + // Offsets from linear index just to keep R0 literals from being + // initialized with zero.
+ return LiteralUtil::LinearIndex(*literal, indexes) + 17; + }; + TF_EXPECT_OK(LiteralUtil::Populate(literal.get(), generator)); + + std::vector zero_base(data.dimensions.size(), 0); + std::vector step(data.dimensions.size(), 1); + bool matched = true; + auto check_function = [&](const std::vector& indexes) { + auto value = LiteralUtil::Get(*literal, indexes); + matched = matched && (value == generator(indexes)); + return matched; + }; + ShapeUtil::ForEachIndex(literal->shape(), zero_base, data.dimensions, step, + check_function); + EXPECT_TRUE(matched); + } +} + +TEST_F(LiteralUtilTest, ConvertR4) { + // clang-format off + auto original = LiteralUtil::CreateR4WithLayout({{ + {{10, 11, 12, 13}, {14, 15, 16, 17}}, + {{18, 19, 20, 21}, {22, 23, 24, 25}}, + {{26, 27, 28, 29}, {30, 31, 32, 33}}, + }}, layout_r4_dim0major_); + auto expected = LiteralUtil::CreateR4WithLayout({{ + {{10, 11, 12, 13}, {14, 15, 16, 17}}, + {{18, 19, 20, 21}, {22, 23, 24, 25}}, + {{26, 27, 28, 29}, {30, 31, 32, 33}}, + }}, layout_r4_dim0major_); + // clang-format on + auto converted = LiteralUtil::Convert(*original); + + EXPECT_TRUE(LiteralUtil::Equal(*expected, *converted)); +} + +TEST_F(LiteralUtilTest, ConvertIfTypesMatch) { + // clang-format off + auto s8 = LiteralUtil::CreateR4WithLayout({{ + {{10, 0, 12, 0}, {0, 15, 0, 17}}, + {{0, 19, 0, 21}, {22, 0, 24, 0}}, + {{26, 0, 28, 0}, {0, 31, 0, 33}}, + }}, layout_r4_dim0major_); + auto s32 = LiteralUtil::CreateR4WithLayout({{ + {{10, 0, 12, 0}, {0, 15, 0, 17}}, + {{0, 19, 0, 21}, {22, 0, 24, 0}}, + {{26, 0, 28, 0}, {0, 31, 0, 33}}, + }}, layout_r4_dim0major_); + auto u32 = LiteralUtil::CreateR4WithLayout({{ + {{10, 0, 12, 0}, {0, 15, 0, 17}}, + {{0, 19, 0, 21}, {22, 0, 24, 0}}, + {{26, 0, 28, 0}, {0, 31, 0, 33}}, + }}, layout_r4_dim0major_); + auto s64 = LiteralUtil::CreateR4WithLayout({{ + {{10, 0, 12, 0}, {0, 15, 0, 17}}, + {{0, 19, 0, 21}, {22, 0, 24, 0}}, + {{26, 0, 28, 0}, {0, 31, 0, 33}}, + }}, layout_r4_dim0major_); + auto u64 = LiteralUtil::CreateR4WithLayout({{ + {{10, 0, 12, 0}, {0, 15, 0, 17}}, + {{0, 19, 0, 21}, {22, 0, 24, 0}}, + {{26, 0, 28, 0}, {0, 31, 0, 33}}, + }}, layout_r4_dim0major_); + auto pred = LiteralUtil::CreateR4WithLayout({{ + {{true, false, true, false}, {false, true, false, true}}, + {{false, true, false, true}, {true, false, true, false}}, + {{true, false, true, false}, {false, true, false, true}}, + }}, layout_r4_dim0major_); + auto int32_pred = LiteralUtil::CreateR4WithLayout({{ + {{1, 0, 1, 0}, {0, 1, 0, 1}}, + {{0, 1, 0, 1}, {1, 0, 1, 0}}, + {{1, 0, 1, 0}, {0, 1, 0, 1}}, + }}, layout_r4_dim0major_); + auto f32 = LiteralUtil::CreateR4WithLayout({{ + {{10.0f, 0.0f, 12.0f, 0.0f}, {0.0f, 15.0f, 0.0f, 17.0f}}, + {{0.0f, 19.0f, 0.0f, 21.0f}, {22.0f, 0.0f, 24.0f, 0.0f}}, + {{26.0f, 0.0f, 28.0f, 0.0f}, {0.0f, 31.0f, 0.0f, 33.0f}}, + }}, layout_r4_dim0major_); + auto f64 = LiteralUtil::CreateR4WithLayout({{ + {{10.0, 0.0, 12.0, 0.0}, {0.0, 15.0, 0.0, 17.0}}, + {{0.0, 19.0, 0.0, 21.0}, {22.0, 0.0, 24.0, 0.0}}, + {{26.0, 0.0, 28.0, 0.0}, {0.0, 31.0, 0.0, 33.0}}, + }}, layout_r4_dim0major_); + // clang-format on + std::unique_ptr conv; + + conv = LiteralUtil::ConvertIfSrcTypeMatches(*s8, U32).ConsumeValueOrDie(); + EXPECT_TRUE(LiteralUtil::Equal(*conv, *u32)); + + conv = LiteralUtil::ConvertIfSrcTypeMatches(*s8, S32).ConsumeValueOrDie(); + EXPECT_TRUE(LiteralUtil::Equal(*conv, *s32)); + + conv = LiteralUtil::ConvertIfSrcTypeMatches(*s8, U64).ConsumeValueOrDie(); + EXPECT_TRUE(LiteralUtil::Equal(*conv, *u64)); + + conv = 
LiteralUtil::ConvertIfSrcTypeMatches(*s8, S64).ConsumeValueOrDie(); + EXPECT_TRUE(LiteralUtil::Equal(*conv, *s64)); + + conv = LiteralUtil::ConvertIfSrcTypeMatches(*s8, PRED).ConsumeValueOrDie(); + EXPECT_TRUE(LiteralUtil::Equal(*conv, *pred)); + + conv = LiteralUtil::ConvertIfSrcTypeMatches(*pred, S32).ConsumeValueOrDie(); + EXPECT_TRUE(LiteralUtil::Equal(*conv, *int32_pred)); + + conv = LiteralUtil::ConvertIfSrcTypeMatches(*f32, S32).ConsumeValueOrDie(); + EXPECT_TRUE(LiteralUtil::Equal(*conv, *s32)); + + conv = LiteralUtil::ConvertIfSrcTypeMatches(*f64, S32).ConsumeValueOrDie(); + EXPECT_TRUE(LiteralUtil::Equal(*conv, *s32)); + + conv = LiteralUtil::ConvertIfSrcTypeMatches(*s32, F32).ConsumeValueOrDie(); + EXPECT_TRUE(LiteralUtil::Equal(*conv, *f32)); + + EXPECT_EQ(LiteralUtil::ConvertIfSrcTypeMatches(*s32, TUPLE).status().code(), + tensorflow::error::INVALID_ARGUMENT); + EXPECT_EQ(LiteralUtil::ConvertIfSrcTypeMatches(*s32, F16).status().code(), + tensorflow::error::INVALID_ARGUMENT); + EXPECT_EQ(LiteralUtil::ConvertIfSrcTypeMatches(*s32, S16).status().code(), + tensorflow::error::INVALID_ARGUMENT); + EXPECT_EQ(LiteralUtil::ConvertIfSrcTypeMatches(*s32, U16).status().code(), + tensorflow::error::INVALID_ARGUMENT); +} + +TEST_F(LiteralUtilTest, CopyFromProto_Bool) { + LiteralProto p; + p.mutable_shape()->set_element_type(PRED); + for (int len = 0; len < 25; ++len) { + p.mutable_shape()->clear_dimensions(); + p.mutable_shape()->add_dimensions(len); + p.clear_preds(); + for (int i = 0; i < len; ++i) { + p.add_preds((i % 2) == (len % 2)); + } + + Literal literal(p); + ASSERT_EQ(len, literal.preds_size()); + int i = 0; + for (auto it = literal.preds().begin(); it < literal.preds().end(); ++it) { + EXPECT_EQ((i % 2) == (len % 2), *it); + ++i; + } + } +} + +// Note that f16 is currently stored in a byte array in little endian byte order +TEST_F(LiteralUtilTest, ToProto_f16) { + half h1(1.0f); + half h2(2.0f); + + auto m = Literal::CreateR2({{h1, h2}, {h2, h1}}); + Literal* l = m.get(); + EXPECT_EQ(4, ShapeUtil::ElementsIn(l->shape())); + EXPECT_EQ(4, l->f16s().size()); + EXPECT_EQ(4, l->f16s_size()); + + LiteralProto p = l->ToProto(); + EXPECT_EQ(4, ShapeUtil::ElementsIn(p.shape())); + EXPECT_EQ(8, p.f16s().size()); + const char* d = p.f16s().data(); + EXPECT_EQ(d[0], 0); + EXPECT_EQ(d[1], 0x3C); + EXPECT_EQ(d[2], 0); + EXPECT_EQ(d[3], 0x40); + EXPECT_EQ(d[4], 0); + EXPECT_EQ(d[5], 0x40); + EXPECT_EQ(d[6], 0); + EXPECT_EQ(d[7], 0x3C); +} + +// Note that f16 is currently stored in a byte array in little endian byte order +TEST_F(LiteralUtilTest, CopyFromProto_f16) { + half h1(1.0f); + half h2(2.0f); + + const char half_vals[8] = { + 0x00, 0x3C, 0x00, 0x40, 0x00, 0x40, 0x00, 0x3C + }; + LiteralProto p; + p.mutable_shape()->set_element_type(F16); + p.mutable_shape()->clear_dimensions(); + p.mutable_shape()->add_dimensions(4); + p.clear_f16s(); + p.set_f16s(half_vals, 8); + + + Literal literal(p); + ASSERT_EQ(4, literal.f16s_size()); + ASSERT_EQ(h1, literal.f16s(0)); + ASSERT_EQ(h2, literal.f16s(1)); + ASSERT_EQ(h2, literal.f16s(2)); + ASSERT_EQ(h1, literal.f16s(3)); + + const std::vector& r = literal.f16s(); + ASSERT_EQ(4, r.size()); + ASSERT_EQ(h1, r[0]); + ASSERT_EQ(h2, r[1]); + ASSERT_EQ(h2, r[2]); + ASSERT_EQ(h1, r[3]); +} + + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/metric_table_report.cc b/tensorflow/compiler/xla/metric_table_report.cc index cd7c42f6e17..fed0e58e66a 100644 --- a/tensorflow/compiler/xla/metric_table_report.cc +++ 
b/tensorflow/compiler/xla/metric_table_report.cc
@@ -38,7 +38,8 @@ void MetricTableReport::SetEntryName(string entry_name) {
 
 void MetricTableReport::SetShowAllEntries() {
   max_entries_to_show_ = std::numeric_limits<int64>::max();
-  max_metric_proportion_to_show = 1.1;  // more than 100%
+  max_entries_per_category_to_show_ = std::numeric_limits<int64>::max();
+  max_metric_proportion_to_show_ = 1.1;  // more than 100%
 }
 
 void MetricTableReport::SetShowCategoryTable() { show_category_table_ = true; }
@@ -141,7 +142,7 @@ void MetricTableReport::AppendCategoryTable() {
   int64 categories_shown = 0;
   for (const auto& category : categories) {
     if (categories_shown >= max_entries_to_show_ ||
-        metric_sum / expected_metric_sum_ > max_metric_proportion_to_show) {
+        metric_sum / expected_metric_sum_ > max_metric_proportion_to_show_) {
       break;
     }
     ++categories_shown;
@@ -149,22 +150,21 @@ void MetricTableReport::AppendCategoryTable() {
 
     // Show the category.
     string text = category.category_text;
-    if (text == "") {
+    if (text.empty()) {
       text = "[no category]";
     }
     tensorflow::strings::StrAppend(&text, " (", category.entries.size(), " ",
                                    entry_name_, ")");
     AppendTableRow(text, category.metric_sum, metric_sum);
 
-    // Show the top few entries in the category.
-    const int64 kMaxToShow = 5;
+    // Show the top entries in the category.
     const char* const kIndentPrefix = "  * ";
-    int64 entries_to_show =
-        std::min<int64>(kMaxToShow, category.entries.size());
-    if (category.entries.size() == kMaxToShow + 1) {
+    int64 entries_to_show = std::min<int64>(max_entries_per_category_to_show_,
+                                            category.entries.size());
+    if (category.entries.size() == entries_to_show + 1) {
       // May as well show the last entry on the line that would otherwise say
       // that there is a single entry not shown.
-      entries_to_show = category.entries.size();
+      ++entries_to_show;
     }
     for (int64 i = 0; i < entries_to_show; ++i) {
       AppendLine(kIndentPrefix, MetricPercent(category.entries[i]->metric), " ",
@@ -193,14 +193,14 @@ void MetricTableReport::AppendEntryTable() {
   int64 entries_shown = 0;
   for (const auto& entry : entries_) {
     if (entries_shown >= max_entries_to_show_ ||
-        metric_sum / expected_metric_sum_ > max_metric_proportion_to_show) {
+        metric_sum / expected_metric_sum_ > max_metric_proportion_to_show_) {
       break;
     }
     ++entries_shown;
     metric_sum += entry.metric;
 
     string text = entry.text;
-    if (text == "") {
+    if (text.empty()) {
       text = "[no entry text]";
     }
     AppendTableRow(text, entry.metric, metric_sum);
@@ -220,7 +220,14 @@ void MetricTableReport::AppendTableRow(const string& text, const double metric,
   const int64 max_metric_string_size =
       MetricString(expected_metric_sum_).size();
   string metric_string = MetricString(metric);
-  string padding(max_metric_string_size - metric_string.size() + 1, ' ');
+
+  // Don't try to make a gigantic string and crash if expected_metric_sum_ is
+  // wrong somehow.
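+  // (Illustrative: metric_string.size() is unsigned, so if it ever exceeded
+  // max_metric_string_size the old unchecked subtraction would wrap around to
+  // a huge value and the string fill-constructor would attempt an enormous
+  // allocation. The clamp below avoids that.)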
+ int64 padding_len = 1; + if (max_metric_string_size >= metric_string.size()) { + padding_len += max_metric_string_size - metric_string.size(); + } + string padding(padding_len, ' '); AppendLine(padding, metric_string, " (", MetricPercent(metric), " Σ", MetricPercent(running_metric_sum), ") ", text); } diff --git a/tensorflow/compiler/xla/metric_table_report.h b/tensorflow/compiler/xla/metric_table_report.h index e967627bff4..818fb1d3fe0 100644 --- a/tensorflow/compiler/xla/metric_table_report.h +++ b/tensorflow/compiler/xla/metric_table_report.h @@ -103,6 +103,7 @@ class MetricTableReport { private: static constexpr double kDefaultMaxMetricProportionToShow = 0.99; static constexpr int64 kDefaultMaxEntriesToShow = 100; + static constexpr int64 kDefaultMaxEntriesPerCategoryToShow = 5; // Append all parameters to the report. template @@ -162,7 +163,8 @@ class MetricTableReport { // These members control how many categories and entries to show in tables. int64 max_entries_to_show_ = kDefaultMaxEntriesToShow; - double max_metric_proportion_to_show = kDefaultMaxMetricProportionToShow; + int64 max_entries_per_category_to_show_ = kDefaultMaxEntriesPerCategoryToShow; + double max_metric_proportion_to_show_ = kDefaultMaxMetricProportionToShow; // The report that is being created. string report_; diff --git a/tensorflow/compiler/xla/packed_literal_reader.cc b/tensorflow/compiler/xla/packed_literal_reader.cc index 21766a2a0c8..d488830a6cd 100644 --- a/tensorflow/compiler/xla/packed_literal_reader.cc +++ b/tensorflow/compiler/xla/packed_literal_reader.cc @@ -60,8 +60,8 @@ StatusOr> PackedLiteralReader::Read( int64 elements = ShapeUtil::ElementsIn(shape); LiteralUtil::Resize(elements, std::numeric_limits::quiet_NaN(), result.get()); - tensorflow::protobuf::RepeatedField* field = result->mutable_f32s(); - char* data = tensorflow::bit_cast(field->mutable_data()); + std::vector* field = result->mutable_f32s(); + char* data = tensorflow::bit_cast(field->data()); uint64 bytes = elements * sizeof(float); tensorflow::StringPiece sp; auto s = file_->Read(offset_, bytes, &sp, data); diff --git a/tensorflow/compiler/xla/packed_literal_reader.h b/tensorflow/compiler/xla/packed_literal_reader.h index 563d978cf5d..45a9fe01278 100644 --- a/tensorflow/compiler/xla/packed_literal_reader.h +++ b/tensorflow/compiler/xla/packed_literal_reader.h @@ -18,6 +18,7 @@ limitations under the License. #include +#include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" diff --git a/tensorflow/compiler/xla/port/BUILD b/tensorflow/compiler/xla/port/BUILD deleted file mode 100644 index 6fc5f1185c9..00000000000 --- a/tensorflow/compiler/xla/port/BUILD +++ /dev/null @@ -1,33 +0,0 @@ -licenses(["notice"]) # Apache 2.0 - -# Filegroup used to collect source files for dependency checking. 
-filegroup( - name = "c_srcs", - data = glob([ - "**/*.cc", - "**/*.h", - ]), - visibility = ["//tensorflow/compiler/xla:internal"], -) - -cc_library( - name = "initialize", - hdrs = ["initialize.h"], - visibility = [ - "//tensorflow/compiler/xla:__subpackages__", - ], -) - -# ----------------------------------------------------------------------------- - -filegroup( - name = "all_files", - srcs = glob( - ["**/*"], - exclude = [ - "**/METADATA", - "**/OWNERS", - ], - ), - visibility = ["//tensorflow:__subpackages__"], -) diff --git a/tensorflow/compiler/xla/port/initialize.h b/tensorflow/compiler/xla/port/initialize.h deleted file mode 100644 index 13d9632f97c..00000000000 --- a/tensorflow/compiler/xla/port/initialize.h +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_XLA_PORT_INITIALIZE_H_ -#define TENSORFLOW_COMPILER_XLA_PORT_INITIALIZE_H_ - -#undef REGISTER_MODULE_INITIALIZER - -namespace xla { - -class Initializer { - public: - typedef void (*InitializerFunc)(); - explicit Initializer(InitializerFunc func) { func(); } -}; - -} // namespace xla - -#define REGISTER_INITIALIZER(type, name, body) \ - static void google_init_##type##_##name() { body; } \ - xla::Initializer google_initializer_##type##_##name( \ - google_init_##type##_##name) - -#define REGISTER_MODULE_INITIALIZER(name, body) \ - REGISTER_INITIALIZER(module, name, body) - -#endif // TENSORFLOW_COMPILER_XLA_PORT_INITIALIZE_H_ diff --git a/tensorflow/compiler/xla/primitive_util.cc b/tensorflow/compiler/xla/primitive_util.cc index e3909ae8e97..e4e37177a2d 100644 --- a/tensorflow/compiler/xla/primitive_util.cc +++ b/tensorflow/compiler/xla/primitive_util.cc @@ -78,6 +78,11 @@ PrimitiveType NativeToPrimitiveType() { return F64; } +template <> +PrimitiveType NativeToPrimitiveType() { + return F16; +} + bool IsFloatingPointType(PrimitiveType type) { return type == F16 || type == F32 || type == F64; } diff --git a/tensorflow/compiler/xla/primitive_util.h b/tensorflow/compiler/xla/primitive_util.h index 78f0ee6f592..162a11c7d29 100644 --- a/tensorflow/compiler/xla/primitive_util.h +++ b/tensorflow/compiler/xla/primitive_util.h @@ -75,6 +75,8 @@ template <> PrimitiveType NativeToPrimitiveType(); template <> PrimitiveType NativeToPrimitiveType(); +template <> +PrimitiveType NativeToPrimitiveType(); bool IsFloatingPointType(PrimitiveType type); @@ -150,6 +152,10 @@ template <> struct PrimitiveTypeToNative { using type = double; }; +template <> +struct PrimitiveTypeToNative { + using type = half; +}; } // namespace primitive_util } // namespace xla diff --git a/tensorflow/compiler/xla/protobuf_util.cc b/tensorflow/compiler/xla/protobuf_util.cc index adb2e99ad25..cdc4139cd69 100644 --- a/tensorflow/compiler/xla/protobuf_util.cc +++ b/tensorflow/compiler/xla/protobuf_util.cc @@ -14,7 +14,13 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/xla/protobuf_util.h" + +#include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/protobuf.h" namespace xla { namespace protobuf_util { @@ -31,5 +37,35 @@ bool ProtobufEquals(const tensorflow::protobuf::Message& m1, return (serialized1 == serialized2); } +StatusOr ToJson(const tensorflow::protobuf::Message& message) { + string json_output; + tensorflow::protobuf::util::JsonPrintOptions json_options; + json_options.add_whitespace = true; + json_options.always_print_primitive_fields = true; + auto status = tensorflow::protobuf::util::MessageToJsonString( + message, &json_output, json_options); + if (!status.ok()) { + return InternalError("MessageToJsonString failed: %s", + status.error_message().data()); + } + return json_output; +} + +Status DumpJsonToDirectory(const tensorflow::protobuf::Message& message, + const string& directory, const string& file_name) { + TF_ASSIGN_OR_RETURN(const string json_output, ToJson(message)); + + tensorflow::Env* env = tensorflow::Env::Default(); + TF_RETURN_IF_ERROR(env->RecursivelyCreateDir(directory)); + string safe_file_name = file_name + ".json"; + for (char& c : safe_file_name) { + if (c == '/' || c == '\\') { + c = '_'; + } + } + const string path = tensorflow::io::JoinPath(directory, safe_file_name); + return tensorflow::WriteStringToFile(env, path, json_output); +} + } // namespace protobuf_util } // namespace xla diff --git a/tensorflow/compiler/xla/protobuf_util.h b/tensorflow/compiler/xla/protobuf_util.h index 36247f1bdec..1a895c35859 100644 --- a/tensorflow/compiler/xla/protobuf_util.h +++ b/tensorflow/compiler/xla/protobuf_util.h @@ -16,6 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_PROTOBUF_UTIL_H_ #define TENSORFLOW_COMPILER_XLA_PROTOBUF_UTIL_H_ +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/platform/protobuf.h" namespace xla { @@ -29,6 +31,17 @@ namespace protobuf_util { // base, this form of equality checking is sufficient. extern bool ProtobufEquals(const tensorflow::protobuf::Message& m1, const tensorflow::protobuf::Message& m2); + +// Returns 'message' as a JSON string. +StatusOr ToJson(const tensorflow::protobuf::Message& message); + +// Converts 'message' to JSON, and dumps it to the path formed by joining +// 'directory/file_name.json'. The 'directory' is recursively created if it +// doesn't already exist, and the 'file_name' is sanitized by replacing illegal +// characters with underscore '_'. +Status DumpJsonToDirectory(const tensorflow::protobuf::Message& message, + const string& directory, const string& file_name); + } // namespace protobuf_util } // namespace xla diff --git a/tensorflow/compiler/xla/reference_util.cc b/tensorflow/compiler/xla/reference_util.cc index 142d2c2163f..e8de559a5ef 100644 --- a/tensorflow/compiler/xla/reference_util.cc +++ b/tensorflow/compiler/xla/reference_util.cc @@ -16,6 +16,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/reference_util.h" #include +#include #include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h" @@ -134,12 +135,11 @@ ReferenceUtil::SeparableConvArray4D(const Array4D& input, return tensorflow::MathUtil::CeilOfRatio(unpadded_width, stride); } -/* static */ std::unique_ptr> ReferenceUtil::ReduceWindow4DAdd( - const Array4D& operand, float init, +/* static */ std::unique_ptr> ReferenceUtil::ReduceWindow2DAdd( + const Array2D& operand, float init, const tensorflow::gtl::ArraySlice& window, const tensorflow::gtl::ArraySlice& stride, Padding padding) { - std::vector dim_lengths{operand.n1(), operand.n2(), operand.n3(), - operand.n4()}; + std::vector dim_lengths{operand.height(), operand.width()}; auto padding_both = xla::MakePadding(dim_lengths, window, stride, padding); std::vector window_counts(window.size(), 0); @@ -149,6 +149,61 @@ ReferenceUtil::SeparableConvArray4D(const Array4D& input, WindowCount(dim_lengths[i], window[i], stride[i], padding); pad_low[i] = padding_both[i].first; } + auto result = MakeUnique>(window_counts[0], window_counts[1]); + + // Do a full 2D reduce window. + for (int64 i0 = 0; i0 < window_counts[0]; ++i0) { + for (int64 i1 = 0; i1 < window_counts[1]; ++i1) { + int64 i0_base = i0 * stride[0] - pad_low[0]; + int64 i1_base = i1 * stride[1] - pad_low[1]; + + float val = init; + for (int64 i0_win = 0; i0_win < window[0]; ++i0_win) { + for (int64 i1_win = 0; i1_win < window[1]; ++i1_win) { + if (i0_base + i0_win >= 0 && i1_base + i1_win >= 0 && + i0_base + i0_win < operand.n1() && + i1_base + i1_win < operand.n2()) { + val += operand(i0_base + i0_win, i1_base + i1_win); + } + } + } + (*result)(i0, i1) = val; + } + } + return result; +} + +/* static */ std::unique_ptr> +ReferenceUtil::ReduceWindow4DGeneric( + const Array4D& operand, float init, + const std::function& reduce_func, + const tensorflow::gtl::ArraySlice& window, + const tensorflow::gtl::ArraySlice& stride, Padding padding) { + std::vector dim_lengths{operand.n1(), operand.n2(), operand.n3(), + operand.n4()}; + return ReduceWindow4DGeneric( + operand, init, reduce_func, window, stride, + xla::MakePadding(dim_lengths, window, stride, padding)); +} + +/* static */ std::unique_ptr> +ReferenceUtil::ReduceWindow4DGeneric( + const Array4D& operand, float init, + const std::function& reduce_func, + const tensorflow::gtl::ArraySlice& window, + const tensorflow::gtl::ArraySlice& stride, + const tensorflow::gtl::ArraySlice>& padding) { + std::vector dim_lengths{operand.n1(), operand.n2(), operand.n3(), + operand.n4()}; + + std::vector window_counts(window.size(), 0); + std::vector pad_low(window.size(), 0); + for (int64 i = 0; i < window.size(); ++i) { + int64 padded_width = padding[i].first + dim_lengths[i] + padding[i].second; + window_counts[i] = + window_util::StridedBound(padded_width, window[i], stride[i]); + pad_low[i] = padding[i].first; + } auto result = MakeUnique>(window_counts[0], window_counts[1], window_counts[2], window_counts[3]); // Do a full 4D reduce window. 
@@ -172,8 +227,9 @@ ReferenceUtil::SeparableConvArray4D(const Array4D<float>& input,
                 i1_base + i1_win < operand.n2() &&
                 i2_base + i2_win < operand.n3() &&
                 i3_base + i3_win < operand.n4()) {
-              val += operand(i0_base + i0_win, i1_base + i1_win,
-                             i2_base + i2_win, i3_base + i3_win);
+              val = reduce_func(
+                  val, operand(i0_base + i0_win, i1_base + i1_win,
+                               i2_base + i2_win, i3_base + i3_win));
             }
           }
         }
@@ -187,6 +243,15 @@ ReferenceUtil::SeparableConvArray4D(const Array4D<float>& input,
   return result;
 }
 
+/* static */ std::unique_ptr<Array4D<float>> ReferenceUtil::ReduceWindow4DAdd(
+    const Array4D<float>& operand, float init,
+    const tensorflow::gtl::ArraySlice<int64>& window,
+    const tensorflow::gtl::ArraySlice<int64>& stride, Padding padding) {
+  const auto add_reduce = [](float arg1, float arg2) { return arg1 + arg2; };
+  return ReduceWindow4DGeneric(operand, init, add_reduce, window, stride,
+                               padding);
+}
+
 /* static */ std::unique_ptr<Array4D<float>>
 ReferenceUtil::SelectAndScatter4DGePlus(
     const Array4D<float>& operand, const Array4D<float>& source, float init,
@@ -267,7 +332,8 @@ ReferenceUtil::ConvArray4DGeneralDimensions(
     std::pair<int64, int64> kernel_stride, Padding padding,
     ConvolutionDimensionNumbers dimension_numbers) {
   return ConvArray4DGeneralDimensionsDilated(lhs, rhs, kernel_stride, padding,
-                                             {1, 1}, {1, 1}, dimension_numbers);
+                                             {1, 1}, {1, 1},
+                                             std::move(dimension_numbers));
 }
 
 /* static */ std::unique_ptr<Array4D<float>>
@@ -335,32 +401,57 @@ ReferenceUtil::ConvArray4DGeneralDimensionsDilated(
                                      result_dimensions[2], result_dimensions[3]);
   result->Fill(0.0);
 
+  const auto is_int32 = [](int64 x) {
+    return x >= std::numeric_limits<int32>::min() &&
+           x <= std::numeric_limits<int32>::max();
+  };
+
+  // 64-bit idiv/imod are much more expensive than 32-bit idiv/imod (at least
+  // on x86-64), so we avoid them where possible.
+  const auto fast_idiv64 = [&](int64 a, int64 b) {
+    if (is_int32(a) && is_int32(b)) {
+      return static_cast<int64>(static_cast<int32>(a) / static_cast<int32>(b));
+    }
+    return a / b;
+  };
+  const auto fast_imod64 = [&](int64 a, int64 b) {
+    if (is_int32(a) && is_int32(b)) {
+      return static_cast<int64>(static_cast<int32>(a) % static_cast<int32>(b));
+    }
+    return a % b;
+  };
+
   // Lambda to access the lhs operand at the given 4D index.
   const auto lhs_element = [&](int64 batch, int64 feature, int64 height,
                                int64 width) {
-    if (height % dy != 0 || width % dx != 0) {
+    if (fast_imod64(height, dy) != 0 || fast_imod64(width, dx) != 0) {
       return 0.0f;
     }
 
     std::array<int64, 4> index;
     index[dnums.batch_dimension()] = batch;
     index[dnums.feature_dimension()] = feature;
-    index[dnums.spatial_dimensions(0)] = height / dy;
-    index[dnums.spatial_dimensions(1)] = width / dx;
+    index[dnums.spatial_dimensions(0)] = fast_idiv64(height, dy);
+    index[dnums.spatial_dimensions(1)] = fast_idiv64(width, dx);
     return lhs(index[0], index[1], index[2], index[3]);
   };
 
-  // Lambda to access the rhs operand at the given 4D index.
-  const auto rhs_element = [&](int64 kernel_output_feature,
-                               int64 kernel_input_feature, int64 height,
-                               int64 width) {
-    CHECK_EQ(height % dky, 0);
-    CHECK_EQ(width % dkx, 0);
+  // Lambda to access the rhs operand at the given 4D index.  height_over_dky
+  // should be equal to height / dky, and width_over_dkx should be equal to
+  // width / dkx.  (This is an optimization to avoid doing divisions.)
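+  // For example, with dky == 2 the caller's loop visits height == 0, 2, 4, ...
+  // while height_over_dky counts 0, 1, 2, ..., so no division has to be
+  // performed inside the inner loop.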
+ const auto rhs_element = [&]( + int64 kernel_output_feature, int64 kernel_input_feature, int64 height, + int64 width, int64 height_over_dky, int64 width_over_dkx) { + DCHECK_EQ(height % dky, 0); + DCHECK_EQ(width % dkx, 0); + DCHECK_EQ(height / dky, height_over_dky); + DCHECK_EQ(width / dkx, width_over_dkx); + std::array index; index[dnums.kernel_output_feature_dimension()] = kernel_output_feature; index[dnums.kernel_input_feature_dimension()] = kernel_input_feature; - index[dnums.kernel_spatial_dimensions(0)] = height / dky; - index[dnums.kernel_spatial_dimensions(1)] = width / dkx; + index[dnums.kernel_spatial_dimensions(0)] = height_over_dky; + index[dnums.kernel_spatial_dimensions(1)] = width_over_dkx; return rhs(index[0], index[1], index[2], index[3]); }; @@ -380,14 +471,17 @@ ReferenceUtil::ConvArray4DGeneralDimensionsDilated( for (int64 sample = 0; sample < samples; ++sample) { for (int64 izi = 0; izi < iz; ++izi) { for (int64 ozi = 0; ozi < oz; ++ozi) { - for (int64 kyi = 0; kyi < ky; kyi += dky) { - for (int64 kxi = 0; kxi < kx; kxi += dkx) { + for (int64 kyi = 0, kyi_over_dky = 0; kyi < ky; + kyi += dky, kyi_over_dky++) { + for (int64 kxi = 0, kxi_over_dkx = 0; kxi < kx; + kxi += dkx, kxi_over_dkx++) { int64 iyi = istarty + ksy * oyi + kyi; int64 ixi = istartx + ksx * oxi + kxi; float input = (iyi >= iy || ixi >= ix || iyi < 0 || ixi < 0) ? 0.0 : lhs_element(sample, izi, iyi, ixi); - float gain = rhs_element(ozi, izi, kyi, kxi); + float gain = + rhs_element(ozi, izi, kyi, kxi, kyi_over_dky, kxi_over_dkx); float addend = input * gain; result_element(sample, ozi, oyi, oxi) += addend; } @@ -571,4 +665,49 @@ ReferenceUtil::ReduceToRowArray2D( return result; } +/* static */ Array4D ReferenceUtil::PadArray4D( + const Array4D& operand, const PaddingConfig& padding, + const float pad) { + CHECK_EQ(padding.dimensions_size(), 4); + + const std::vector input_bounds = {operand.n1(), operand.n2(), + operand.n3(), operand.n4()}; + std::vector pad_low(4); + std::vector pad_high(4); + std::vector pad_interior(4); + std::vector output_bounds(4); + for (int64 i = 0; i < 4; ++i) { + pad_low[i] = padding.dimensions(i).edge_padding_low(); + pad_high[i] = padding.dimensions(i).edge_padding_high(); + CHECK_LE(0, padding.dimensions(i).interior_padding()) << "not implemented"; + pad_interior[i] = padding.dimensions(i).interior_padding(); + + output_bounds[i] = pad_low[i] + input_bounds[i] + pad_high[i] + + (input_bounds[i] - 1) * pad_interior[i]; + } + + Array4D result(output_bounds[0], output_bounds[1], output_bounds[2], + output_bounds[3]); + result.Each([&](tensorflow::gtl::ArraySlice indices, float* value) { + for (int i = 0; i < 4; ++i) { + bool in_low_padding = indices[i] < pad_low[i]; + bool in_high_padding = indices[i] >= output_bounds[i] - pad_high[i]; + if (in_low_padding || in_high_padding) { + *value = pad; + return; + } + if (pad_interior[i] && + (indices[i] - pad_low[i]) % (pad_interior[i] + 1)) { + *value = pad; + return; + } + } + *value = operand((indices[0] - pad_low[0]) / (pad_interior[0] + 1), + (indices[1] - pad_low[1]) / (pad_interior[1] + 1), + (indices[2] - pad_low[2]) / (pad_interior[2] + 1), + (indices[3] - pad_low[3]) / (pad_interior[3] + 1)); + }); + return result; +} + } // namespace xla diff --git a/tensorflow/compiler/xla/reference_util.h b/tensorflow/compiler/xla/reference_util.h index d19d5f9dbb6..f58f0bdc9f5 100644 --- a/tensorflow/compiler/xla/reference_util.h +++ b/tensorflow/compiler/xla/reference_util.h @@ -144,12 +144,31 @@ class ReferenceUtil { static int64 
WindowCount(int64 unpadded_width, int64 window_len, int64 stride, Padding padding); + // Performs a 2D window reduction with Add as the function to apply. + static std::unique_ptr> ReduceWindow2DAdd( + const Array2D& operand, float init, + const tensorflow::gtl::ArraySlice& window, + const tensorflow::gtl::ArraySlice& stride, Padding padding); + // Performs a 4D window reduction with Add as the function to apply. static std::unique_ptr> ReduceWindow4DAdd( const Array4D& operand, float init, const tensorflow::gtl::ArraySlice& window, const tensorflow::gtl::ArraySlice& stride, Padding padding); + // Performs a 4D window reduction with a generic reduce function. + static std::unique_ptr> ReduceWindow4DGeneric( + const Array4D& operand, float init, + const std::function& reduce_func, + const tensorflow::gtl::ArraySlice& window, + const tensorflow::gtl::ArraySlice& stride, Padding padding); + static std::unique_ptr> ReduceWindow4DGeneric( + const Array4D& operand, float init, + const std::function& reduce_func, + const tensorflow::gtl::ArraySlice& window, + const tensorflow::gtl::ArraySlice& stride, + const tensorflow::gtl::ArraySlice>& padding); + // Performs select and scatter with Greater Than or equal as the select, plus // as the scatter, and Same Padding. static std::unique_ptr> SelectAndScatter4DGePlus( @@ -382,7 +401,51 @@ class ReferenceUtil { const Array2D& operand, const PaddingConfig& padding, const float pad); + // Returns the result of a 4D pad on an input array. + static Array4D PadArray4D(const Array4D& operand, + const PaddingConfig& padding, + const float pad); + + // ApplyElementwise2D(f, x, y, ...) returns the Array2D formed by running + // f(x[i], y[i], ...) for each array element in the Array2Ds x, y, .... + // + // The given arrays must have the same size and element type, and the return + // type of f must be implicitly convertible to the arrays' element type. + // + // Example usage: + // + // Array2D x, y, z = ...; + // std::unique_ptr result = ReferenceUtil::ApplyElementwise2D( + // [](float a, float b, float c) { return a * b + c; }, x, y, z); + // + template + static std::unique_ptr> ApplyElementwise2D( + F&& f, const Array2D& array1, const Array2D&... arrays) { + AssertSameSize2D(array1, arrays...); + auto result = MakeUnique>(array1.n1(), array1.n2()); + for (int64 i = 0; i < array1.n1(); ++i) { + for (int64 j = 0; j < array1.n2(); ++j) { + (*result)(i, j) = f(array1(i, j), arrays(i, j)...); + } + } + return result; + } + private: + template + static void AssertSameSize2D(const Array2D& array1, + const Array2D& array2, + const Array2D&... arrays) { + static_assert(std::is_same::value, "Args must be same type."); + CHECK_EQ(array1.n1(), array2.n1()); + CHECK_EQ(array1.n2(), array2.n2()); + AssertSameSize2D(array2, arrays...); + } + + // Recursive base case for AssertSameSize2D. + template + static void AssertSameSize2D(const Array1& array1) {} + TF_DISALLOW_COPY_AND_ASSIGN(ReferenceUtil); }; diff --git a/tensorflow/compiler/xla/reference_util_test.cc b/tensorflow/compiler/xla/reference_util_test.cc index c53351ca93e..f839ac019df 100644 --- a/tensorflow/compiler/xla/reference_util_test.cc +++ b/tensorflow/compiler/xla/reference_util_test.cc @@ -23,9 +23,9 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/padding.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/ptr_util.h" +#include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/platform/test.h" namespace xla { namespace { @@ -52,9 +52,9 @@ class ReferenceUtilTest : public ::testing::Test { TEST_F(ReferenceUtilTest, TransposeArray2D) { auto result = ReferenceUtil::TransposeArray2D(*matrix_); - auto result_literal = LiteralUtil::CreateR2FromArray2D(*result); + auto actual_literal = LiteralUtil::CreateR2FromArray2D(*result); LiteralTestUtil::ExpectR2Near({{1.f, 4.f}, {2.f, 5.f}, {3.f, 6.f}}, - *result_literal, ErrorSpec(0.0001)); + *actual_literal, ErrorSpec(0.0001)); } TEST_F(ReferenceUtilTest, MatmulArray2D) { @@ -62,32 +62,32 @@ TEST_F(ReferenceUtilTest, MatmulArray2D) { {7.f, 8.f}, {9.f, 10.f}, {11.f, 12.f}, }); auto result = ReferenceUtil::MatmulArray2D(*matrix_, rhs); - auto result_literal = LiteralUtil::CreateR2FromArray2D(*result); + auto actual_literal = LiteralUtil::CreateR2FromArray2D(*result); LiteralTestUtil::ExpectR2Near({{58.f, 64.f}, {139.f, 154.f}}, - *result_literal, ErrorSpec(0.0001)); + *actual_literal, ErrorSpec(0.0001)); } TEST_F(ReferenceUtilTest, ReduceToColArray2D) { auto add = [](float lhs, float rhs) { return lhs + rhs; }; auto result = ReferenceUtil::ReduceToColArray2D(*matrix_, 0.0f, add); - auto result_literal = LiteralUtil::CreateR1(*result); - LiteralTestUtil::ExpectR1Near({6.f, 15.f}, *result_literal, + auto actual_literal = LiteralUtil::CreateR1(*result); + LiteralTestUtil::ExpectR1Near({6.f, 15.f}, *actual_literal, ErrorSpec(0.0001)); } TEST_F(ReferenceUtilTest, ReduceToRowArray2D) { auto add = [](float lhs, float rhs) { return lhs + rhs; }; auto result = ReferenceUtil::ReduceToRowArray2D(*matrix_, 0.0f, add); - auto result_literal = LiteralUtil::CreateR1(*result); - LiteralTestUtil::ExpectR1Near({5.f, 7.f, 9.f}, *result_literal, + auto actual_literal = LiteralUtil::CreateR1(*result); + LiteralTestUtil::ExpectR1Near({5.f, 7.f, 9.f}, *actual_literal, ErrorSpec(0.0001)); } TEST_F(ReferenceUtilTest, MapArray2D) { auto identity = [](float value) { return log(exp(value)); }; auto result = ReferenceUtil::MapArray2D(*matrix_, identity); - auto result_literal = LiteralUtil::CreateR2FromArray2D(*result); - LiteralTestUtil::ExpectR2NearArray2D(*matrix_, *result_literal, + auto actual_literal = LiteralUtil::CreateR2FromArray2D(*result); + LiteralTestUtil::ExpectR2NearArray2D(*matrix_, *actual_literal, ErrorSpec(0.0001)); } @@ -96,9 +96,9 @@ TEST_F(ReferenceUtilTest, MapWithIndexArray2D) { return value + row + col; }; auto result = ReferenceUtil::MapWithIndexArray2D(*matrix_, add_index); - auto result_literal = LiteralUtil::CreateR2FromArray2D(*result); + auto actual_literal = LiteralUtil::CreateR2FromArray2D(*result); LiteralTestUtil::ExpectR2Near({{1.f, 3.f, 5.f}, {5.f, 7.f, 9.f}}, - *result_literal, ErrorSpec(0.0001)); + *actual_literal, ErrorSpec(0.0001)); } TEST_F(ReferenceUtilTest, MapArray4D) { @@ -107,11 +107,11 @@ TEST_F(ReferenceUtilTest, MapArray4D) { input->FillWithMultiples(1.0f); auto multiply_by_two = [](float value) { return 2 * value; }; auto result = ReferenceUtil::MapArray4D(*input, multiply_by_two); - auto result_literal = LiteralUtil::CreateR4FromArray4D(*result); + auto actual_literal = LiteralUtil::CreateR4FromArray4D(*result); Array4D expected(/*planes=*/2, /*depth=*/3, /*height=*/4, /*width=*/5); 
expected.FillWithMultiples(2.0f); - LiteralTestUtil::ExpectR4NearArray4D(expected, *result_literal, + LiteralTestUtil::ExpectR4NearArray4D(expected, *actual_literal, ErrorSpec(0.0001)); } @@ -124,11 +124,11 @@ TEST_F(ReferenceUtilTest, MapWithIndexArray4D) { return value - (3 * 4 * 5 * plane + 4 * 5 * depth + 5 * height + width); }; auto result = ReferenceUtil::MapWithIndexArray4D(*input, subtract_index); - auto result_literal = LiteralUtil::CreateR4FromArray4D(*result); + auto actual_literal = LiteralUtil::CreateR4FromArray4D(*result); Array4D expected(/*planes=*/2, /*depth=*/3, /*height=*/4, /*width=*/5); expected.Fill(0.0f); - LiteralTestUtil::ExpectR4NearArray4D(expected, *result_literal, + LiteralTestUtil::ExpectR4NearArray4D(expected, *actual_literal, ErrorSpec(0.0001)); } @@ -302,5 +302,17 @@ TEST_F(ReferenceUtilTest, ConvGeneralDimensionsWithValidPadding) { ErrorSpec(0.0001)); } +TEST_F(ReferenceUtilTest, ApplyElementwise2D) { + Array2D a({{1, 2}, {3, 4}}); + Array2D b({{10, 20}, {30, 40}}); + Array2D c({{100, 200}, {300, 400}}); + + auto actual = ReferenceUtil::ApplyElementwise2D( + [](float x, float y, float z) { return 100 * x + 10 * y + z; }, a, b, c); + auto actual_literal = LiteralUtil::CreateR2FromArray2D(*actual); + LiteralTestUtil::ExpectR2Near({{300.f, 600.f}, {900.f, 1200.f}}, + *actual_literal, ErrorSpec(0.0001)); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index 4a59ce2f17e..0687368b83d 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -21,6 +21,14 @@ xla_proto_library( deps = ["//tensorflow/compiler/xla:xla_data_proto"], ) +xla_proto_library( + name = "hlo_proto", + srcs = ["hlo.proto"], + deps = [ + "//tensorflow/compiler/xla:xla_data_proto", + ], +) + # Filegroup used to collect source files for dependency checking. 
filegroup( name = "c_srcs", @@ -48,10 +56,12 @@ cc_library( cc_test( name = "shape_inference_test", + size = "small", srcs = ["shape_inference_test.cc"], deps = [ ":shape_inference", "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", @@ -61,11 +71,49 @@ cc_test( cc_test( name = "hlo_opcode_test", + size = "small", srcs = ["hlo_opcode_test.cc"], deps = [ ":hlo", + "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:types", - "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + +cc_library( + name = "hlo_evaluator", + srcs = ["hlo_evaluator.cc"], + hdrs = ["hlo_evaluator.h"], + deps = [ + ":hlo", + ":hlo_query", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/core:lib", + ], +) + +cc_test( + name = "hlo_evaluator_test", + size = "small", + srcs = ["hlo_evaluator_test.cc"], + deps = [ + ":hlo", + ":hlo_evaluator", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/tests:literal_test_util", + "//tensorflow/core:lib", "//tensorflow/core:test_main", ], ) @@ -88,6 +136,8 @@ cc_library( "hlo_opcode.h", ], deps = [ + ":hlo_module_config", + ":hlo_proto", ":name_uniquer", ":versioned_computation_handle", "//tensorflow/compiler/xla:literal_util", @@ -105,10 +155,34 @@ cc_library( ], ) +cc_library( + name = "hlo_matchers", + testonly = 1, + srcs = ["hlo_matchers.cc"], + hdrs = ["hlo_matchers.h"], + deps = [ + ":hlo", + "//tensorflow/compiler/xla:test", + "//tensorflow/core:test_main", + ], +) + +cc_test( + name = "hlo_matchers_test", + size = "small", + srcs = ["hlo_matchers_test.cc"], + deps = [ + ":hlo_matchers", + "//tensorflow/compiler/xla:shape_util", + ], +) + cc_library( name = "versioned_computation_handle", + srcs = ["versioned_computation_handle.cc"], hdrs = ["versioned_computation_handle.h"], deps = [ + "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", ], @@ -116,14 +190,82 @@ cc_library( cc_test( name = "hlo_instruction_test", + size = "small", srcs = ["hlo_instruction_test.cc"], deps = [ ":hlo", "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:protobuf_util", "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:util", - "//tensorflow/core:test_main", + "//tensorflow/compiler/xla/tests:hlo_test_base", + ], +) + +cc_library( + name = "call_graph", + srcs = ["call_graph.cc"], + hdrs = ["call_graph.h"], + deps = [ + ":hlo", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:util", + "//tensorflow/core:lib", + ], +) + +cc_test( + name = "call_graph_test", + size = "small", + srcs = ["call_graph_test.cc"], + deps = [ + ":call_graph", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla:test_helpers", + 
"//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/core:test", + ], +) + +cc_library( + name = "flatten_call_graph", + srcs = ["flatten_call_graph.cc"], + hdrs = ["flatten_call_graph.h"], + deps = [ + ":call_graph", + ":hlo", + ":hlo_pass", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:util", + "//tensorflow/core:lib", + ], +) + +cc_test( + name = "flatten_call_graph_test", + size = "small", + srcs = ["flatten_call_graph_test.cc"], + deps = [ + ":call_graph", + ":flatten_call_graph", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla:test_helpers", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/core:test", ], ) @@ -143,10 +285,30 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/legacy_flags:user_computation_flags", "//tensorflow/core:lib", ], ) +cc_test( + name = "user_computation_test", + size = "small", + srcs = ["user_computation_test.cc"], + deps = [ + ":hlo_matchers", + ":user_computation", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla:test_helpers", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/legacy_flags:user_computation_flags", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/core:test", + ], +) + cc_library( name = "platform_util", srcs = ["platform_util.cc"], @@ -170,6 +332,7 @@ cc_library( ":compiler", ":device_memory_allocator", ":platform_util", + ":pool", ":transfer_manager", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", @@ -199,10 +362,10 @@ cc_library( ":device_memory_allocator", ":executable", ":execution_tracker", + ":gpu_transfer_manager", ":hlo", ":hlo_cost_analysis", ":hlo_execution_profile", - ":hlo_graph_dumper", ":hlo_module_config", ":platform_util", ":session_proto", @@ -222,7 +385,6 @@ cc_library( "//tensorflow/compiler/xla/legacy_flags:service_flags", "//tensorflow/compiler/xla/service/cpu:cpu_compiler", "//tensorflow/core:lib", - "//tensorflow/core:regexp_internal", "//tensorflow/core:stream_executor_no_cuda", ], alwayslink = 1, @@ -254,6 +416,29 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/legacy_flags:service_flags", + "//tensorflow/core:lib", + "//tensorflow/core:stream_executor_no_cuda", + ], +) + +cc_library( + name = "compile_only_service", + srcs = ["compile_only_service.cc"], + hdrs = ["compile_only_service.h"], + deps = [ + ":backend", + ":compiler", + ":computation_layout", + ":computation_tracker", + ":platform_util", + ":service", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", ], @@ 
-272,7 +457,7 @@ cc_library( cc_library( name = "gpu_plugin", deps = [ - ":generic_transfer_manager", + ":gpu_transfer_manager", ":service", "//tensorflow/compiler/xla/service/gpu:gpu_compiler", "//tensorflow/core:stream_executor_no_cuda", @@ -301,23 +486,31 @@ cc_library( cc_library( name = "executable", srcs = ["executable.cc"], - hdrs = ["executable.h"], + hdrs = [ + "executable.h", + "service_executable_run_options.h", + ], deps = [ ":computation_layout", ":device_memory_allocator", ":hlo", + ":hlo_cost_analysis", ":hlo_execution_profile", - ":hlo_module_config", + ":hlo_graph_dumper", + ":pool", ":session_proto", ":shaped_buffer", ":versioned_computation_handle", "//tensorflow/compiler/xla:executable_run_options", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/legacy_flags:service_flags", "//tensorflow/core:lib", + "//tensorflow/core:regexp_internal", "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/stream_executor", ], ) @@ -329,6 +522,7 @@ cc_library( ":executable", ":hlo", ":hlo_module_config", + ":logical_buffer", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", @@ -342,6 +536,7 @@ cc_library( srcs = ["transfer_manager.cc"], hdrs = ["transfer_manager.h"], deps = [ + "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", @@ -378,6 +573,7 @@ cc_library( hdrs = ["execution_tracker.h"], deps = [ ":backend", + ":pool", "//tensorflow/compiler/xla:executable_run_options", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:util", @@ -393,6 +589,7 @@ cc_library( hdrs = ["computation_tracker.h"], deps = [ ":hlo", + ":hlo_module_config", ":session_proto", ":user_computation", ":versioned_computation_handle", @@ -435,6 +632,32 @@ cc_library( ], ) +cc_library( + name = "liveness_util", + srcs = ["liveness_util.cc"], + hdrs = ["liveness_util.h"], + deps = [ + ":hlo", + ":logical_buffer", + ":tuple_points_to_analysis", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", + ], +) + +cc_test( + name = "liveness_util_test", + size = "small", + srcs = ["liveness_util_test.cc"], + deps = [ + ":hlo", + ":liveness_util", + ":tuple_points_to_analysis", + "//tensorflow/compiler/xla/tests:hlo_test_base", + ], +) + cc_library( name = "buffer_liveness", srcs = [ @@ -446,6 +669,7 @@ cc_library( deps = [ ":hlo", ":hlo_ordering", + ":liveness_util", ":logical_buffer", ":tuple_points_to_analysis", "//tensorflow/compiler/xla:shape_util", @@ -459,10 +683,10 @@ cc_library( cc_test( name = "buffer_liveness_test", + size = "small", srcs = ["buffer_liveness_test.cc"], deps = [ ":buffer_liveness", - ":cpu_plugin", ":hlo", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:types", @@ -484,6 +708,8 @@ cc_library( deps = [ ":buffer_liveness", ":hlo", + ":hlo_ordering", + ":hlo_proto", ":logical_buffer", ":tuple_points_to_analysis", "//tensorflow/compiler/xla:shape_util", @@ -494,38 +720,68 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/legacy_flags:buffer_assignment_flags", "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", ], ) cc_test( name = "buffer_assignment_test", + size = "small", srcs = ["buffer_assignment_test.cc"], deps = [ ":buffer_assignment", + 
":call_graph", ":computation_tracker", - ":cpu_plugin", + ":copy_insertion", + ":flatten_call_graph", ":hlo", + ":hlo_ordering", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/core:lib", + ], +) + +cc_test( + name = "heap_simulator_test", + size = "small", + srcs = ["heap_simulator_test.cc"], + deps = [ + ":hlo", + ":hlo_ordering", + ":logical_buffer", + ":tuple_points_to_analysis", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/core:lib", "//tensorflow/core:test_main", ], ) +# The hlo_ordering library contains both hlo_ordering and heap_simulator because +# they are mutually dependent. cc_library( name = "hlo_ordering", srcs = [ + "heap_simulator.cc", "hlo_ordering.cc", ], hdrs = [ + "heap_simulator.h", "hlo_ordering.h", ], deps = [ + ":call_graph", ":hlo", + ":hlo_proto", + ":liveness_util", ":logical_buffer", ":tuple_points_to_analysis", "//tensorflow/compiler/xla:shape_util", @@ -539,16 +795,15 @@ cc_library( cc_test( name = "hlo_ordering_test", + size = "small", srcs = ["hlo_ordering_test.cc"], deps = [ - ":cpu_plugin", ":hlo", ":hlo_ordering", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/core:test_main", ], ) @@ -577,11 +832,12 @@ cc_library( cc_test( name = "instruction_fusion_test", + size = "small", srcs = ["instruction_fusion_test.cc"], deps = [ + ":hlo_matchers", ":instruction_fusion", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/core:test_main", ], ) @@ -607,21 +863,21 @@ cc_library( cc_test( name = "algebraic_simplifier_test", + size = "small", srcs = ["algebraic_simplifier_test.cc"], deps = [ ":algebraic_simplifier", - ":cpu_plugin", ":hlo", + ":hlo_matchers", ":hlo_pass", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla:test_helpers", + "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/core:lib", - "//tensorflow/core:test_main", ], ) @@ -631,26 +887,31 @@ cc_library( hdrs = ["reshape_mover.h"], deps = [ ":hlo_pass", + "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:util", + "//tensorflow/core:lib", ], ) cc_test( name = "reshape_mover_test", + size = "small", srcs = ["reshape_mover_test.cc"], deps = [ ":hlo", + ":hlo_matchers", ":reshape_mover", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/core:lib", - "//tensorflow/core:test_main", ], ) @@ -670,14 +931,15 @@ cc_library( cc_test( name = "inliner_test", + size = "small", srcs = ["inliner_test.cc"], deps = [ - ":cpu_plugin", ":hlo", + ":hlo_matchers", ":inliner", 
"//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla:test_helpers", + "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/tests:hlo_test_base", @@ -726,8 +988,30 @@ cc_library( alwayslink = True, # Contains per-platform transfer manager registration ) +cc_library( + name = "gpu_transfer_manager", + srcs = ["gpu_transfer_manager.cc"], + hdrs = ["gpu_transfer_manager.h"], + deps = [ + ":generic_transfer_manager", + ":transfer_manager", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/service/gpu:infeed_manager", + "//tensorflow/core:lib", + "//tensorflow/core:stream_executor_no_cuda", + ], + alwayslink = True, # Contains per-platform transfer manager registration +) + cc_test( name = "transfer_manager_test", + size = "small", srcs = ["transfer_manager_test.cc"], deps = [ ":cpu_transfer_manager", @@ -765,6 +1049,7 @@ cc_library( cc_test( name = "hlo_cost_analysis_test", + size = "small", srcs = ["hlo_cost_analysis_test.cc"], deps = [ ":computation_tracker", @@ -805,12 +1090,14 @@ cc_library( cc_test( name = "hlo_computation_test", + size = "small", srcs = ["hlo_computation_test.cc"], deps = [ - ":cpu_plugin", ":hlo", + ":hlo_matchers", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/core:test_main", @@ -834,18 +1121,17 @@ cc_binary( cc_test( name = "hlo_module_test", + size = "small", srcs = ["hlo_module_test.cc"], deps = [ - ":cpu_plugin", ":hlo", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla:test_helpers", + "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/core:lib", - "//tensorflow/core:test_main", ], ) @@ -859,10 +1145,101 @@ cc_library( ], deps = [ ":hlo", + ":hlo_proto", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + ], +) + +cc_library( + name = "hlo_dataflow_analysis", + srcs = [ + "hlo_dataflow_analysis.cc", + ], + hdrs = [ + "hlo_dataflow_analysis.h", + ], + deps = [ + ":call_graph", + ":hlo", + ":liveness_util", + "//tensorflow/compiler/xla:shape_tree", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + ], +) + +cc_test( + name = "hlo_dataflow_analysis_test", + size = "small", + srcs = ["hlo_dataflow_analysis_test.cc"], + deps = [ + ":hlo", + ":hlo_dataflow_analysis", + ":hlo_matchers", + ":instruction_fusion", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla:test_helpers", + 
"//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + +cc_library( + name = "hlo_alias_analysis", + srcs = [ + "hlo_alias_analysis.cc", + ], + hdrs = [ + "hlo_alias_analysis.h", + ], + deps = [ + ":call_graph", + ":hlo", + ":hlo_dataflow_analysis", + ":logical_buffer", + "//tensorflow/compiler/xla:shape_tree", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/core:lib", + ], +) + +cc_test( + name = "hlo_alias_analysis_test", + srcs = ["hlo_alias_analysis_test.cc"], + deps = [ + ":hlo", + ":hlo_alias_analysis", + ":hlo_matchers", + ":instruction_fusion", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla:test_helpers", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", ], ) @@ -889,12 +1266,16 @@ cc_library( cc_test( name = "tuple_points_to_analysis_test", + size = "small", srcs = ["tuple_points_to_analysis_test.cc"], deps = [ ":hlo", + ":hlo_matchers", + ":instruction_fusion", ":tuple_points_to_analysis", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/tests:hlo_test_base", @@ -956,6 +1337,7 @@ cc_library( ":buffer_liveness", ":hlo", ":hlo_pass", + ":liveness_util", ":logical_buffer", ":tuple_points_to_analysis", "//tensorflow/compiler/xla:status_macros", @@ -968,19 +1350,19 @@ cc_library( cc_test( name = "copy_insertion_test", + size = "small", srcs = ["copy_insertion_test.cc"], deps = [ - ":buffer_liveness", ":copy_insertion", - ":cpu_plugin", ":hlo", + ":hlo_matchers", ":tuple_points_to_analysis", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/core:test_main", ], ) @@ -1000,11 +1382,59 @@ cc_library( ], ) +cc_library( + name = "hlo_verifier", + srcs = ["hlo_verifier.cc"], + hdrs = ["hlo_verifier.h"], + deps = [":hlo_pass"], +) + +cc_library( + name = "hlo_rematerialization", + srcs = ["hlo_rematerialization.cc"], + hdrs = ["hlo_rematerialization.h"], + deps = [ + ":buffer_liveness", + ":call_graph", + ":flatten_call_graph", + ":hlo", + ":hlo_cost_analysis", + ":hlo_dce", + ":hlo_ordering", + ":liveness_util", + ":logical_buffer", + ":tuple_points_to_analysis", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", + "//tensorflow/core:lib", + ], +) + +cc_test( + name = "hlo_rematerialization_test", + size = "small", + srcs = ["hlo_rematerialization_test.cc"], + deps = [ + ":hlo", + ":hlo_matchers", + ":hlo_ordering", + ":hlo_rematerialization", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/tests:hlo_test_base", 
+ "//tensorflow/core:test_main", + ], +) + cc_test( name = "hlo_dce_test", + size = "small", srcs = ["hlo_dce_test.cc"], deps = [ - ":cpu_plugin", ":hlo", ":hlo_dce", "//tensorflow/compiler/xla:literal_util", @@ -1016,29 +1446,30 @@ cc_test( "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:test_utils", "//tensorflow/core:lib", - "//tensorflow/core:test_main", + "//tensorflow/core:test", ], ) cc_test( name = "layout_assignment_test", + size = "small", srcs = ["layout_assignment_test.cc"], deps = [ ":algebraic_simplifier", ":computation_layout", - ":cpu_plugin", ":hlo", + ":hlo_matchers", ":layout_assignment", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_layout", "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:test_utils", "//tensorflow/core:lib", - "//tensorflow/core:test_main", ], ) @@ -1073,7 +1504,6 @@ cc_library( "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla/legacy_flags:hlo_pass_pipeline_flags", "//tensorflow/core:lib", ], ) @@ -1087,7 +1517,6 @@ cc_library( ":hlo_pass", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", @@ -1096,11 +1525,12 @@ cc_library( cc_test( name = "hlo_cse_test", + size = "small", srcs = ["hlo_cse_test.cc"], deps = [ - ":cpu_plugin", ":hlo", ":hlo_cse", + ":hlo_matchers", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:types", @@ -1110,7 +1540,42 @@ cc_test( "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:test_utils", "//tensorflow/core:lib", - "//tensorflow/core:test_main", + ], +) + +cc_library( + name = "hlo_constant_folding", + srcs = ["hlo_constant_folding.cc"], + hdrs = ["hlo_constant_folding.h"], + deps = [ + ":hlo", + ":hlo_evaluator", + ":hlo_pass", + ":hlo_query", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:types", + "//tensorflow/core:lib", + ], +) + +cc_test( + name = "hlo_constant_folding_test", + size = "small", + srcs = ["hlo_constant_folding_test.cc"], + deps = [ + ":hlo", + ":hlo_constant_folding", + ":hlo_matchers", + ":hlo_pass", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:literal_test_util", + "//tensorflow/core:lib", ], ) @@ -1159,6 +1624,7 @@ cc_library( ":computation_layout", "//tensorflow/compiler/xla:shape_layout", "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla:xla_proto", "//tensorflow/core:lib", @@ -1188,6 +1654,7 @@ cc_library( cc_test( name = "hlo_subcomputation_unification_test", + size = "small", srcs = ["hlo_subcomputation_unification_test.cc"], deps = [ ":hlo", @@ -1196,7 +1663,33 @@ cc_test( "//tensorflow/compiler/xla:shape_util", 
"//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:test_utils", - "//tensorflow/core:test_main", + ], +) + +cc_library( + name = "hlo_tfgraph_builder", + srcs = ["hlo_tfgraph_builder.cc"], + hdrs = ["hlo_tfgraph_builder.h"], + visibility = ["//tensorflow/compiler/xla/tools:__pkg__"], + deps = [ + ":hlo", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + ], +) + +cc_test( + name = "hlo_tfgraph_builder_test", + size = "small", + srcs = ["hlo_tfgraph_builder_test.cc"], + deps = [ + ":hlo_tfgraph_builder", + "//tensorflow/compiler/xla/client:computation_builder", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/core:protos_all_cc", ], ) @@ -1209,9 +1702,11 @@ cc_library( deps = [ ":hlo", ":hlo_execution_profile", + ":hlo_tfgraph_builder", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:window_util", "//tensorflow/compiler/xla/legacy_flags:hlo_graph_dumper_flags", "//tensorflow/core:lib", ], @@ -1225,7 +1720,9 @@ cc_library( deps = [ ":hlo", ":hlo_pass", + "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/service/gpu:ir_emission_utils", "//tensorflow/core:lib", ], @@ -1233,20 +1730,57 @@ cc_library( cc_test( name = "transpose_folding_test", + size = "small", srcs = ["transpose_folding_test.cc"], deps = [ ":hlo", + ":shape_inference", ":transpose_folding", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/service/gpu:ir_emission_utils", "//tensorflow/core:lib", "//tensorflow/core:test_main", ], ) +cc_library( + name = "pool", + hdrs = ["pool.h"], + deps = [ + "//tensorflow/compiler/xla:util", + "//tensorflow/core:lib", + ], +) + +cc_test( + name = "pool_test", + size = "small", + srcs = ["pool_test.cc"], + deps = [ + ":pool", + "//tensorflow/compiler/xla:test_helpers", + "//tensorflow/core:test_main", + ], +) + +cc_library( + name = "hlo_proto_util", + srcs = ["hlo_proto_util.cc"], + hdrs = ["hlo_proto_util.h"], + deps = [ + ":buffer_assignment", + ":hlo", + ":hlo_proto", + "//tensorflow/compiler/xla:status", + "//tensorflow/core:lib", + ], +) + # ----------------------------------------------------------------------------- filegroup( diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index d35c6d6adb0..754ac0c68dc 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -51,6 +51,16 @@ bool IsLiteralWithValue(const HloInstruction* operand, int8 value) { LiteralUtil::IsAll(operand->literal(), value); } +bool IsAll(const HloInstruction* op, int8 value) { + if (IsLiteralWithValue(op, value)) { + return true; + } + if (op->opcode() == HloOpcode::kBroadcast && IsAll(op->operand(0), value)) { + return true; + } + return false; +} + // Returns whether the given transpose produces a result which is bit-wise // identical to its operand and thus may be replaced with a bitcast. 
bool TransposeIsBitcast(const HloInstruction* transpose) { @@ -76,6 +86,24 @@ bool ReshapeIsBitcast( return ShapeUtil::ReshapeIsBitcast(operand->shape(), reshape->shape()) && valid_bitcast_callback(operand->shape(), reshape->shape()); } + +// Adds a scalar computation to the module to enable optimizations with dot +// converting into reduction. +HloComputation* CreateScalarBinaryComputation(HloModule* module, + PrimitiveType primitive_type, + HloOpcode opcode) { + HloComputation::Builder b("scalar computation"); + auto scalar_lhs = b.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShape(F32, {}), "scalar lhs")); + auto scalar_rhs = b.AddInstruction(HloInstruction::CreateParameter( + 1, ShapeUtil::MakeShape(F32, {}), "scalar rhs")); + auto scalar_op = b.AddInstruction( + HloInstruction::CreateBinary(ShapeUtil::MakeShape(primitive_type, {}), + opcode, scalar_lhs, scalar_rhs)); + HloComputation* scalar_computation = + module->AddEmbeddedComputation(b.Build(scalar_op)); + return scalar_computation; +} } // namespace // AlgebraicSimplifierVisitor traverses the HLO computation and reduces certain @@ -94,6 +122,10 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault { Status HandleBroadcast(HloInstruction* broadcast) override; + Status HandleConcatenate( + HloInstruction* concatenate, + tensorflow::gtl::ArraySlice operands) override; + Status HandleCopy(HloInstruction* copy, HloInstruction* operand) override; Status HandleConvert(HloInstruction* convert, @@ -105,6 +137,9 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault { Status HandleDivide(HloInstruction* divide, HloInstruction* lhs, HloInstruction* rhs) override; + Status HandleDot(HloInstruction* dot, HloInstruction* lhs, + HloInstruction* rhs) override; + Status HandleGetTupleElement(HloInstruction* get_tuple_element, HloInstruction* operand) override; @@ -125,7 +160,19 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault { tensorflow::gtl::ArraySlice dimensions, HloComputation* function) override; + Status HandleReduceWindow(HloInstruction* reduce_window, + HloInstruction* operand, const Window& window, + HloComputation* function) override; + + Status HandleReverse(HloInstruction* reverse, + HloInstruction* operand) override; Status HandleSlice(HloInstruction* slice, HloInstruction* operand) override; + Status HandleDynamicSlice(HloInstruction* slice, HloInstruction* operand, + HloInstruction* start_indices) override; + Status HandleDynamicUpdateSlice(HloInstruction* dynamic_update_slice, + HloInstruction* operand, + HloInstruction* update, + HloInstruction* start_indices) override; Status HandleTranspose(HloInstruction* transpose) override; @@ -144,15 +191,18 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault { // Runs the visitor on a computation. 
 static bool Run(
     HloComputation* computation, bool is_layout_sensitive,
-    AlgebraicSimplifier::ValidBitcastCallback valid_bitcast_callback);
+    AlgebraicSimplifier::ValidBitcastCallback valid_bitcast_callback,
+    bool enable_dot_simplification);

 private:
  explicit AlgebraicSimplifierVisitor(
      HloComputation* computation, bool is_layout_sensitive,
-     AlgebraicSimplifier::ValidBitcastCallback valid_bitcast_callback)
+     AlgebraicSimplifier::ValidBitcastCallback valid_bitcast_callback,
+     bool enable_dot_simplification)
      : computation_(computation),
        is_layout_sensitive_(is_layout_sensitive),
-       valid_bitcast_callback_(std::move(valid_bitcast_callback)) {}
+       valid_bitcast_callback_(std::move(valid_bitcast_callback)),
+       enable_dot_simplification_(enable_dot_simplification) {}

  // Convenience method for replacing an instruction with a bitcast.
  void ReplaceWithBitcast(HloInstruction* instruction);
@@ -179,6 +229,34 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
                                   HloInstruction* operand, HloInstruction* max,
                                   HloInstruction* max_operand);

+  // A Reshape or Broadcast that feeds an element-wise operation with a unique
+  // non-scalar operand can sink to after the operation.
+  StatusOr<bool> TryToSinkReshapeOrBroadcastAfterOpWithUniqueNonScalarOperand(
+      HloInstruction* reshape_or_broadcast);
+
+  // Replaces the existing HLO instruction old_instruction with
+  // new_instruction, and marks the optimizer status as changed.
+  // Returns the Status representing the result of the replace operation.
+  Status ReplaceWithNewInstruction(
+      HloInstruction* old_instruction,
+      std::unique_ptr<HloInstruction> new_instruction) {
+    TF_RETURN_IF_ERROR(computation_->ReplaceWithNewInstruction(
+        old_instruction, std::move(new_instruction)));
+    changed_ = true;
+    return Status::OK();
+  }
+
+  // Replaces the existing HLO instruction old_instruction with
+  // new_instruction, and marks the optimizer status as changed.
+  // Returns the Status representing the result of the replace operation.
+  Status ReplaceInstruction(HloInstruction* old_instruction,
+                            HloInstruction* new_instruction) {
+    TF_RETURN_IF_ERROR(
+        computation_->ReplaceInstruction(old_instruction, new_instruction));
+    changed_ = true;
+    return Status::OK();
+  }
+
  // Current HloComputation instance the AlgebraicSimplifierVisitor is
  // traversing.
  HloComputation* computation_;
@@ -191,13 +269,18 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
  // Callback used to determine if a bitcast is possible.
  AlgebraicSimplifier::ValidBitcastCallback valid_bitcast_callback_;
+
+  // Disable dot simplification on platforms where it causes a slowdown.
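+  // (Illustrative note: when this is false, HandleDot returns immediately
+  // and all kDot instructions are left untouched.)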
+ bool enable_dot_simplification_; }; bool AlgebraicSimplifierVisitor::Run( HloComputation* computation, bool is_layout_sensitive, - AlgebraicSimplifier::ValidBitcastCallback valid_bitcast_callback) { + AlgebraicSimplifier::ValidBitcastCallback valid_bitcast_callback, + bool enable_dot_simplification) { AlgebraicSimplifierVisitor visitor(computation, is_layout_sensitive, - std::move(valid_bitcast_callback)); + std::move(valid_bitcast_callback), + enable_dot_simplification); TF_CHECK_OK(computation->Accept(&visitor)); return visitor.changed_; } @@ -222,8 +305,7 @@ void AlgebraicSimplifierVisitor::ReplaceWithBitcast( auto bitcast = computation_->AddInstruction( HloInstruction::CreateUnary(instruction->shape(), HloOpcode::kBitcast, instruction->mutable_operand(0))); - TF_CHECK_OK(computation_->ReplaceInstruction(instruction, bitcast)); - changed_ = true; + TF_CHECK_OK(ReplaceInstruction(instruction, bitcast)); } bool AlgebraicSimplifierVisitor::ReplaceInstructionIfSameShape( @@ -231,9 +313,7 @@ bool AlgebraicSimplifierVisitor::ReplaceInstructionIfSameShape( if (!SameShape(old_instruction, new_instruction)) { return false; } - TF_CHECK_OK( - computation_->ReplaceInstruction(old_instruction, new_instruction)); - changed_ = true; + TF_CHECK_OK(ReplaceInstruction(old_instruction, new_instruction)); return true; } @@ -242,12 +322,12 @@ Status AlgebraicSimplifierVisitor::HandleAdd(HloInstruction* add, HloInstruction* rhs) { // A + 0 => A VLOG(10) << "trying transform [A + 0 => A]: " << add->ToString(); - if (IsLiteralWithValue(rhs, 0) && ReplaceInstructionIfSameShape(add, lhs)) { + if (IsAll(rhs, 0) && ReplaceInstructionIfSameShape(add, lhs)) { return Status::OK(); } // 0 + A => A VLOG(10) << "trying transform [0 + A => A]: " << add->ToString(); - if (IsLiteralWithValue(lhs, 0) && ReplaceInstructionIfSameShape(add, rhs)) { + if (IsAll(lhs, 0) && ReplaceInstructionIfSameShape(add, rhs)) { return Status::OK(); } @@ -256,17 +336,91 @@ Status AlgebraicSimplifierVisitor::HandleAdd(HloInstruction* add, Status AlgebraicSimplifierVisitor::HandleCopy(HloInstruction* copy, HloInstruction* operand) { + // If a copy feeds a copy, make it a single copy. + if (operand->opcode() == HloOpcode::kCopy) { + return ReplaceWithNewInstruction( + copy, HloInstruction::CreateUnary(copy->shape(), HloOpcode::kCopy, + operand->operands()[0])); + } // All copies can be eliminated (assuming layout constraints are satisified). ReplaceInstructionIfSameShape(copy, operand); return Status::OK(); } +Status AlgebraicSimplifierVisitor::HandleConcatenate( + HloInstruction* concatenate, + tensorflow::gtl::ArraySlice operands) { + if (operands.size() == 1) { + // Unary concatenates are useless. + ReplaceInstructionIfSameShape(concatenate, operands[0]); + return Status::OK(); + } + // Filter out and remove empty operands. 
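+  // Illustrative example (hypothetical shapes): concatenate(f32[0], f32[5],
+  // f32[0]) along dimension 0 has two empty operands; after filtering, only
+  // the f32[5] operand remains and the concatenate is replaced by it.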
+ std::vector nonempty_operands; + for (HloInstruction* operand : operands) { + if (!ShapeUtil::HasZeroElements(operand->shape())) { + nonempty_operands.push_back(operand); + } + } + if (nonempty_operands.size() < operands.size()) { + HloInstruction* replacement; + if (nonempty_operands.empty()) { + replacement = operands[0]; + } else if (nonempty_operands.size() == 1) { + replacement = nonempty_operands[0]; + } else { + replacement = + computation_->AddInstruction(concatenate->CloneWithNewOperands( + concatenate->shape(), nonempty_operands)); + } + VLOG(10) << "trying to replace " << concatenate->ToString() << " with " + << replacement->ToString(); + ReplaceInstructionIfSameShape(concatenate, replacement); + } else if (operands.size() == 2) { + // A binary concat with a broadcasted scalar as an operand can be converted + // into a pad which is simpler to fold into other operations. + bool is_effective_low_pad = + operands[0]->opcode() == HloOpcode::kBroadcast && + ShapeUtil::IsScalar(operands[0]->operand(0)->shape()); + bool is_effective_high_pad = + operands[1]->opcode() == HloOpcode::kBroadcast && + ShapeUtil::IsScalar(operands[1]->operand(0)->shape()); + if (!is_effective_low_pad && !is_effective_high_pad) { + return Status::OK(); + } + PaddingConfig padding_config; + for (int64 dim = 0; dim < ShapeUtil::Rank(operands[0]->shape()); ++dim) { + auto padding_config_dim = padding_config.add_dimensions(); + padding_config_dim->set_edge_padding_high(0); + padding_config_dim->set_edge_padding_low(0); + padding_config_dim->set_interior_padding(0); + if (dim == concatenate->concatenate_dimension()) { + if (is_effective_low_pad) { + padding_config_dim->set_edge_padding_low( + operands[0]->shape().dimensions(dim)); + } else { + padding_config_dim->set_edge_padding_high( + operands[1]->shape().dimensions(dim)); + } + } + } + int64 operand_to_pad = is_effective_low_pad ? 1 : 0; + int64 pad_value_operand = is_effective_low_pad ? 
0 : 1;
+    HloInstruction* pad =
+        computation_->AddInstruction(HloInstruction::CreatePad(
+            concatenate->shape(), operands[operand_to_pad],
+            operands[pad_value_operand]->mutable_operand(0), padding_config));
+    return ReplaceInstruction(concatenate, pad);
+  }
+  return Status::OK();
+}
+
 Status AlgebraicSimplifierVisitor::HandleSubtract(HloInstruction* sub,
                                                   HloInstruction* lhs,
                                                   HloInstruction* rhs) {
   // A - 0 => A
   VLOG(10) << "trying transform [A - 0 => A]: " << sub->ToString();
-  if (IsLiteralWithValue(rhs, 0) && ReplaceInstructionIfSameShape(sub, lhs)) {
+  if (IsAll(rhs, 0) && ReplaceInstructionIfSameShape(sub, lhs)) {
     return Status::OK();
   }
@@ -278,8 +432,7 @@ Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide,
                                                 HloInstruction* rhs) {
   // A/1 => A
   VLOG(10) << "trying transform [A/1 => A]: " << divide->ToString();
-  if (IsLiteralWithValue(rhs, 1) &&
-      ReplaceInstructionIfSameShape(divide, lhs)) {
+  if (IsAll(rhs, 1) && ReplaceInstructionIfSameShape(divide, lhs)) {
     return Status::OK();
   }
@@ -290,8 +443,7 @@ Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide,
       computation_->AddInstruction(HloInstruction::CreateBinary(
           divide->shape(), HloOpcode::kSubtract, lhs->mutable_operand(0),
           rhs->mutable_operand(0)));
-  changed_ = true;
-  return computation_->ReplaceWithNewInstruction(
+  return ReplaceWithNewInstruction(
       divide, HloInstruction::CreateUnary(divide->shape(), HloOpcode::kExp,
                                           subtract));
 }
@@ -299,19 +451,148 @@ Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide,
   return Status::OK();
 }

+Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot,
+                                             HloInstruction* lhs,
+                                             HloInstruction* rhs) {
+  if (!enable_dot_simplification_) {
+    return Status::OK();
+  }
+  // Only optimize F32 dot operations where the dot, rhs and lhs are rank 2 or
+  // below.
+  if (dot->shape().element_type() != F32 || ShapeUtil::Rank(lhs->shape()) > 2 ||
+      ShapeUtil::Rank(rhs->shape()) > 2 || ShapeUtil::Rank(dot->shape()) > 2) {
+    return Status::OK();
+  }
+
+  // Replace a zero element dot with a broadcast of the constant 0.
+  if (ShapeUtil::HasZeroElements(dot->shape()) ||
+      ShapeUtil::HasZeroElements(lhs->shape()) ||
+      ShapeUtil::HasZeroElements(rhs->shape())) {
+    auto zero = computation_->AddInstruction(
+        HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0f)));
+    return ReplaceWithNewInstruction(
+        dot, HloInstruction::CreateBroadcast(dot->shape(), zero, {}));
+  }
+
+  // Simplify dot(transpose(a), transpose(b)) to transpose(dot(b,a)).
+  if (lhs->IsRank2Transpose() && rhs->IsRank2Transpose()) {
+    auto new_dot = computation_->AddInstruction(HloInstruction::CreateBinary(
+        ShapeUtil::PermuteDimensions({1, 0}, dot->shape()), HloOpcode::kDot,
+        rhs->mutable_operand(0), lhs->mutable_operand(0)));
+    return ReplaceWithNewInstruction(
+        dot, HloInstruction::CreateTranspose(dot->shape(), new_dot, {1, 0}));
+  }
+
+  // Simplify outer product into multiply with implicit broadcasting.
+  //
+  // A dot(a[M, 1], b[1, N]) = multiply(a[M, 1], b[1, N])
+  if (ShapeUtil::Rank(rhs->shape()) == 2 && rhs->shape().dimensions(0) == 1) {
+    return ReplaceWithNewInstruction(
+        dot, HloInstruction::CreateBinary(dot->shape(), HloOpcode::kMultiply,
+                                          lhs, rhs));
+  }
+
+  // The following graph transformations take Dots where at least one input is
+  // a vector or has a degenerate dimension and convert them into a multiply
+  // and a reduce. This should enable more fusion than leaving the nodes as
+  // Dot operations.
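+  // Illustrative example (hypothetical shapes): dot(a[1, 3], b[3, 4]) becomes
+  // reshape(reduce(multiply(broadcast(reshape(a, [3]), {0}), b), add, {0}),
+  // [1, 4]), which fuses with neighboring element-wise ops better than an
+  // opaque Dot.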
+ + // Strength reduce dot(a[K] , b[K]) = + // reshape(result.shape, + // reduce_sum(multiply(a, b), {0})) + if (ShapeUtil::Rank(rhs->shape()) == 1 && + ShapeUtil::Rank(lhs->shape()) == 1) { + auto multiply = computation_->AddInstruction(HloInstruction::CreateBinary( + rhs->shape(), HloOpcode::kMultiply, lhs, rhs)); + HloComputation* add_reduce_computation = CreateScalarBinaryComputation( + computation_->parent(), F32, HloOpcode::kAdd); + auto zero = computation_->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f))); + auto reduce = computation_->AddInstruction(HloInstruction::CreateReduce( + ShapeUtil::MakeShape(dot->shape().element_type(), {}), multiply, zero, + {0}, add_reduce_computation)); + return ReplaceWithNewInstruction( + dot, HloInstruction::CreateReshape(dot->shape(), reduce)); + } + + // Strength reduce dot(a[1, K], b) = + // reshape(result.shape, + // reduce_sum( + // multiply(broadcast(reshape(a, [K]), {0}), b), + // {0}) + // ) + // ) + if (ShapeUtil::Rank(lhs->shape()) == 1 || + (ShapeUtil::Rank(lhs->shape()) == 2 && lhs->shape().dimensions(0) == 1)) { + auto new_lhs = computation_->AddInstruction(HloInstruction::CreateReshape( + ShapeUtil::MakeShape(lhs->shape().element_type(), + {ShapeUtil::ElementsIn(lhs->shape())}), + lhs)); + HloComputation* add_reduce_computation = CreateScalarBinaryComputation( + computation_->parent(), F32, HloOpcode::kAdd); + auto zero = computation_->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f))); + HloInstruction* reduce; + if (ShapeUtil::Rank(rhs->shape()) == 1) { + auto multiply = computation_->AddInstruction(HloInstruction::CreateBinary( + rhs->shape(), HloOpcode::kMultiply, new_lhs, rhs)); + reduce = computation_->AddInstruction(HloInstruction::CreateReduce( + ShapeUtil::MakeShape(dot->shape().element_type(), {}), multiply, zero, + {0}, add_reduce_computation)); + } else { + new_lhs = computation_->AddInstruction( + HloInstruction::CreateBroadcast(rhs->shape(), new_lhs, {0})); + auto multiply = computation_->AddInstruction(HloInstruction::CreateBinary( + rhs->shape(), HloOpcode::kMultiply, new_lhs, rhs)); + + reduce = computation_->AddInstruction(HloInstruction::CreateReduce( + ShapeUtil::MakeShape(dot->shape().element_type(), + {rhs->shape().dimensions(1)}), + multiply, zero, {0}, add_reduce_computation)); + } + return ReplaceWithNewInstruction( + dot, HloInstruction::CreateReshape(dot->shape(), reduce)); + } + + // Strength reduce dot(a, b[K, 1]) = + // reshape(result.shape, + // reduce_sum(multiply(a, broadcast(reshape([K],b), {1})), {0}) + // ) + if (ShapeUtil::Rank(rhs->shape()) == 1 || + (ShapeUtil::Rank(rhs->shape()) == 2 && rhs->shape().dimensions(1) == 1)) { + auto new_rhs = computation_->AddInstruction(HloInstruction::CreateReshape( + ShapeUtil::MakeShape(rhs->shape().element_type(), + {ShapeUtil::ElementsIn(rhs->shape())}), + rhs)); + new_rhs = computation_->AddInstruction( + HloInstruction::CreateBroadcast(lhs->shape(), new_rhs, {1})); + auto multiply = computation_->AddInstruction(HloInstruction::CreateBinary( + lhs->shape(), HloOpcode::kMultiply, lhs, new_rhs)); + HloComputation* add_reduce_computation = CreateScalarBinaryComputation( + computation_->parent(), F32, HloOpcode::kAdd); + auto zero = computation_->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f))); + auto reduce = computation_->AddInstruction(HloInstruction::CreateReduce( + ShapeUtil::MakeShape(dot->shape().element_type(), + {lhs->shape().dimensions(0)}), + multiply, zero, {1}, 
add_reduce_computation)); + return ReplaceWithNewInstruction( + dot, HloInstruction::CreateReshape(dot->shape(), reduce)); + } + return Status::OK(); +} + Status AlgebraicSimplifierVisitor::HandleMultiply(HloInstruction* multiply, HloInstruction* lhs, HloInstruction* rhs) { // A*1 => A VLOG(10) << "trying transform [A*1 => A]: " << multiply->ToString(); - if (IsLiteralWithValue(rhs, 1) && - ReplaceInstructionIfSameShape(multiply, lhs)) { + if (IsAll(rhs, 1) && ReplaceInstructionIfSameShape(multiply, lhs)) { return Status::OK(); } // 1*A => A VLOG(10) << "trying transform [1*A => A]: " << multiply->ToString(); - if (IsLiteralWithValue(lhs, 1) && - ReplaceInstructionIfSameShape(multiply, rhs)) { + if (IsAll(lhs, 1) && ReplaceInstructionIfSameShape(multiply, rhs)) { return Status::OK(); } return Status::OK(); @@ -383,8 +664,9 @@ std::pair> ReshapeLeavesDimensionsUnmodified( return std::make_pair(true, output_dim_indices); } -// Returns true if the output of "instruction" is a permutation of the elements -// of "operand". Precondition: "operand" is an operand of "instruction". +// Returns true if the output of "instruction" is a permutation of the +// elements of "operand". Precondition: "operand" is an operand of +// "instruction". bool OutputIsPermutationOfOperandElements(HloInstruction* instruction, HloInstruction* operand) { DCHECK(!instruction->OperandIndices(operand).empty()); @@ -432,13 +714,25 @@ Status AlgebraicSimplifierVisitor::HandleBroadcast(HloInstruction* broadcast) { ShapeUtil::ElementsIn(operand->shape())) { VLOG(10) << "transform broadcast(X) -> reshape(X) where " "n(broadcast(X)) == n(X)"; - changed_ = true; - return computation_->ReplaceWithNewInstruction( + return ReplaceWithNewInstruction( broadcast, HloInstruction::CreateReshape(broadcast->shape(), operand)); } - // A broadcast of a reshape which merely inserts 1-sized dimensions can elide - // its operand. + // A degenerate broadcast that has the same input and output rank can be + // converted into a transpose. + if (ShapeUtil::Rank(broadcast->shape()) == + ShapeUtil::Rank(operand->shape()) && + ShapeUtil::ElementsIn(broadcast->shape()) == + ShapeUtil::ElementsIn(operand->shape())) { + VLOG(10) << "transform broadcast(X) -> transpose(X) where " + "n(broadcast(X)) == n(X)"; + return ReplaceWithNewInstruction( + broadcast, HloInstruction::CreateTranspose(broadcast->shape(), operand, + broadcast->dimensions())); + } + + // A broadcast of a reshape which merely inserts 1-sized dimensions can + // elide its operand. { bool merely_inserts_or_deletes_1_sized_dimensions; std::vector inserted_indices, deleted_indices; @@ -452,14 +746,22 @@ Status AlgebraicSimplifierVisitor::HandleBroadcast(HloInstruction* broadcast) { for (auto inserted_index : inserted_indices) { dims.erase(dims.begin() + inserted_index); } - changed_ = true; - return computation_->ReplaceWithNewInstruction( + return ReplaceWithNewInstruction( broadcast, HloInstruction::CreateBroadcast(broadcast->shape(), operand->mutable_operand(0), dims)); } } + // A Broadcast that feeds a unary element-wise operation can sink the + // broadcast after the unary element-wise operation. 
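+  // Illustrative example: with a scalar s, exp(broadcast(s, f32[2,3])) can
+  // become broadcast(exp(s), f32[2,3]), evaluating exp once instead of six
+  // times.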
+ TF_ASSIGN_OR_RETURN( + changed_, + TryToSinkReshapeOrBroadcastAfterOpWithUniqueNonScalarOperand(broadcast)); + if (changed_) { + return Status::OK(); + } + // A scalar broadcast feeding an instruction which only permutes (reshape, // transpose, sort, reverse) or selects a subset of operand elements (slice, // dynamic slice) can be replaced with a broadcast directly to the output @@ -487,65 +789,6 @@ Status AlgebraicSimplifierVisitor::HandleBroadcast(HloInstruction* broadcast) { return Status::OK(); } -template -static std::unique_ptr ConvertIfTypesMatch( - const Literal& src_literal) { - CHECK_EQ(primitive_src_type, src_literal.shape().element_type()); - - return HloInstruction::CreateConstant( - LiteralUtil::Convert::type, - typename primitive_util::PrimitiveTypeToNative< - primitive_dest_type>::type>(src_literal)); -} - -template -static std::unique_ptr ConvertIfDestTypeMatches( - const Literal& src_literal, PrimitiveType primitive_dest_type) { - switch (primitive_dest_type) { -#define CONVERT_IF_TYPES_MATCH(type) \ - case (type): \ - return ConvertIfTypesMatch(src_literal); - CONVERT_IF_TYPES_MATCH(PRED) - CONVERT_IF_TYPES_MATCH(S8) - CONVERT_IF_TYPES_MATCH(S32) - CONVERT_IF_TYPES_MATCH(S64) - CONVERT_IF_TYPES_MATCH(U8) - CONVERT_IF_TYPES_MATCH(U32) - CONVERT_IF_TYPES_MATCH(U64) - CONVERT_IF_TYPES_MATCH(F32) - CONVERT_IF_TYPES_MATCH(F64) -#undef CONVERT_IF_TYPES_MATCH - // Other types are not yet supported. - default: - LOG(FATAL) << "Unimplemented: ConvertIfDestTypeMatches for type " - << PrimitiveType_Name(src_literal.shape().element_type()); - } -} - -static std::unique_ptr ConvertIfSrcTypeMatches( - const Literal& src_literal, PrimitiveType primitive_dest_type) { - switch (src_literal.shape().element_type()) { -#define CONVERT_IF_DEST_TYPE_MATCHES(type) \ - case (type): \ - return ConvertIfDestTypeMatches<(type)>(src_literal, primitive_dest_type); - CONVERT_IF_DEST_TYPE_MATCHES(PRED) - CONVERT_IF_DEST_TYPE_MATCHES(S8) - CONVERT_IF_DEST_TYPE_MATCHES(S32) - CONVERT_IF_DEST_TYPE_MATCHES(S64) - CONVERT_IF_DEST_TYPE_MATCHES(U8) - CONVERT_IF_DEST_TYPE_MATCHES(U32) - CONVERT_IF_DEST_TYPE_MATCHES(U64) - CONVERT_IF_DEST_TYPE_MATCHES(F32) - CONVERT_IF_DEST_TYPE_MATCHES(F64) -#undef CONVERT_IF_DEST_TYPE_MATCHES - // Other types are not yet supported. - default: - LOG(FATAL) << "Unimplemented: ConvertIfSrcTypeMatches for type " - << PrimitiveType_Name(src_literal.shape().element_type()); - } -} - // A conversion to the same element type as the operand is a nop and can be // removed. A conversion of a constant can be simplified by making a new // constant. @@ -554,16 +797,7 @@ Status AlgebraicSimplifierVisitor::HandleConvert(HloInstruction* convert, PrimitiveType src_type = operand->shape().element_type(); PrimitiveType dest_type = convert->shape().element_type(); if (src_type == dest_type) { - changed_ = true; - return computation_->ReplaceInstruction(convert, operand); - } - if (operand->opcode() == HloOpcode::kConstant) { - const Literal& src_literal = operand->literal(); - std::unique_ptr new_constant = - ConvertIfSrcTypeMatches(src_literal, dest_type); - changed_ = true; - return computation_->ReplaceWithNewInstruction(convert, - std::move(new_constant)); + return ReplaceInstruction(convert, operand); } return Status::OK(); } @@ -626,6 +860,7 @@ Status AlgebraicSimplifierVisitor::HandlePad(HloInstruction* pad) { // Second, construct the slice instruction to perform the negative padding. 
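+  // Illustrative example (hypothetical shape): pad(f32[10],
+  // edge_padding_low=-2) becomes a pad with no negative amounts followed by
+  // slice(start=2, end=10), yielding f32[8].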
std::vector start_indices; std::vector end_indices; + std::vector strides; for (int64 i = 0; i < pad->padding_config().dimensions_size(); ++i) { const PaddingConfig::PaddingConfigDimension& padding_dimension = pad->padding_config().dimensions(i); @@ -639,18 +874,19 @@ Status AlgebraicSimplifierVisitor::HandlePad(HloInstruction* pad) { } start_indices.push_back(start); end_indices.push_back(end); + strides.push_back(1); } // Verify that the slice shape matches the pad shape. TF_ASSIGN_OR_RETURN(Shape inferred_slice_shape, ShapeInference::InferSliceShape( - nonzero_pad_shape, start_indices, end_indices)); + nonzero_pad_shape, start_indices, end_indices, + strides)); TF_RET_CHECK(ShapeUtil::Compatible(inferred_slice_shape, pad->shape())); std::unique_ptr slice = HloInstruction::CreateSlice( - pad->shape(), nonzero_pad, start_indices, end_indices); - changed_ = true; - return computation_->ReplaceWithNewInstruction(pad, std::move(slice)); + pad->shape(), nonzero_pad, start_indices, end_indices, strides); + return ReplaceWithNewInstruction(pad, std::move(slice)); } return Status::OK(); @@ -660,7 +896,7 @@ Status AlgebraicSimplifierVisitor::HandlePower(HloInstruction* power, HloInstruction* lhs, HloInstruction* rhs) { VLOG(10) << "trying transform [pow(A, 0) => 1]: " << power->ToString(); - if (IsLiteralWithValue(rhs, 0)) { + if (IsAll(rhs, 0)) { auto one = HloInstruction::CreateConstant(LiteralUtil::CloneToUnique( LiteralUtil::One(power->shape().element_type()))); std::unique_ptr ones; @@ -670,51 +906,122 @@ Status AlgebraicSimplifierVisitor::HandlePower(HloInstruction* power, ones = HloInstruction::CreateBroadcast( power->shape(), computation_->AddInstruction(std::move(one)), {}); } - changed_ = true; - return computation_->ReplaceWithNewInstruction(power, std::move(ones)); - return Status::OK(); + return ReplaceWithNewInstruction(power, std::move(ones)); } VLOG(10) << "trying transform [pow(A, 1) => A]: " << power->ToString(); - if (IsLiteralWithValue(rhs, 1) && ReplaceInstructionIfSameShape(power, lhs)) { + if (IsAll(rhs, 1) && ReplaceInstructionIfSameShape(power, lhs)) { return Status::OK(); } VLOG(10) << "trying transform [pow(A, 2) => A*A]: " << power->ToString(); - if (IsLiteralWithValue(rhs, 2)) { - changed_ = true; - return computation_->ReplaceWithNewInstruction( + if (IsAll(rhs, 2)) { + return ReplaceWithNewInstruction( power, HloInstruction::CreateBinary(power->shape(), HloOpcode::kMultiply, lhs, lhs)); } VLOG(10) << "trying transform [pow(A, -1) => 1/A]: " << power->ToString(); - if (IsLiteralWithValue(rhs, -1)) { + if (IsAll(rhs, -1)) { auto* one = computation_->AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CloneToUnique( LiteralUtil::One(rhs->shape().element_type())))); - changed_ = true; - return computation_->ReplaceWithNewInstruction( + return ReplaceWithNewInstruction( power, HloInstruction::CreateBinary(power->shape(), HloOpcode::kDivide, one, lhs)); } return Status::OK(); } +StatusOr AlgebraicSimplifierVisitor:: + TryToSinkReshapeOrBroadcastAfterOpWithUniqueNonScalarOperand( + HloInstruction* reshape_or_broadcast) { + bool changed = false; + HloInstruction* operand = reshape_or_broadcast->mutable_operand(0); + for (HloInstruction* user : reshape_or_broadcast->users()) { + if (user->user_count() == 0 && user != computation_->root_instruction()) { + continue; + } + // Do not move reshapes or broadcasts past copies since the shape the copy + // will operate on will change. 
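+    // (For instance, copy(broadcast(x)) may exist solely to change layout;
+    // rewriting it as broadcast(copy(x)) would alter the shape the copy
+    // converts.)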
+    if (user->opcode() == HloOpcode::kCopy) {
+      continue;
+    }
+    // Do not change the shape of fusion nodes in case there are already
+    // multiple shapes inside the fusion node.
+    if (user->opcode() == HloOpcode::kFusion) {
+      continue;
+    }
+    if (!user->IsElementwise()) {
+      continue;
+    }
+
+    int64 reshape_or_broadcast_operand_index = -1;
+    // Find the unique non-scalar operand or continue if there isn't one.
+    int64 scalar_count = 0;
+    for (int64 i = 0; i < user->operand_count(); ++i) {
+      if (ShapeUtil::IsScalar(user->operand(i)->shape())) {
+        ++scalar_count;
+      } else {
+        reshape_or_broadcast_operand_index = i;
+      }
+    }
+    if (scalar_count != user->operand_count() - 1) {
+      continue;
+    }
+    CHECK_EQ(user->operand(reshape_or_broadcast_operand_index),
+             reshape_or_broadcast);
+    std::vector<HloInstruction*> new_user_operands = user->operands();
+    new_user_operands[reshape_or_broadcast_operand_index] = operand;
+    auto new_user = computation_->AddInstruction(user->CloneWithNewOperands(
+        ShapeUtil::MakeShape(user->shape().element_type(),
+                             AsInt64Slice(operand->shape().dimensions())),
+        new_user_operands));
+    HloInstruction* new_reshape_or_broadcast = nullptr;
+    if (reshape_or_broadcast->opcode() == HloOpcode::kReshape) {
+      new_reshape_or_broadcast =
+          computation_->AddInstruction(HloInstruction::CreateReshape(
+              ShapeUtil::MakeShape(
+                  user->shape().element_type(),
+                  AsInt64Slice(reshape_or_broadcast->shape().dimensions())),
+              new_user));
+    } else {
+      TF_RET_CHECK(reshape_or_broadcast->opcode() == HloOpcode::kBroadcast);
+      new_reshape_or_broadcast =
+          computation_->AddInstruction(HloInstruction::CreateBroadcast(
+              ShapeUtil::MakeShape(
+                  user->shape().element_type(),
+                  AsInt64Slice(reshape_or_broadcast->shape().dimensions())),
+              new_user, reshape_or_broadcast->dimensions()));
+    }
+    TF_RETURN_IF_ERROR(
+        computation_->ReplaceUsesOfInstruction(user, new_reshape_or_broadcast));
+    changed = true;
+  }
+  return changed;
+}
+
 Status AlgebraicSimplifierVisitor::HandleReshape(HloInstruction* reshape) {
   auto operand = reshape->mutable_operand(0);

+  // Reshape directly to an empty constant if the shape contains a
+  // zero-element dimension.
+  if (ShapeUtil::HasZeroElements(reshape->shape())) {
+    auto empty_constant = HloInstruction::CreateConstant(
+        LiteralUtil::CreateFromShape(reshape->shape()));
+
+    return ReplaceWithNewInstruction(reshape, std::move(empty_constant));
+  }
+
   // Delete no-op reshapes, i.e. where shape = operand shape.
   if (SameShape(reshape, operand)) {
     VLOG(10) << "deleting no-op reshape";
-    changed_ = true;
-    return computation_->ReplaceInstruction(reshape, operand);
+    return ReplaceInstruction(reshape, operand);
   }

   // Merge reshapes.
   if (HloOpcode::kReshape == operand->opcode()) {
-    changed_ = true;
-    return computation_->ReplaceWithNewInstruction(
+    return ReplaceWithNewInstruction(
         reshape, HloInstruction::CreateReshape(reshape->shape(),
                                                operand->mutable_operand(0)));
   }
@@ -723,8 +1030,7 @@ Status AlgebraicSimplifierVisitor::HandleReshape(HloInstruction* reshape) {
     auto opt_dims = ReshapeLeavesDimensionsUnmodified(
         reshape, reshape->operand(0)->dimensions());
     if (opt_dims.first) {
-      changed_ = true;
-      return computation_->ReplaceWithNewInstruction(
+      return ReplaceWithNewInstruction(
           reshape, HloInstruction::CreateBroadcast(
                        reshape->shape(),
                        reshape->mutable_operand(0)->mutable_operand(0),
@@ -732,6 +1038,15 @@
     }
   }

+  // A Reshape that feeds a unary element-wise operation can sink the
+  // reshape after the unary element-wise operation.
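+  // Illustrative example (hypothetical shapes): negate(reshape(x: f32[6] ->
+  // f32[2,3])) can become reshape(negate(x): f32[6] -> f32[2,3]), moving the
+  // element-wise op next to x's producer.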
+  TF_ASSIGN_OR_RETURN(
+      changed_,
+      TryToSinkReshapeOrBroadcastAfterOpWithUniqueNonScalarOperand(reshape));
+  if (changed_) {
+    return Status::OK();
+  }
+
   // Make this a bitcast if possible.
   if (is_layout_sensitive_ &&
       ReshapeIsBitcast(reshape, valid_bitcast_callback_)) {
@@ -742,6 +1057,20 @@ Status AlgebraicSimplifierVisitor::HandleReshape(HloInstruction* reshape) {
   return Status::OK();
 }

+Status AlgebraicSimplifierVisitor::HandleReverse(HloInstruction* reverse,
+                                                 HloInstruction* operand) {
+  // When all the dimensions to reverse are trivial (i.e. the bound is 1),
+  // there is nothing to be done.
+  auto dim_is_one = [&](int64 i) -> bool {
+    return reverse->shape().dimensions(i) == 1;
+  };
+  if (std::all_of(reverse->dimensions().begin(), reverse->dimensions().end(),
+                  dim_is_one)) {
+    return ReplaceInstruction(reverse, operand);
+  }
+  return Status::OK();
+}
+
 Status AlgebraicSimplifierVisitor::HandleSlice(HloInstruction* slice,
                                                HloInstruction* operand) {
   // Delete no-op slices, i.e. where shape = operand shape.
@@ -751,34 +1080,176 @@ Status AlgebraicSimplifierVisitor::HandleSlice(HloInstruction* slice,
   return Status::OK();
 }

+Status AlgebraicSimplifierVisitor::HandleDynamicSlice(
+    HloInstruction* dynamic_slice, HloInstruction* operand,
+    HloInstruction* start_indices) {
+  if (ShapeUtil::IsScalar(dynamic_slice->shape())) {
+    return ReplaceInstruction(dynamic_slice, operand);
+  }
+  return Status::OK();
+}
+
+Status AlgebraicSimplifierVisitor::HandleDynamicUpdateSlice(
+    HloInstruction* dynamic_update_slice, HloInstruction* operand,
+    HloInstruction* update, HloInstruction* start_indices) {
+  // DynamicUpdateSlice on a scalar just passes through the update argument.
+  if (ShapeUtil::IsScalar(dynamic_update_slice->shape())) {
+    return ReplaceInstruction(dynamic_update_slice, update);
+  }
+  return Status::OK();
+}
+
 Status AlgebraicSimplifierVisitor::HandleReduce(
     HloInstruction* reduce, HloInstruction* arg, HloInstruction* init_value,
     tensorflow::gtl::ArraySlice<int64> dimensions, HloComputation* function) {
+  if (ShapeUtil::HasZeroElements(arg->shape()) ||
+      ShapeUtil::HasZeroElements(reduce->shape())) {
+    return ReplaceWithNewInstruction(
+        reduce,
+        HloInstruction::CreateBroadcast(reduce->shape(), init_value, {}));
+  }
+  // A Transpose feeding a reduce can simply permute the reduction dimensions
+  // field.
+  if (arg->opcode() == HloOpcode::kTranspose) {
+    auto transpose_dimensions = arg->dimensions();
+    std::vector<int64> new_reduce_dimensions;
+    for (auto dim : dimensions) {
+      new_reduce_dimensions.push_back(transpose_dimensions[dim]);
+    }
+    return ReplaceWithNewInstruction(
+        reduce, HloInstruction::CreateReduce(
+                    reduce->shape(), arg->mutable_operand(0), init_value,
+                    new_reduce_dimensions, function));
+  }
+
+  // A reshape that collapses multiple dimensions into a dimension being
+  // reduced can just reduce all of those dimensions instead of doing a
+  // collapsing reshape before a reduction.
+  if (arg->opcode() == HloOpcode::kReshape) {
+    std::vector<std::pair<int64, int64>> unmodified_dims =
+        ShapeUtil::DimensionsUnmodifiedByReshape(arg->operand(0)->shape(),
+                                                 arg->shape());
+    std::vector<bool> arg_dim_in_output(ShapeUtil::Rank(arg->shape()), true);
+    std::vector<bool> arg_dim_unmodified(ShapeUtil::Rank(arg->shape()), false);
+    for (auto dim : dimensions) {
+      arg_dim_in_output[dim] = false;
+    }
+    for (auto dim_pair : unmodified_dims) {
+      arg_dim_unmodified[dim_pair.second] = true;
+    }
+    // The goal is to verify that all dimensions that are not removed in the
+    // reduce are unmodified by the reshape.
For example: + // reduce(reshape([A,B*C], a[A,B,C]),[1]) = reduce(a[A, B, C], [1, 2]) + bool can_move_reshape_into_reduce = true; + for (int64 i = 0; i < arg_dim_in_output.size(); ++i) { + if (arg_dim_in_output[i] && !arg_dim_unmodified[i]) { + can_move_reshape_into_reduce = false; + } + } + if (can_move_reshape_into_reduce) { + changed_ = true; + std::unordered_set dimensions_not_to_reduce; + for (auto dim_pair : unmodified_dims) { + if (arg_dim_in_output[dim_pair.second]) { + dimensions_not_to_reduce.insert(dim_pair.first); + } + } + std::vector new_reduce_dimensions; + for (int64 i = 0; i < ShapeUtil::Rank(arg->operand(0)->shape()); ++i) { + if (dimensions_not_to_reduce.count(i) == 0) { + new_reduce_dimensions.push_back(i); + } + } + return ReplaceWithNewInstruction( + reduce, HloInstruction::CreateReduce( + reduce->shape(), arg->mutable_operand(0), init_value, + new_reduce_dimensions, function)); + } + } if (ShapeUtil::ElementsIn(reduce->shape()) == - ShapeUtil::ElementsIn(arg->shape())) { + ShapeUtil::ElementsIn(arg->shape()) || + ShapeUtil::HasZeroElements(arg->shape())) { auto reshape = computation_->AddInstruction( HloInstruction::CreateReshape(reduce->shape(), arg)); - changed_ = true; - return computation_->ReplaceWithNewInstruction( + return ReplaceWithNewInstruction( reduce, HloInstruction::CreateMap(reduce->shape(), {reshape, init_value}, function)); } return Status::OK(); } +Status AlgebraicSimplifierVisitor::HandleReduceWindow( + HloInstruction* reduce_window, HloInstruction* operand, + const Window& window, HloComputation* function) { + VLOG(10) << "Considering folding Pad: " << operand->ToString() + << "\ninto reduce-window: " << reduce_window->ToString(); + + // This optimization folds a pad op into reduce_window. + if (operand->opcode() != HloOpcode::kPad) { + VLOG(10) << "Not folding pad into reduce-window as there is no pad."; + return Status::OK(); + } + + // Do not fold interior padding into ReduceWindow since the backends do not + // support it. + const PaddingConfig& pad_config = operand->padding_config(); + if (HasInteriorPadding(pad_config)) { + VLOG(10) << "Not folding pad into reduce-window due to interior padding."; + return Status::OK(); + } + + // If reduce_window already has padding, the pad value of the pad op and the + // init value of reduce_window must match to allow folding the pad. + const HloInstruction* pad_value = operand->operand(1); + const HloInstruction* reduce_init_value = reduce_window->operand(1); + if (pad_value != reduce_init_value) { + // The pad value is usually a constant, so we handle that case and do not + // try to get more fancy about proving equivalence in cases beyond that. + if (pad_value->opcode() != HloOpcode::kConstant || + reduce_init_value->opcode() != HloOpcode::kConstant || + !LiteralUtil::Equal(pad_value->literal(), + reduce_init_value->literal())) { + VLOG(10) << "Not folding pad into reduce-window due to different pad " + "values."; + return Status::OK(); + } + } + + // Carry out the folding of the pad into reduce_window. 
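+  // Illustrative example: if the pad contributes edge_padding_low=1 on a
+  // dimension whose window dimension already has padding_low=1, the folded
+  // window gets padding_low=2 and the standalone pad instruction goes away.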
+ VLOG(10) << "Folding pad into reduce-window."; + Window new_window = window; + const int64 rank = ShapeUtil::Rank(reduce_window->shape()); + TF_RET_CHECK(pad_config.dimensions_size() == rank); + TF_RET_CHECK(window.dimensions_size() == rank); + for (int64 i = 0; i < rank; ++i) { + const auto& pad_dim = pad_config.dimensions(i); + auto& window_dim = *new_window.mutable_dimensions(i); + window_dim.set_padding_low(window_dim.padding_low() + + pad_dim.edge_padding_low()); + window_dim.set_padding_high(window_dim.padding_high() + + pad_dim.edge_padding_high()); + } + return ReplaceWithNewInstruction( + reduce_window, HloInstruction::CreateReduceWindow( + /*shape=*/reduce_window->shape(), + /*operand=*/operand->mutable_operand(0), + /*init_value=*/reduce_window->mutable_operand(1), + /*window=*/new_window, + /*reduce_computation=*/function)); +} + Status AlgebraicSimplifierVisitor::HandleTranspose(HloInstruction* transpose) { auto operand = transpose->mutable_operand(0); if (std::is_sorted(transpose->dimensions().begin(), transpose->dimensions().end())) { VLOG(10) << "deleting no-op transpose"; - changed_ = true; - return computation_->ReplaceInstruction(transpose, operand); + return ReplaceInstruction(transpose, operand); } if (HloOpcode::kTranspose == operand->opcode()) { - changed_ = true; - return computation_->ReplaceWithNewInstruction( + return ReplaceWithNewInstruction( transpose, HloInstruction::CreateTranspose( transpose->shape(), operand->mutable_operand(0), ComposePermutations(operand->dimensions(), @@ -805,7 +1276,9 @@ Status AlgebraicSimplifierVisitor::HandleConvolution( // bitcasts_ == true. // TODO(cwhipkey): b/31337498, make this layout insensitive. - if (!is_layout_sensitive_) return Status::OK(); + if (!is_layout_sensitive_) { + return Status::OK(); + } const ConvolutionDimensionNumbers& dnums = convolution->convolution_dimension_numbers(); @@ -905,9 +1378,7 @@ Status AlgebraicSimplifierVisitor::HandleConvolution( auto new_rhs = add_bitcast(new_filter_shape, rhs); auto dot = computation_->AddInstruction(HloInstruction::CreateBinary( dot_output_shape, HloOpcode::kDot, new_lhs, new_rhs)); - changed_ = true; - return computation_->ReplaceInstruction(convolution, - add_bitcast(convolution_shape, dot)); + return ReplaceInstruction(convolution, add_bitcast(convolution_shape, dot)); } bool AlgebraicSimplifierVisitor::TransformToClampIfSameShape( @@ -921,8 +1392,7 @@ bool AlgebraicSimplifierVisitor::TransformToClampIfSameShape( auto clamp = HloInstruction::CreateTernary(root->shape(), HloOpcode::kClamp, max_operand, operand, min_operand); - TF_CHECK_OK(computation_->ReplaceWithNewInstruction(root, std::move(clamp))); - changed_ = true; + TF_CHECK_OK(ReplaceWithNewInstruction(root, std::move(clamp))); return true; } @@ -995,12 +1465,20 @@ Status AlgebraicSimplifierVisitor::HandleMinimum(HloInstruction* minimum, StatusOr AlgebraicSimplifier::Run(HloModule* module) { XLA_VLOG_LINES(2, "AlgebraicSimplifier::Run(), before:\n" + module->ToString()); - bool changed = std::any_of( - module->computations().begin(), module->computations().end(), - [=](const std::unique_ptr& computation) { - return AlgebraicSimplifierVisitor::Run( - computation.get(), is_layout_sensitive_, valid_bitcast_callback_); - }); + bool changed = false; + // Make a copy of the computations because we may add computations to the + // module, invalidating iteration. 
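+  // (HandleDot may call AddEmbeddedComputation through
+  // CreateScalarBinaryComputation, which would invalidate iterators into
+  // module->computations() if we iterated over it directly.)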
+  std::vector<HloComputation*> computations;
+  for (auto& comp : module->computations()) {
+    computations.push_back(comp.get());
+  }
+  for (auto& comp : computations) {
+    if (AlgebraicSimplifierVisitor::Run(comp, is_layout_sensitive_,
+                                        valid_bitcast_callback_,
+                                        enable_dot_simplification_)) {
+      changed = true;
+    }
+  }
   XLA_VLOG_LINES(2,
                  "AlgebraicSimplifier::Run(), after:\n" + module->ToString());
   return changed;
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.h b/tensorflow/compiler/xla/service/algebraic_simplifier.h
index d10d1edc1d2..f8919f0caad 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.h
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.h
@@ -35,12 +35,14 @@ class AlgebraicSimplifier : public HloPassInterface {
   // If is_layout_sensitive is true, then the simplifier preserves layout during
   // transformation. Otherwise, layout is ignored. If valid_bitcast_callback
-  // returns true, then the pass will replace reshapes and tranposes with
+  // returns true, then the pass will replace reshapes and transposes with
   // bitcasts.
   AlgebraicSimplifier(bool is_layout_sensitive,
-                      ValidBitcastCallback valid_bitcast_callback)
+                      ValidBitcastCallback valid_bitcast_callback,
+                      bool enable_dot_simplification = true)
       : is_layout_sensitive_(is_layout_sensitive),
-        valid_bitcast_callback_(std::move(valid_bitcast_callback)) {}
+        valid_bitcast_callback_(std::move(valid_bitcast_callback)),
+        enable_dot_simplification_(enable_dot_simplification) {}
   ~AlgebraicSimplifier() override {}
   tensorflow::StringPiece name() const override { return "algsimp"; }
@@ -51,6 +53,9 @@
  private:
   bool is_layout_sensitive_;
   ValidBitcastCallback valid_bitcast_callback_;
+
+  // Enable dot simplification on platforms where it is profitable.
+  bool enable_dot_simplification_;
 };
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
index 8dd94e2c70c..e4368a7bb25 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
@@ -23,21 +23,25 @@ limitations under the License.
#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_matchers.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/hlo_pass_fix.h" #include "tensorflow/compiler/xla/shape_util.h" -#include "tensorflow/compiler/xla/test_helpers.h" +#include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/strings/str_util.h" +namespace op = xla::testing::opcode_matchers; + namespace xla { namespace { AlgebraicSimplifier::ValidBitcastCallback bitcasting_callback() { return [](const Shape&, const Shape&) { return true; }; } + AlgebraicSimplifier::ValidBitcastCallback non_bitcasting_callback() { return [](const Shape&, const Shape&) { return false; }; } @@ -55,7 +59,53 @@ TEST_F(AlgebraicSimplifierTest, AddZero) { builder.AddInstruction( HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, param0, zero)); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); + auto computation = module->AddEntryComputation(builder.Build()); + HloInstruction* root = computation->root_instruction(); + EXPECT_EQ(root->opcode(), HloOpcode::kAdd); + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + non_bitcasting_callback()); + ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); + root = computation->root_instruction(); + EXPECT_EQ(root, param0); +} + +TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR0Operand) { + Shape r2f32 = ShapeUtil::MakeShape(F32, {3, 2}); + HloComputation::Builder builder(TestName()); + HloInstruction* param0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, r2f32, "param0")); + HloInstruction* zero = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f))); + HloInstruction* bcast = builder.AddInstruction( + HloInstruction::CreateBroadcast(r2f32, zero, {0, 1})); + builder.AddInstruction( + HloInstruction::CreateBinary(r2f32, HloOpcode::kAdd, bcast, param0)); + + auto module = CreateNewModule(); + auto computation = module->AddEntryComputation(builder.Build()); + HloInstruction* root = computation->root_instruction(); + EXPECT_EQ(root->opcode(), HloOpcode::kAdd); + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + non_bitcasting_callback()); + ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); + root = computation->root_instruction(); + EXPECT_EQ(root, param0); +} + +TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR1Operand) { + Shape r2f32 = ShapeUtil::MakeShape(F32, {3, 2}); + HloComputation::Builder builder(TestName()); + HloInstruction* param0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, r2f32, "param0")); + HloInstruction* zero = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR1({0, 0, 0}))); + HloInstruction* bcast = + builder.AddInstruction(HloInstruction::CreateBroadcast(r2f32, zero, {1})); + builder.AddInstruction( + HloInstruction::CreateBinary(r2f32, HloOpcode::kAdd, bcast, param0)); + + auto module = CreateNewModule(); auto computation = module->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kAdd); @@ -77,7 +127,7 @@ TEST_F(AlgebraicSimplifierTest, SubZero) { builder.AddInstruction( 
HloInstruction::CreateBinary(r0f32, HloOpcode::kSubtract, param0, zero)); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); auto computation = module->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kSubtract); @@ -99,7 +149,7 @@ TEST_F(AlgebraicSimplifierTest, DivOneScalar) { HloInstruction* div = builder.AddInstruction( HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, param0, one)); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); auto computation = module->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root, div); @@ -121,7 +171,7 @@ TEST_F(AlgebraicSimplifierTest, DivOneArray) { HloInstruction* div = builder.AddInstruction( HloInstruction::CreateBinary(r2f32, HloOpcode::kDivide, param0, one)); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); auto computation = module->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root, div); @@ -149,7 +199,7 @@ TEST_F(AlgebraicSimplifierTest, SelectMakeTuple) { HloInstruction* add = builder.AddInstruction( HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, get, param2)); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); auto computation = module->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root, add); @@ -157,9 +207,7 @@ TEST_F(AlgebraicSimplifierTest, SelectMakeTuple) { non_bitcasting_callback()); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); root = computation->root_instruction(); - EXPECT_EQ(root, add); - EXPECT_EQ(root->operand(0), param1); - EXPECT_EQ(root->operand(1), param2); + EXPECT_THAT(root, op::Add(param1, param2)); } // Test that exp(A)/exp(B) is simplified to exp(A-B) @@ -177,19 +225,18 @@ TEST_F(AlgebraicSimplifierTest, ExpDiv) { builder.AddInstruction( HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, exp0, exp1)); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); auto computation = module->AddEntryComputation(builder.Build()); - HloInstruction* root = computation->root_instruction(); - EXPECT_EQ(root->opcode(), HloOpcode::kDivide); + + EXPECT_THAT(computation->root_instruction(), + op::Divide(op::Exp(param0), op::Exp(param1))); + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); - root = computation->root_instruction(); - EXPECT_EQ(root->opcode(), HloOpcode::kExp); - EXPECT_EQ(root->operand_count(), 1); - EXPECT_EQ(root->operand(0)->opcode(), HloOpcode::kSubtract); - EXPECT_EQ(root->operand(0)->operand(0), param0); - EXPECT_EQ(root->operand(0)->operand(1), param1); + + EXPECT_THAT(computation->root_instruction(), + op::Exp(op::Subtract(param0, param1))); } // Test that ln(exp(A)) is simplified to A @@ -203,16 +250,16 @@ TEST_F(AlgebraicSimplifierTest, LnExp) { builder.AddInstruction( HloInstruction::CreateUnary(r0f32, HloOpcode::kLog, exp0)); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); auto computation = module->AddEntryComputation(builder.Build()); - HloInstruction* root = computation->root_instruction(); - EXPECT_EQ(root->opcode(), HloOpcode::kLog); + + EXPECT_THAT(computation->root_instruction(), op::Log(op::Exp(param0))); + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, 
non_bitcasting_callback()); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); - root = computation->root_instruction(); - EXPECT_EQ(root->opcode(), HloOpcode::kParameter); - EXPECT_EQ(root, param0); + + EXPECT_EQ(computation->root_instruction(), param0); } // Test that ln(exp(A)/exp(B)) is simplified to A-B @@ -232,17 +279,17 @@ TEST_F(AlgebraicSimplifierTest, LnExpDiv) { builder.AddInstruction( HloInstruction::CreateUnary(r0f32, HloOpcode::kLog, div)); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); auto computation = module->AddEntryComputation(builder.Build()); - HloInstruction* root = computation->root_instruction(); - EXPECT_EQ(root->opcode(), HloOpcode::kLog); + + EXPECT_THAT(computation->root_instruction(), + op::Log(op::Divide(op::Exp(param0), op::Exp(param1)))); + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); - root = computation->root_instruction(); - EXPECT_EQ(root->opcode(), HloOpcode::kSubtract); - EXPECT_EQ(root->operand(0), param0); - EXPECT_EQ(root->operand(1), param1); + + EXPECT_THAT(computation->root_instruction(), op::Subtract(param0, param1)); } // Test that pow(A, 0) where A is a scalar is simplified to the scalar @@ -257,13 +304,17 @@ TEST_F(AlgebraicSimplifierTest, Pow0Scalar) { builder.AddInstruction( HloInstruction::CreateBinary(r0f32, HloOpcode::kPower, param0, zero)); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); auto computation = module->AddEntryComputation(builder.Build()); + + EXPECT_THAT(computation->root_instruction(), op::Power(param0, zero)); + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); + HloInstruction* root = computation->root_instruction(); - EXPECT_EQ(root->opcode(), HloOpcode::kConstant); + EXPECT_THAT(root, op::Constant()); EXPECT_EQ(LiteralUtil::GetFirstElement(root->literal()), 1); } @@ -278,13 +329,17 @@ TEST_F(AlgebraicSimplifierTest, Pow0Vector) { builder.AddInstruction( HloInstruction::CreateBinary(r1f32, HloOpcode::kPower, param0, zero)); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); auto computation = module->AddEntryComputation(builder.Build()); + + EXPECT_THAT(computation->root_instruction(), op::Power(param0, zero)); + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); + HloInstruction* root = computation->root_instruction(); - EXPECT_EQ(root->opcode(), HloOpcode::kBroadcast); + EXPECT_THAT(root, op::Broadcast()); EXPECT_TRUE(ShapeUtil::Equal(root->shape(), r1f32)) << ShapeUtil::HumanString(root->shape()); EXPECT_EQ(root->dimensions().size(), 0); @@ -304,14 +359,16 @@ TEST_F(AlgebraicSimplifierTest, Pow1) { builder.AddInstruction( HloInstruction::CreateBinary(r0f32, HloOpcode::kPower, param0, one)); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); auto computation = module->AddEntryComputation(builder.Build()); + + EXPECT_THAT(computation->root_instruction(), op::Power(param0, one)); + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); - HloInstruction* root = computation->root_instruction(); - EXPECT_EQ(root->opcode(), HloOpcode::kParameter); - EXPECT_EQ(root, param0); + + EXPECT_EQ(computation->root_instruction(), param0); } // Test 
that pow(A, 2) is simplified to A*A. @@ -325,15 +382,16 @@ TEST_F(AlgebraicSimplifierTest, Pow2) { builder.AddInstruction( HloInstruction::CreateBinary(r0f32, HloOpcode::kPower, param0, two)); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); auto computation = module->AddEntryComputation(builder.Build()); + + EXPECT_THAT(computation->root_instruction(), op::Power(param0, two)); + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); - HloInstruction* root = computation->root_instruction(); - EXPECT_EQ(root->opcode(), HloOpcode::kMultiply); - EXPECT_EQ(root->operand(0), param0); - EXPECT_EQ(root->operand(1), param0); + + EXPECT_THAT(computation->root_instruction(), op::Multiply(param0, param0)); } // Test that pow(A, -1) is simplified to 1/A. @@ -347,17 +405,19 @@ TEST_F(AlgebraicSimplifierTest, PowNegative1) { builder.AddInstruction(HloInstruction::CreateBinary(r0f32, HloOpcode::kPower, param0, negative_one)); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); auto computation = module->AddEntryComputation(builder.Build()); + + EXPECT_THAT(computation->root_instruction(), op::Power(param0, negative_one)); + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); + HloInstruction* root = computation->root_instruction(); - EXPECT_EQ(root->opcode(), HloOpcode::kDivide); - EXPECT_EQ(root->operand(0)->opcode(), HloOpcode::kConstant); + EXPECT_THAT(root, op::Divide(op::Constant(), param0)); EXPECT_EQ(LiteralUtil::GetFirstElement(root->operand(0)->literal()), 1); - EXPECT_EQ(root->operand(1), param0); } TEST_F(AlgebraicSimplifierTest, ReshapeBroadcast) { @@ -374,14 +434,17 @@ TEST_F(AlgebraicSimplifierTest, ReshapeBroadcast) { ShapeUtil::MakeShape(F32, {3, 2}), broadcast)); auto computation = builder.Build(); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); module->AddEntryComputation(std::move(computation)); - HloInstruction* root = module->entry_computation()->root_instruction(); + + EXPECT_THAT(module->entry_computation()->root_instruction(), + op::Reshape(op::Broadcast(op::Reshape(op)))); + HloPassFix simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); - root = module->entry_computation()->root_instruction(); - EXPECT_EQ(root->opcode(), HloOpcode::kParameter); + + EXPECT_THAT(module->entry_computation()->root_instruction(), op); } // Test that convert(A, $TYPE) is simplified to A if A is of type $TYPE. 
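The recurring change in this test file is the move from chained opcode/operand `EXPECT_EQ` checks to single `EXPECT_THAT` assertions built from the gMock-style matchers in `hlo_matchers.h` (aliased as `op::` above). As a minimal, self-contained sketch of how such composable tree matchers can be written on plain gMock, here is a toy version over a hypothetical `Node` type; `OpMatcher`, `Add`, and `Constant` are illustrative stand-ins, not the real XLA implementation:

```cpp
#include <string>
#include <vector>

#include "gmock/gmock.h"
#include "gtest/gtest.h"

// Toy expression node standing in for HloInstruction.
struct Node {
  std::string opcode;
  std::vector<const Node*> operands;
};

// Matches a node whose opcode equals `opcode` and whose operands match the
// given matchers, recursively -- the same shape of API as op::Add(...).
testing::Matcher<const Node*> OpMatcher(
    const std::string& opcode,
    std::vector<testing::Matcher<const Node*>> operand_matchers) {
  return testing::AllOf(
      testing::Field(&Node::opcode, opcode),
      testing::Field(&Node::operands,
                     testing::ElementsAreArray(operand_matchers)));
}

// Convenience wrappers, analogous to op::Add / op::Constant.
testing::Matcher<const Node*> Add(testing::Matcher<const Node*> lhs,
                                  testing::Matcher<const Node*> rhs) {
  return OpMatcher("add", {lhs, rhs});
}
testing::Matcher<const Node*> Constant() { return OpMatcher("constant", {}); }

TEST(ToyMatcherTest, MatchesTree) {
  Node c0{"constant", {}};
  Node c1{"constant", {}};
  Node add{"add", {&c0, &c1}};
  // One assertion checks the whole tree, like EXPECT_THAT(root, op::Add(...)).
  EXPECT_THAT(&add, Add(Constant(), Constant()));
}
```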
 
 // Test that convert(A, $TYPE) is simplified to A if A is of type $TYPE.
@@ -392,85 +455,16 @@ TEST_F(AlgebraicSimplifierTest, ConvertBetweenSameType) {
   builder.AddInstruction(
       HloInstruction::CreateConvert(ShapeUtil::MakeShape(F32, {}), input));
 
-  auto module = MakeUnique<HloModule>(TestName());
+  auto module = CreateNewModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_EQ(HloOpcode::kConvert, computation->root_instruction()->opcode());
+  EXPECT_THAT(computation->root_instruction(), op::Convert(input));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
-  EXPECT_EQ(HloOpcode::kConstant, computation->root_instruction()->opcode());
-}
-
-TEST_F(AlgebraicSimplifierTest, ConvertF32ToS64) {
-  HloComputation::Builder builder(TestName());
-  HloInstruction* input = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
-  builder.AddInstruction(
-      HloInstruction::CreateConvert(ShapeUtil::MakeShape(S64, {}), input));
-
-  auto module = MakeUnique<HloModule>(TestName());
-  auto computation = module->AddEntryComputation(builder.Build());
-
-  EXPECT_EQ(HloOpcode::kConvert, computation->root_instruction()->opcode());
-
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
-
-  EXPECT_EQ(HloOpcode::kConstant, computation->root_instruction()->opcode());
-  EXPECT_EQ(LiteralUtil::GetFirstElement<int64>(
-                computation->root_instruction()->literal()),
-            42);
-}
-
-TEST_F(AlgebraicSimplifierTest, ConvertS64ToF32) {
-  HloComputation::Builder builder(TestName());
-  HloInstruction* input = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int64>(42)));
-  builder.AddInstruction(
-      HloInstruction::CreateConvert(ShapeUtil::MakeShape(F32, {}), input));
-
-  auto module = MakeUnique<HloModule>(TestName());
-  auto computation = module->AddEntryComputation(builder.Build());
-
-  EXPECT_EQ(HloOpcode::kConvert, computation->root_instruction()->opcode());
-
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
-
-  EXPECT_EQ(HloOpcode::kConstant, computation->root_instruction()->opcode());
-  EXPECT_EQ(LiteralUtil::GetFirstElement<float>(
-                computation->root_instruction()->literal()),
-            42.0f);
-}
-
-TEST_F(AlgebraicSimplifierTest, ConvertF32ArrayToS64Array) {
-  HloComputation::Builder builder(TestName());
-  HloInstruction* input = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR1<float>({42.0f, 19.0f})));
-  builder.AddInstruction(
-      HloInstruction::CreateConvert(ShapeUtil::MakeShape(S64, {2}), input));
-
-  auto module = MakeUnique<HloModule>(TestName());
-  auto computation = module->AddEntryComputation(builder.Build());
-
-  EXPECT_EQ(HloOpcode::kConvert, computation->root_instruction()->opcode());
-
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
-
-  EXPECT_EQ(HloOpcode::kConstant, computation->root_instruction()->opcode());
-  EXPECT_EQ(
-      LiteralUtil::Get<int64>(computation->root_instruction()->literal(), {0}),
-      42);
-  EXPECT_EQ(
-      LiteralUtil::Get<int64>(computation->root_instruction()->literal(), {1}),
-      19);
+  EXPECT_THAT(computation->root_instruction(), input);
 }
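The surviving assertions exercise only the identity case: a convert whose target element type equals the operand's own type folds away to the operand. A minimal sketch of that rewrite rule over a toy IR (the `Value` type here is hypothetical, not XLA's):

```cpp
#include <cassert>

enum class Type { F32, S64 };

// A toy value node with just enough structure to express the rule:
// convert(x) is a no-op when x already has the target element type.
struct Value {
  Type type;
  Value* operand = nullptr;  // set for convert nodes
  bool is_convert = false;
};

// Returns the replacement for `v`, or `v` itself if no rule applies,
// mirroring the shape of the simplification the test above exercises.
Value* SimplifyConvert(Value* v) {
  if (v->is_convert && v->operand->type == v->type) {
    return v->operand;  // convert<T>(x : T) ==> x
  }
  return v;
}

int main() {
  Value x{Type::F32};
  Value same{Type::F32, &x, /*is_convert=*/true};
  assert(SimplifyConvert(&same) == &x);  // identity convert folds away
  Value widen{Type::S64, &x, /*is_convert=*/true};
  assert(SimplifyConvert(&widen) == &widen);  // real type change is kept
  return 0;
}
```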
 
 // Test that copies are removed.
@@ -479,19 +473,125 @@ TEST_F(AlgebraicSimplifierTest, RemoveCopy) {
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
       HloInstruction::CreateParameter(0, r0f32, "param0"));
-  HloInstruction* copy = builder.AddInstruction(
+  builder.AddInstruction(
       HloInstruction::CreateUnary(param0->shape(), HloOpcode::kCopy, param0));
 
-  auto module = MakeUnique<HloModule>(TestName());
+  auto module = CreateNewModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_EQ(copy, computation->root_instruction());
+  EXPECT_THAT(computation->root_instruction(), op::Copy(param0));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
-  EXPECT_EQ(param0, computation->root_instruction());
+  EXPECT_THAT(computation->root_instruction(), param0);
+}
+
+// Test that unary concatenates are removed.
+TEST_F(AlgebraicSimplifierTest, RemoveUnaryConcatenate) {
+  Shape r1f32 = ShapeUtil::MakeShape(F32, {100});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r1f32, "param0"));
+  builder.AddInstruction(
+      HloInstruction::CreateConcatenate(param0->shape(), {param0}, 0));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_THAT(computation->root_instruction(), op::Concatenate(param0));
+
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+
+  EXPECT_THAT(computation->root_instruction(), param0);
+}
+
+// Test that empty operands of concatenates are removed.
+TEST_F(AlgebraicSimplifierTest, RemoveEmptyConcatenateOperands) {
+  const int kParamLength = 100;
+  Shape r1f32 = ShapeUtil::MakeShape(F32, {kParamLength});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r1f32, "param0"));
+  HloInstruction* param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, r1f32, "param1"));
+  HloInstruction* empty_literal = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR1<float>({})));
+  HloInstruction* empty_slice =
+      builder.AddInstruction(HloInstruction::CreateSlice(
+          ShapeUtil::MakeShape(F32, {0}), param1, {42}, {42}, {1}));
+  Shape result_shape = ShapeUtil::MakeShape(F32, {3 * kParamLength});
+  builder.AddInstruction(HloInstruction::CreateConcatenate(
+      result_shape, {empty_literal, param0, param0, empty_slice, param1}, 0));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_THAT(
+      computation->root_instruction(),
+      op::Concatenate(empty_literal, param0, param0, empty_slice, param1));
+
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Concatenate(param0, param0, param1));
+}
+
+// Test a concatenate with only empty operands is removed.
+TEST_F(AlgebraicSimplifierTest, OnlyEmptyConcatenateOperands) {
+  const int kParamLength = 100;
+  Shape r1f32 = ShapeUtil::MakeShape(F32, {kParamLength});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r1f32, "param0"));
+  HloInstruction* empty_literal = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR1<float>({})));
+  HloInstruction* empty_slice =
+      builder.AddInstruction(HloInstruction::CreateSlice(
+          ShapeUtil::MakeShape(F32, {0}), param0, {42}, {42}, {1}));
+  Shape result_shape = ShapeUtil::MakeShape(F32, {0});
+  builder.AddInstruction(HloInstruction::CreateConcatenate(
+      result_shape, {empty_literal, empty_slice}, 0));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Concatenate(empty_literal, empty_slice));
+
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+
+  EXPECT_EQ(computation->root_instruction(), empty_literal);
+}
+
+// Test that concat with a scalar broadcast becomes a pad.
+TEST_F(AlgebraicSimplifierTest, ConcatenateOfBroadcastBecomesPad) {
+  Shape r1f32 = ShapeUtil::MakeShape(F32, {100});
+  Shape r0f32 = ShapeUtil::MakeShape(F32, {});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r1f32, "param0"));
+  HloInstruction* param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, r0f32, "param1"));
+  HloInstruction* broadcast = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(r1f32, param1, {}));
+  builder.AddInstruction(HloInstruction::CreateConcatenate(
+      param0->shape(), {broadcast, param0}, 0));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  EXPECT_THAT(computation->root_instruction(), op::Pad(param0, param1));
 }
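The concatenate tests above boil down to one rule plus a special case: operands with zero extent along the concatenation dimension contribute nothing and can be dropped, and a concatenate left with a single operand is just that operand. A small self-contained sketch of the operand filtering, with a hypothetical `Operand` type:

```cpp
#include <cassert>
#include <cstdint>
#include <string>
#include <vector>

// Toy operand: a name plus the extent along the concatenation dimension.
struct Operand {
  std::string name;
  int64_t extent;
};

// Drops operands that are empty along the concat dimension, mirroring
// RemoveEmptyConcatenateOperands above.
std::vector<Operand> SimplifyConcatOperands(const std::vector<Operand>& ops) {
  std::vector<Operand> kept;
  for (const Operand& o : ops) {
    if (o.extent != 0) kept.push_back(o);  // zero-extent operands vanish
  }
  return kept;
}

int main() {
  std::vector<Operand> ops = {{"empty_literal", 0},
                              {"param0", 100},
                              {"param0", 100},
                              {"empty_slice", 0},
                              {"param1", 100}};
  std::vector<Operand> kept = SimplifyConcatOperands(ops);
  assert(kept.size() == 3);  // concat(param0, param0, param1) remains
  // A single survivor would mean the concatenate disappears entirely,
  // as in RemoveUnaryConcatenate above.
  return 0;
}
```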
 
 // Test that a simplification which changes layouts is not performed if layout
@@ -504,21 +604,21 @@ TEST_F(AlgebraicSimplifierTest, CopyWithDifferentLayout) {
   HloInstruction* copy = builder.AddInstruction(
       HloInstruction::CreateUnary(param0->shape(), HloOpcode::kCopy, param0));
 
-  auto module = MakeUnique<HloModule>(TestName());
+  auto module = CreateNewModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Set to different layouts.
   *param0->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({0, 1});
   *copy->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({1, 0});
 
-  EXPECT_EQ(copy, computation->root_instruction());
+  EXPECT_THAT(computation->root_instruction(), op::Copy(param0));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
                                  non_bitcasting_callback());
   EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
 
   // Copy has not been removed.
-  EXPECT_EQ(copy, computation->root_instruction());
+  EXPECT_THAT(computation->root_instruction(), op::Copy(param0));
 }
 
 // Test that a simplification which preserves layouts is performed if layout
@@ -531,21 +631,21 @@ TEST_F(AlgebraicSimplifierTest, CopyWithSameLayout) {
   HloInstruction* copy = builder.AddInstruction(
       HloInstruction::CreateUnary(param0->shape(), HloOpcode::kCopy, param0));
 
-  auto module = MakeUnique<HloModule>(TestName());
+  auto module = CreateNewModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Set to same layouts.
   *param0->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({0, 1});
   *copy->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({0, 1});
 
-  EXPECT_EQ(copy, computation->root_instruction());
+  EXPECT_THAT(computation->root_instruction(), op::Copy(param0));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   // Copy has been removed.
-  EXPECT_EQ(param0, computation->root_instruction());
+  EXPECT_THAT(computation->root_instruction(), param0);
 }
 
 // Test that a reshape which could be replaced with a bitcast is not if
@@ -563,17 +663,17 @@ TEST_F(AlgebraicSimplifierTest, NoBitcastAdded) {
   *reshape->mutable_shape()->mutable_layout() =
       LayoutUtil::MakeLayout({0, 1, 2, 3, 4, 5});
 
-  auto module = MakeUnique<HloModule>(TestName());
+  auto module = CreateNewModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_EQ(reshape, computation->root_instruction());
+  EXPECT_THAT(computation->root_instruction(), op::Reshape(param0));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
                                  non_bitcasting_callback());
   EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
 
   // Reshape is not replaced with a bitcast.
-  EXPECT_EQ(reshape, computation->root_instruction());
+  EXPECT_THAT(computation->root_instruction(), op::Reshape(param0));
 }
 
 // Test transforming reshapes to bitcasts under various conditions.
@@ -609,25 +709,48 @@ TEST_F(AlgebraicSimplifierTest, ReshapeReplacedWithBitcast) {
   builder.AddInstruction(HloInstruction::CreateTuple(
       {transformable_reshape, dimensions_wrong_reshape,
       layout_wrong_reshape}));
 
-  auto module = MakeUnique<HloModule>(TestName());
+  auto module = CreateNewModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_EQ(transformable_reshape, computation->root_instruction()->operand(0));
-  EXPECT_EQ(dimensions_wrong_reshape,
-            computation->root_instruction()->operand(1));
-  EXPECT_EQ(layout_wrong_reshape, computation->root_instruction()->operand(2));
+  EXPECT_THAT(computation->root_instruction(),
+              op::Tuple(transformable_reshape, dimensions_wrong_reshape,
+                        layout_wrong_reshape));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
                                  bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  simplifier.Run(module.get()).ValueOrDie();
 
   // Verify that only the first reshape is replaced.
-  EXPECT_NE(transformable_reshape, computation->root_instruction()->operand(0));
-  EXPECT_EQ(HloOpcode::kBitcast,
-            computation->root_instruction()->operand(0)->opcode());
-  EXPECT_EQ(dimensions_wrong_reshape,
-            computation->root_instruction()->operand(1));
-  EXPECT_EQ(layout_wrong_reshape, computation->root_instruction()->operand(2));
+  EXPECT_THAT(
+      computation->root_instruction(),
+      op::Tuple(op::Bitcast(), dimensions_wrong_reshape, layout_wrong_reshape));
+}
+
+TEST_F(AlgebraicSimplifierTest, ReshapeAfterEffectiveUnary) {
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          0, ShapeUtil::MakeShape(F32, {2, 3, 4, 5}), "param"));
+  HloInstruction* movable_reshape =
+      builder.AddInstruction(HloInstruction::CreateReshape(
+          ShapeUtil::MakeShape(F32, {1, 2, 3, 4, 5}), param));
+  HloInstruction* zero = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0f)));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(ShapeUtil::MakeShape(F32, {1, 2, 3, 4, 5}),
+                                   HloOpcode::kMaximum, movable_reshape, zero));
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Maximum(op::Reshape(param), zero));
+
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 bitcasting_callback());
+
+  simplifier.Run(module.get()).ValueOrDie();
+  EXPECT_THAT(computation->root_instruction(),
+              op::Reshape(op::Maximum(param, zero)));
 }
 
 TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast1) {
@@ -644,16 +767,17 @@ TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast1) {
   *transpose->mutable_shape()->mutable_layout() =
       LayoutUtil::MakeLayout({0, 1, 2, 3});
 
-  auto module = MakeUnique<HloModule>(TestName());
+  auto module = CreateNewModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
+  EXPECT_THAT(computation->root_instruction(), op::Transpose(param));
+
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
                                  bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   // Verify that the reshape is replaced.
-  EXPECT_EQ(2, computation->instruction_count());
-  EXPECT_EQ(HloOpcode::kBitcast, computation->root_instruction()->opcode());
+  EXPECT_THAT(computation->root_instruction(), op::Bitcast(param));
 }
 
 TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast2) {
@@ -670,16 +794,17 @@ TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast2) {
   *transpose->mutable_shape()->mutable_layout() =
       LayoutUtil::MakeLayout({3, 1, 2, 0});
 
-  auto module = MakeUnique<HloModule>(TestName());
+  auto module = CreateNewModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
+  EXPECT_THAT(computation->root_instruction(), op::Transpose(param));
+
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
                                  bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   // Verify that the reshape is replaced.
-  EXPECT_EQ(2, computation->instruction_count());
-  EXPECT_EQ(HloOpcode::kBitcast, computation->root_instruction()->opcode());
+  EXPECT_THAT(computation->root_instruction(), op::Bitcast(param));
 }
 
 TEST_F(AlgebraicSimplifierTest, ReshapesMerged) {
@@ -692,23 +817,47 @@ TEST_F(AlgebraicSimplifierTest, ReshapesMerged) {
       builder.AddInstruction(HloInstruction::CreateReshape(
           ShapeUtil::MakeShape(F32, {2, 1, 2}), param0));
 
-  HloInstruction* reshape2 =
-      builder.AddInstruction(HloInstruction::CreateReshape(
-          ShapeUtil::MakeShape(F32, {1, 2, 1, 1, 2, 1}), reshape1));
+  builder.AddInstruction(HloInstruction::CreateReshape(
+      ShapeUtil::MakeShape(F32, {1, 2, 1, 1, 2, 1}), reshape1));
 
-  auto module = MakeUnique<HloModule>(TestName());
+  auto module = CreateNewModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_EQ(reshape2, computation->root_instruction());
-  EXPECT_EQ(reshape1, computation->root_instruction()->operand(0));
+  EXPECT_THAT(computation->root_instruction(),
+              op::Reshape(op::Reshape(param0)));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
-  EXPECT_EQ(HloOpcode::kReshape, computation->root_instruction()->opcode());
-  EXPECT_EQ(HloOpcode::kParameter,
-            computation->root_instruction()->operand(0)->opcode());
+  EXPECT_THAT(computation->root_instruction(), op::Reshape(param0));
+}
+
+TEST_F(AlgebraicSimplifierTest, CopiesMerged) {
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          0, ShapeUtil::MakeShapeWithMonotonicDim0MajorLayout(F32, {2, 2, 2}),
+          "param0"));
+
+  HloInstruction* copy1 = builder.AddInstruction(HloInstruction::CreateUnary(
+      ShapeUtil::MakeShapeWithLayout(F32, {2, 2, 2}, {0, 1, 2}),
+      HloOpcode::kCopy, param0));
+
+  builder.AddInstruction(HloInstruction::CreateUnary(
+      ShapeUtil::MakeShapeWithLayout(F32, {2, 2, 2}, {0, 2, 1}),
+      HloOpcode::kCopy, copy1));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_THAT(computation->root_instruction(), op::Copy(op::Copy(param0)));
+
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+
+  EXPECT_THAT(computation->root_instruction(), op::Copy(param0));
 }
 
 TEST_F(AlgebraicSimplifierTest, TransposesMerged) {
@@ -721,25 +870,21 @@ TEST_F(AlgebraicSimplifierTest, TransposesMerged) {
       builder.AddInstruction(HloInstruction::CreateTranspose(
          ShapeUtil::MakeShape(F32, {3, 4, 2}), param0, {1, 2, 0}));
 
-  HloInstruction* transpose2 =
-      builder.AddInstruction(HloInstruction::CreateTranspose(
-          ShapeUtil::MakeShape(F32, {4, 3, 2}), transpose1, {1, 0, 2}));
+  builder.AddInstruction(HloInstruction::CreateTranspose(
+      ShapeUtil::MakeShape(F32, {4, 3, 2}), transpose1, {1, 0, 2}));
 
-  auto module = MakeUnique<HloModule>(TestName());
+  auto module = CreateNewModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_EQ(transpose2, computation->root_instruction());
-  EXPECT_EQ(transpose1, computation->root_instruction()->operand(0));
+  EXPECT_THAT(computation->root_instruction(), op::Transpose(transpose1));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
-  EXPECT_EQ(HloOpcode::kTranspose, computation->root_instruction()->opcode());
+  EXPECT_THAT(computation->root_instruction(), op::Transpose(param0));
   EXPECT_EQ(std::vector<int64>({2, 1, 0}),
             computation->root_instruction()->dimensions());
-  EXPECT_EQ(HloOpcode::kParameter,
-            computation->root_instruction()->operand(0)->opcode());
 }
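The merged dimensions `{2, 1, 0}` asserted in `TransposesMerged` follow from composing the two permutations, assuming XLA's convention that `dimensions(i)` names the operand dimension feeding output dimension i. A short check of that arithmetic:

```cpp
#include <cassert>
#include <vector>

// Composes two transpose permutations: the merged transpose maps output
// dimension i to input dimension first[second[i]].
std::vector<int> ComposeTransposes(const std::vector<int>& first,
                                   const std::vector<int>& second) {
  std::vector<int> merged(second.size());
  for (size_t i = 0; i < second.size(); ++i) {
    merged[i] = first[second[i]];
  }
  return merged;
}

int main() {
  // The dimensions from TransposesMerged above: {1, 2, 0} then {1, 0, 2}.
  std::vector<int> merged = ComposeTransposes({1, 2, 0}, {1, 0, 2});
  assert((merged == std::vector<int>{2, 1, 0}));  // matches the EXPECT_EQ
  return 0;
}
```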
 
 // Test merging reshape and broadcast.
@@ -752,16 +897,17 @@ TEST_F(AlgebraicSimplifierTest, ReshapeAndBroadcastMerged) {
   builder.AddInstruction(HloInstruction::CreateBroadcast(
       ShapeUtil::MakeShape(F32, {1, 2, 3, 5, 1}), reshape1, {0, 2, 3}));
 
-  auto module = MakeUnique<HloModule>(TestName());
+  auto module = CreateNewModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
+  EXPECT_THAT(computation->root_instruction(),
+              op::Broadcast(op::Reshape(param0)));
+
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
-  EXPECT_EQ(HloOpcode::kBroadcast, computation->root_instruction()->opcode());
-  EXPECT_EQ(HloOpcode::kParameter,
-            computation->root_instruction()->operand(0)->opcode());
+  EXPECT_THAT(computation->root_instruction(), op::Broadcast(param0));
 }
 
 // Test merging broadcast and reshape.
@@ -774,16 +920,17 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshapeMerged) {
   builder.AddInstruction(HloInstruction::CreateReshape(
       ShapeUtil::MakeShape(F32, {2, 3, 7, 2, 1, 3, 2}), broadcast1));
 
-  auto module = MakeUnique<HloModule>(TestName());
+  auto module = CreateNewModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
+  EXPECT_THAT(computation->root_instruction(),
+              op::Reshape(op::Broadcast(param0)));
+
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
-  EXPECT_EQ(HloOpcode::kBroadcast, computation->root_instruction()->opcode());
-  EXPECT_EQ(HloOpcode::kParameter,
-            computation->root_instruction()->operand(0)->opcode());
+  EXPECT_THAT(computation->root_instruction(), op::Broadcast(param0));
 }
 
 TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_1_3x1_3) {
@@ -795,12 +942,18 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_1_3x1_3) {
   builder.AddInstruction(
       HloInstruction::CreateReshape(ShapeUtil::MakeShape(F32, {3}), broadcast));
 
-  auto module = MakeUnique<HloModule>(TestName());
-  module->AddEntryComputation(builder.Build());
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Reshape(op::Broadcast(param)));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Reshape(op::Broadcast(param)));
 }
 
 TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_4_3x2x4_6x1x1x4) {
@@ -812,15 +965,19 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_4_3x2x4_6x1x1x4) {
   builder.AddInstruction(HloInstruction::CreateReshape(
       ShapeUtil::MakeShape(F32, {6, 1, 1, 4}), broadcast));
 
-  auto module = MakeUnique<HloModule>(TestName());
+  auto module = CreateNewModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
+  EXPECT_THAT(computation->root_instruction(),
+              op::Reshape(op::Broadcast(param)));
+
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
-  EXPECT_EQ(HloOpcode::kBroadcast, computation->root_instruction()->opcode());
-  EXPECT_MATCH(computation->root_instruction()->dimensions(),
-               testing::VectorMatcher<int64>({3}));
+
+  EXPECT_THAT(computation->root_instruction(), op::Broadcast(param));
+  EXPECT_THAT(computation->root_instruction()->dimensions(),
+              ::testing::ElementsAre(3));
 }
 
 TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_1_3x2x1_6x1x1x1) {
@@ -832,18 +989,21 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_1_3x2x1_6x1x1x1) {
   builder.AddInstruction(HloInstruction::CreateReshape(
       ShapeUtil::MakeShape(F32, {6, 1, 1, 1}), broadcast));
 
-  auto module = MakeUnique<HloModule>(TestName());
+  auto module = CreateNewModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
+  EXPECT_THAT(computation->root_instruction(),
+              op::Reshape(op::Broadcast(param)));
+
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
-  EXPECT_EQ(HloOpcode::kBroadcast, computation->root_instruction()->opcode());
+
+  EXPECT_THAT(computation->root_instruction(), op::Broadcast(param));
   const std::vector<int64> broadcast_dims =
       computation->root_instruction()->dimensions();
   EXPECT_EQ(1, broadcast_dims.size());
-  EXPECT_TRUE(broadcast_dims[0] == 1 || broadcast_dims[0] == 2 ||
-              broadcast_dims[3] == 3);
+  EXPECT_THAT(broadcast_dims[0], ::testing::AnyOf(1, 2, 3));
 }
 
 TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_4_3x2x4x2_6x8) {
@@ -855,12 +1015,18 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_4_3x2x4x2_6x8) {
   builder.AddInstruction(HloInstruction::CreateReshape(
       ShapeUtil::MakeShape(F32, {6, 8}), broadcast));
 
-  auto module = MakeUnique<HloModule>(TestName());
-  module->AddEntryComputation(builder.Build());
+  auto module = CreateNewModule();
+  HloComputation* computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Reshape(op::Broadcast(param)));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Reshape(op::Broadcast(param)));
 }
 
 TEST_F(AlgebraicSimplifierTest, RemoveNoopPad) {
@@ -871,7 +1037,7 @@
   HloInstruction* zero = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0f)));
   PaddingConfig no_padding;
-  for (auto i = 0; i < 2; ++i) {
+  for (int i = 0; i < 2; ++i) {
     auto dimension = no_padding.add_dimensions();
     dimension->set_edge_padding_low(0);
     dimension->set_edge_padding_high(0);
@@ -883,10 +1049,13 @@
   HloModule module(TestName());
   HloComputation* computation = module.AddEntryComputation(builder.Build());
 
+  EXPECT_THAT(computation->root_instruction(), op::Pad(param, zero));
+
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(&module).ValueOrDie());
-  EXPECT_EQ(1, computation->instruction_count());
+
+  EXPECT_THAT(computation->root_instruction(), param);
 }
 
 TEST_F(AlgebraicSimplifierTest, NegativePadding) {
@@ -901,7 +1070,7 @@
   PaddingConfig padding;
   int64 low_padding[2] = {-1, -2};
   int64 high_padding[2] = {2, -3};
-  for (auto i = 0; i < 2; ++i) {
+  for (int i = 0; i < 2; ++i) {
     auto dimension = padding.add_dimensions();
     dimension->set_edge_padding_low(low_padding[i]);
     dimension->set_edge_padding_high(high_padding[i]);
@@ -926,18 +1095,14 @@
     return false;
   };
 
-  EXPECT_EQ(3, computation->instruction_count());
-  EXPECT_EQ(computation->root_instruction(), pad);
+  EXPECT_THAT(computation->root_instruction(), op::Pad(param, zero));
   EXPECT_TRUE(has_negative_padding(pad));
 
   ASSERT_TRUE(simplifier.Run(&module).ValueOrDie());
 
-  EXPECT_EQ(4, computation->instruction_count());
-  EXPECT_EQ(computation->root_instruction()->opcode(), HloOpcode::kSlice);
-  const HloInstruction* root_operand =
-      computation->root_instruction()->operand(0);
-  EXPECT_EQ(root_operand->opcode(), HloOpcode::kPad);
-  EXPECT_FALSE(has_negative_padding(root_operand));
+  EXPECT_THAT(computation->root_instruction(), op::Slice(op::Pad(param, zero)));
+  EXPECT_FALSE(
+      has_negative_padding(computation->root_instruction()->operand(0)));
 }
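`NegativePadding` relies on decomposing a pad with negative edge padding into a non-negative pad followed by a slice that trims the negative amounts. A sketch of the per-dimension arithmetic, using a hypothetical dimension size of 10 (the test's actual operand shape lies outside this hunk):

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>

// Splits one padding dimension into a non-negative pad plus a slice, the
// decomposition NegativePadding above checks for: negative edge padding of
// k elements becomes a slice trimming k elements from that edge.
struct PadAndSlice {
  int64_t pad_low, pad_high;          // non-negative padding to keep
  int64_t slice_start, slice_limit;   // slice into the padded dimension
};

PadAndSlice Decompose(int64_t dim_size, int64_t low, int64_t high) {
  PadAndSlice result;
  result.pad_low = std::max<int64_t>(low, 0);
  result.pad_high = std::max<int64_t>(high, 0);
  const int64_t padded = dim_size + result.pad_low + result.pad_high;
  result.slice_start = low < 0 ? -low : 0;
  result.slice_limit = padded - (high < 0 ? -high : 0);
  return result;
}

int main() {
  // Dimension 0 of the test: low padding -1, high padding +2.
  PadAndSlice d0 = Decompose(10, -1, 2);
  assert(d0.pad_low == 0 && d0.pad_high == 2);
  assert(d0.slice_start == 1 && d0.slice_limit == 12);  // 10 + 2 padded
  // Dimension 1: low -2, high -3; a pure shrink, so the pad disappears.
  PadAndSlice d1 = Decompose(10, -2, -3);
  assert(d1.pad_low == 0 && d1.pad_high == 0);
  assert(d1.slice_start == 2 && d1.slice_limit == 7);
  return 0;
}
```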
 
 TEST_F(AlgebraicSimplifierTest, RemoveNoopReshape) {
@@ -951,10 +1116,13 @@
   HloModule module(TestName());
   HloComputation* computation = module.AddEntryComputation(builder.Build());
 
+  EXPECT_THAT(computation->root_instruction(), op::Reshape(param));
+
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(&module).ValueOrDie());
-  EXPECT_EQ(1, computation->instruction_count());
+
+  EXPECT_THAT(computation->root_instruction(), param);
 }
 
 TEST_F(AlgebraicSimplifierTest, RemoveNoopSlice) {
@@ -966,15 +1134,18 @@
       0, ShapeUtil::MakeShape(F32, {dim0, dim1}), "param"));
   builder.AddInstruction(HloInstruction::CreateSlice(
       ShapeUtil::MakeShape(F32, {dim0, dim1}), param, /*start_indices=*/{0, 0},
-      /*limit_indices=*/{dim0, dim1}));
+      /*limit_indices=*/{dim0, dim1}, /*strides=*/{1, 1}));
 
   HloModule module(TestName());
   HloComputation* computation = module.AddEntryComputation(builder.Build());
 
+  EXPECT_THAT(computation->root_instruction(), op::Slice(param));
+
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(&module).ValueOrDie());
-  EXPECT_EQ(1, computation->instruction_count());
+
+  EXPECT_THAT(computation->root_instruction(), param);
 }
 
 TEST_F(AlgebraicSimplifierTest, ConvertConvToMatmul) {
@@ -1210,21 +1381,21 @@ TEST_F(AlgebraicSimplifierTest, MaxMinToClamp) {
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0f)));
   HloInstruction* min = builder.AddInstruction(HloInstruction::CreateBinary(
       r0f32, HloOpcode::kMinimum, param0, min_value));
-  HloInstruction* max = builder.AddInstruction(
+  builder.AddInstruction(
      HloInstruction::CreateBinary(r0f32, HloOpcode::kMaximum, min, max_value));
 
   HloModule module(TestName());
   auto computation = module.AddEntryComputation(builder.Build());
 
-  HloInstruction* root = computation->root_instruction();
-  EXPECT_EQ(root, max);
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Maximum(op::Minimum(param0, min_value), max_value));
+
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(&module).ValueOrDie());
-  root = computation->root_instruction();
-  ASSERT_EQ(root->opcode(), HloOpcode::kClamp);
-  EXPECT_EQ(root->operand(0), max_value);
-  EXPECT_EQ(root->operand(1), param0);
-  EXPECT_EQ(root->operand(2), min_value);
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Clamp(max_value, param0, min_value));
 }
 
 // Test that min(max(A, x), y) is transformed to clamp(x, A, y) for scalar
@@ -1240,21 +1411,21 @@ TEST_F(AlgebraicSimplifierTest, MinMaxToClamp) {
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0f)));
   HloInstruction* max = builder.AddInstruction(HloInstruction::CreateBinary(
       r0f32, HloOpcode::kMaximum, param0, max_value));
-  HloInstruction* min = builder.AddInstruction(
+  builder.AddInstruction(
      HloInstruction::CreateBinary(r0f32, HloOpcode::kMinimum, max, min_value));
 
   HloModule module(TestName());
   auto computation = module.AddEntryComputation(builder.Build());
 
-  HloInstruction* root = computation->root_instruction();
-  EXPECT_EQ(root, min);
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Minimum(op::Maximum(param0, max_value), min_value));
+
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(&module).ValueOrDie());
-  root = computation->root_instruction();
-  EXPECT_EQ(root->opcode(), HloOpcode::kClamp);
-  EXPECT_EQ(root->operand(0), max_value);
-  EXPECT_EQ(root->operand(1), param0);
-  EXPECT_EQ(root->operand(2), min_value);
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Clamp(max_value, param0, min_value));
 }
 
 // Test that min(max(A, x), y) is transformed to clamp(x, A, y) for
@@ -1271,21 +1442,21 @@ TEST_F(AlgebraicSimplifierTest, MinMaxWithBroadcastToClamp) {
      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0f)));
   HloInstruction* max = builder.AddInstruction(HloInstruction::CreateBinary(
      r1f32, HloOpcode::kMaximum, param0, max_value));
-  HloInstruction* min = builder.AddInstruction(
+  builder.AddInstruction(
      HloInstruction::CreateBinary(r1f32, HloOpcode::kMinimum, max, min_value));
 
   HloModule module(TestName());
   auto computation = module.AddEntryComputation(builder.Build());
 
-  HloInstruction* root = computation->root_instruction();
-  EXPECT_EQ(root, min);
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Minimum(op::Maximum(param0, max_value), min_value));
+
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(&module).ValueOrDie());
-  root = computation->root_instruction();
-  EXPECT_EQ(root->opcode(), HloOpcode::kClamp);
-  EXPECT_EQ(root->operand(0), max_value);
-  EXPECT_EQ(root->operand(1), param0);
-  EXPECT_EQ(root->operand(2), min_value);
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Clamp(max_value, param0, min_value));
 }
 
 // Test that min(max(A, non-constant1), non-constant2) is not canonicalized to
@@ -1301,17 +1472,21 @@ TEST_F(AlgebraicSimplifierTest, MinMaxNotToClamp) {
      HloInstruction::CreateParameter(2, r0f32, "param2"));
   HloInstruction* max = builder.AddInstruction(HloInstruction::CreateBinary(
      r0f32, HloOpcode::kMaximum, param0, max_value));
-  HloInstruction* min = builder.AddInstruction(
+  builder.AddInstruction(
      HloInstruction::CreateBinary(r0f32, HloOpcode::kMinimum, max, min_value));
 
   HloModule module(TestName());
   auto computation = module.AddEntryComputation(builder.Build());
 
-  HloInstruction* root = computation->root_instruction();
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Minimum(op::Maximum(param0, max_value), min_value));
+
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   EXPECT_FALSE(simplifier.Run(&module).ValueOrDie());
-  root = computation->root_instruction();
-  EXPECT_EQ(root, min);
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Minimum(op::Maximum(param0, max_value), min_value));
 }
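The clamp rewrites above rest on a scalar identity: with the constant bounds ordered `lo <= hi`, both `max(min(a, hi), lo)` and `min(max(a, lo), hi)` pin `a` into `[lo, hi]`, which is exactly `clamp(lo, a, hi)`. A quick numeric check with the tests' 0.0f/1.0f constants:

```cpp
#include <algorithm>
#include <cassert>

// clamp(lo, a, hi), the canonical form the simplifier rewrites to.
float Clamp(float lo, float a, float hi) {
  return std::min(std::max(a, lo), hi);
}

int main() {
  const float lo = 0.0f, hi = 1.0f;  // the constants used in the tests above
  for (float a : {-2.0f, 0.5f, 3.0f}) {
    const float via_max_min = std::max(std::min(a, hi), lo);
    const float via_min_max = std::min(std::max(a, lo), hi);
    // Both orderings agree with clamp whenever lo <= hi; the exact float
    // comparisons are safe because min/max introduce no rounding.
    assert(via_max_min == Clamp(lo, a, hi));
    assert(via_min_max == Clamp(lo, a, hi));
  }
  return 0;
}
```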
 
 // Test that min(f(max(A, constant1)), constant2) is not transformed to
@@ -1329,18 +1504,23 @@ TEST_F(AlgebraicSimplifierTest, MinEquationWithMaxNotToClamp) {
      r0f32, HloOpcode::kMaximum, param0, max_value));
   HloInstruction* fmax = builder.AddInstruction(
      HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, max, max_value));
-  HloInstruction* min = builder.AddInstruction(HloInstruction::CreateBinary(
+  builder.AddInstruction(HloInstruction::CreateBinary(
      r0f32, HloOpcode::kMinimum, fmax, min_value));
 
   HloModule module(TestName());
   auto computation = module.AddEntryComputation(builder.Build());
 
-  HloInstruction* root = computation->root_instruction();
-  EXPECT_EQ(root, min);
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Minimum(op::Add(op::Maximum(param0, max_value), max_value),
+                          min_value));
+
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   EXPECT_FALSE(simplifier.Run(&module).ValueOrDie());
-  root = computation->root_instruction();
-  EXPECT_EQ(root, min);
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Minimum(op::Add(op::Maximum(param0, max_value), max_value),
+                          min_value));
 }
 
 // Test that slice(broadcast(/*scalar value*/)) simplifies to a single
@@ -1359,7 +1539,7 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToSlice) {
   Shape slice_shape = ShapeUtil::MakeShape(F32, {2, 2, 3, 3});
   HloInstruction* slice = builder.AddInstruction(HloInstruction::CreateSlice(
-      slice_shape, broadcast, {0, 1, 2, 3}, {2, 3, 5, 6}));
+      slice_shape, broadcast, {0, 1, 2, 3}, {2, 3, 5, 6}, {1, 1, 1, 1}));
 
   HloModule module(TestName());
   auto computation = module.AddEntryComputation(builder.Build());
@@ -1377,8 +1557,7 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToSlice) {
   ASSERT_FALSE(simplifier.Run(&module).ValueOrDie());
 
   root = computation->root_instruction();
-  EXPECT_EQ(root->opcode(), HloOpcode::kBroadcast);
-  EXPECT_EQ(scalar_param, root->operand(0));
+  EXPECT_THAT(root, op::Broadcast(scalar_param));
   EXPECT_TRUE(ShapeUtil::Equal(root->shape(), slice_shape));
 }
 
@@ -1415,10 +1594,143 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToTransposeReshape) {
   ASSERT_TRUE(simplifier.Run(&module).ValueOrDie());
 
   root = computation->root_instruction();
-  EXPECT_EQ(root->opcode(), HloOpcode::kBroadcast);
-  EXPECT_EQ(forty_two, root->operand(0));
+  EXPECT_THAT(root, op::Broadcast(forty_two));
   EXPECT_TRUE(ShapeUtil::Equal(root->shape(), reshape_shape));
 }
 
+// Test that ReduceWindow(Pad(op, x), y) can simplify to ReduceWindow(op, x).
+TEST_F(AlgebraicSimplifierTest, FoldPadIntoReduceWindow) {
+  HloModule module(TestName());
+  HloComputation::Builder builder(TestName());
+
+  // Create operand to the pad.
+  HloInstruction* operand =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          0, ShapeUtil::MakeShape(F32, {1, 2, 3, 4}), "p0"));
+
+  // Create the pad.
+  PaddingConfig padding = MakeNoPaddingConfig(4);
+  padding.mutable_dimensions(1)->set_edge_padding_low(1);
+  padding.mutable_dimensions(3)->set_edge_padding_high(2);
+
+  HloInstruction* pad_value = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(5.0f)));
+  HloInstruction* pad = builder.AddInstruction(HloInstruction::CreatePad(
+      ShapeUtil::MakeShape(F32, {1, 3, 3, 5}), operand, pad_value, padding));
+
+  // Create add computation.
+  HloComputation* add_computation = nullptr;
+  {
+    HloComputation::Builder builder(TestName() + ".add");
+    const Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
+    HloInstruction* p0 = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, scalar_shape, "p0"));
+    HloInstruction* p1 = builder.AddInstruction(
+        HloInstruction::CreateParameter(1, scalar_shape, "p1"));
+    builder.AddInstruction(
+        HloInstruction::CreateBinary(scalar_shape, HloOpcode::kAdd, p0, p1));
+    add_computation = module.AddEmbeddedComputation(builder.Build());
+  }
+
+  // Create the reduce-window.
+  Window window;
+  for (int64 i = 0; i < ShapeUtil::Rank(pad->shape()); ++i) {
+    auto* dim = window.add_dimensions();
+    dim->set_size(1);
+    dim->set_padding_low(10);
+    dim->set_padding_high(100);
+    dim->set_window_dilation(1);
+    dim->set_base_dilation(1);
+  }
+  const Shape reduce_window_shape =
+      ShapeUtil::MakeShape(F32, {111, 113, 113, 115});
+  HloInstruction* reduce_init_value = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(5.0f)));
+  HloInstruction* reduce_window =
+      builder.AddInstruction(HloInstruction::CreateReduceWindow(
+          reduce_window_shape, pad, reduce_init_value, window,
+          add_computation));
+
+  // Build the computation and run the simplifier.
+  auto computation = module.AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root, reduce_window);
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(&module).ValueOrDie());
+
+  // Running simplification again should not result in any further changes.
+  ASSERT_FALSE(simplifier.Run(&module).ValueOrDie());
+
+  // Verify the result
+  root = computation->root_instruction();
+  EXPECT_THAT(root, op::ReduceWindow(operand, op::Constant()));
+  EXPECT_TRUE(ShapeUtil::Equal(root->shape(), reduce_window_shape))
+      << ShapeUtil::HumanString(root->shape()) << " vs "
+      << ShapeUtil::HumanString(reduce_window_shape);
+  EXPECT_EQ(root->window().dimensions(0).padding_low(), 10);
+  EXPECT_EQ(root->window().dimensions(1).padding_low(), 11);
+  EXPECT_EQ(root->window().dimensions(2).padding_low(), 10);
+  EXPECT_EQ(root->window().dimensions(3).padding_low(), 10);
+  EXPECT_EQ(root->window().dimensions(0).padding_high(), 100);
+  EXPECT_EQ(root->window().dimensions(1).padding_high(), 100);
+  EXPECT_EQ(root->window().dimensions(2).padding_high(), 100);
+  EXPECT_EQ(root->window().dimensions(3).padding_high(), 102);
+}
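The expectations at the end of `FoldPadIntoReduceWindow` are plain bookkeeping: when the kPad folds into the kReduceWindow, each dimension's edge padding is added onto the window's own padding, so dimension 1 gets low padding 10 + 1 = 11 and dimension 3 gets high padding 100 + 2 = 102. Note the fold is only safe here because the pad value and the reduce init value agree (both 5.0f). The arithmetic, spelled out:

```cpp
#include <cassert>
#include <cstdint>

// Per-dimension padding, standing in for one WindowDimension.
struct DimPadding {
  int64_t low, high;
};

// Folding a pad into a reduce-window sums the edge padding into the
// window padding, dimension by dimension.
DimPadding FoldPad(DimPadding window, DimPadding pad_edges) {
  return {window.low + pad_edges.low, window.high + pad_edges.high};
}

int main() {
  const DimPadding window = {10, 100};  // every window dim in the test
  // Dimension 1 has edge_padding_low = 1; dimension 3 edge_padding_high = 2.
  assert(FoldPad(window, {1, 0}).low == 11);    // checked as padding_low(1)
  assert(FoldPad(window, {0, 2}).high == 102);  // checked as padding_high(3)
  assert(FoldPad(window, {0, 0}).low == 10);    // untouched dimensions
  return 0;
}
```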
+
+TEST_F(AlgebraicSimplifierTest, ReversalOfTrivialDimensionsToBitcast) {
+  HloComputation::Builder builder(TestName());
+  const Shape shape = ShapeUtil::MakeShape(F32, {448, 2048, 1, 1});
+  HloInstruction* a =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "a"));
+  builder.AddInstruction(
+      HloInstruction::CreateReverse(shape, a, /*dimensions=*/{2, 3}));
+
+  HloModule module(TestName());
+  auto computation = module.AddEntryComputation(builder.Build());
+
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(&module).ValueOrDie());
+
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(a, root);
+  EXPECT_TRUE(ShapeUtil::Equal(root->shape(), shape));
+}
+
+TEST_F(AlgebraicSimplifierTest, IteratorInvalidation) {
+  // Dots add computations to the parent module. Test that, when the
+  // HloModule's computations are updated, then iterator invalidation doesn't
+  // occur when running on subsequent computations.
+  Shape r1f32 = ShapeUtil::MakeShape(F32, {1});
+  HloComputation::Builder builder(TestName() + ".Dot");
+  HloInstruction* x =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, r1f32, "x"));
+  HloInstruction* y =
+      builder.AddInstruction(HloInstruction::CreateParameter(1, r1f32, "y"));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(r1f32, HloOpcode::kDot, x, y));
+  std::unique_ptr<HloComputation> dot_computation(builder.Build());
+
+  HloComputation::Builder call_builder(TestName() + ".Call");
+  HloInstruction* zero = call_builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR1<float>({0.0f})));
+  HloInstruction* one = call_builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR1<float>({1.0f})));
+  call_builder.AddInstruction(
+      HloInstruction::CreateCall(r1f32, {zero, one}, dot_computation.get()));
+
+  auto module = CreateNewModule();
+  module->AddEmbeddedComputation(std::move(dot_computation));
+  module->AddEntryComputation(call_builder.Build());
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+}
+
 }  // namespace
 }  // namespace xla
+
+int main(int argc, char** argv) {
+  return xla::ParseDebugOptionsFlagsAndRunTests(argc, argv);
+}
diff --git a/tensorflow/compiler/xla/service/allocation_tracker.cc b/tensorflow/compiler/xla/service/allocation_tracker.cc
index a123213401d..ad2fee2d39a 100644
--- a/tensorflow/compiler/xla/service/allocation_tracker.cc
+++ b/tensorflow/compiler/xla/service/allocation_tracker.cc
@@ -64,8 +64,9 @@ GlobalDataHandle AllocationTracker::RegisterInternal(
     auto& allocation = FindOrDie(handle_to_allocation_, handle);
     int ref_count = allocation->ref_count();
     CHECK_GT(ref_count, 0);
-    VLOG(2) << "ref_count: " << ref_count << " -> " << ref_count + 1;
-    allocation->increment_ref_count();
+    VLOG(2) << "ref_count: " << ref_count << " -> " <<
+        (ref_count + initial_ref_count);
+    allocation->increment_ref_count(initial_ref_count);
   } else {
     handle = next_handle_++;
     VLOG(2) << "ref_count: " << initial_ref_count;
@@ -136,7 +137,7 @@ tensorflow::Status AllocationTracker::DeallocateShape(
       TF_RET_CHECK(ShapeUtil::TupleElementCount(shape) == elements.size())
           << "tuple has unexpected number of elements: " << elements.size()
           << " != " << ShapeUtil::TupleElementCount(shape);
-      for (int i = 0; i < elements.size(); ++i) {
+      for (size_t i = 0; i < elements.size(); ++i) {
         VLOG(2) << "recursing onto the tuple elements";
         TF_RETURN_IF_ERROR(DeallocateShape(backend, device_ordinal,
                                            &elements[i], shape.tuple_shapes(i),
@@ -170,6 +171,7 @@ StatusOr<std::vector<GlobalDataHandle>> AllocationTracker::DeconstructTuple(
                           executor, allocation->device_memory(),
                           allocation->shape()));
   std::vector<GlobalDataHandle> element_handles;
+  element_handles.reserve(element_bases.size());
   for (int i = 0; i < element_bases.size(); ++i) {
     element_handles.push_back(RegisterInternal(
         allocation->backend(), allocation->device_ordinal(), element_bases[i],
diff --git a/tensorflow/compiler/xla/service/allocation_tracker.h b/tensorflow/compiler/xla/service/allocation_tracker.h
index e0076800162..ebbf35b6fe8 100644
--- a/tensorflow/compiler/xla/service/allocation_tracker.h
+++ b/tensorflow/compiler/xla/service/allocation_tracker.h
@@ -63,10 +63,10 @@ class Allocation {
     CHECK_GE(ref_count_, 0);
     return ref_count_;
   }
-  void increment_ref_count() {
+  void increment_ref_count(int inc) {
     CHECK_GT(ref_count_, 0);
-    CHECK_LT(ref_count_, INT_MAX);
-    ++ref_count_;
+    CHECK_LE(ref_count_, INT_MAX - inc);
+    ref_count_ += inc;
   }
   void decrement_ref_count() {
     CHECK_GT(ref_count_, 0);
diff --git a/tensorflow/compiler/xla/service/backend.cc b/tensorflow/compiler/xla/service/backend.cc
index 7452a7b6965..66d54ad3802 100644
--- a/tensorflow/compiler/xla/service/backend.cc
+++ b/tensorflow/compiler/xla/service/backend.cc
@@ -41,13 +41,39 @@ namespace se = ::perftools::gputools;
 
 namespace xla {
 
+BackendOptions& BackendOptions::set_platform(
+    perftools::gputools::Platform* platform) {
+  platform_ = platform;
+  return *this;
+}
+
+perftools::gputools::Platform* BackendOptions::platform() const {
+  return platform_;
+}
+
+BackendOptions& BackendOptions::set_number_of_replicas(int number_of_replicas) {
+  number_of_replicas_ = number_of_replicas;
+  return *this;
+}
+
+int BackendOptions::number_of_replicas() const { return number_of_replicas_; }
+
+BackendOptions& BackendOptions::set_intra_op_parallelism_threads(
+    int num_threads) {
+  intra_op_parallelism_threads_ = num_threads;
+  return *this;
+}
+
+int BackendOptions::intra_op_parallelism_threads() const {
+  return intra_op_parallelism_threads_;
+}
+
 // Define this in .cc file to avoid having to include eigen or forward declare
 // these types in the header.
 struct Backend::EigenThreadPoolWrapper {
-  explicit EigenThreadPoolWrapper()
-      : pool(new tensorflow::thread::ThreadPool(
-            tensorflow::Env::Default(), "XLAEigen",
-            tensorflow::port::NumSchedulableCPUs())),
+  explicit EigenThreadPoolWrapper(const int num_threads)
+      : pool(new tensorflow::thread::ThreadPool(tensorflow::Env::Default(),
+                                                "XLAEigen", num_threads)),
         wrapper(new tensorflow::EigenThreadPoolWrapper(pool.get())),
         device(new Eigen::ThreadPoolDevice(wrapper.get(),
                                            wrapper->NumThreads())) {}
@@ -58,20 +84,21 @@ struct Backend::EigenThreadPoolWrapper {
 };
 
 /* static */ StatusOr<std::unique_ptr<Backend>> Backend::CreateBackend(
-    perftools::gputools::Platform* platform, int64 replica_count) {
+    const BackendOptions& options) {
+  int64 replica_count = options.number_of_replicas();
   if (replica_count == -1) {
     legacy_flags::BackendFlags* flags = legacy_flags::GetBackendFlags();
     replica_count = flags->xla_replicas;
   }
+  perftools::gputools::Platform* platform = options.platform();
   TF_ASSIGN_OR_RETURN(auto compiler, Compiler::GetForPlatform(platform));
   TF_ASSIGN_OR_RETURN(auto stream_executors,
                       PlatformUtil::GetStreamExecutors(platform));
   TF_ASSIGN_OR_RETURN(auto transfer_manager,
                       TransferManager::GetForPlatform(platform));
-  std::unique_ptr<Backend> backend(new Backend(
-      replica_count, platform, compiler, stream_executors, transfer_manager));
-  TF_RETURN_IF_ERROR(backend->PoolStreams(kInitialStreamsToPool,
-                                          backend->default_stream_executor()));
+  std::unique_ptr<Backend> backend(
+      new Backend(replica_count, platform, compiler, stream_executors,
+                  transfer_manager, options.intra_op_parallelism_threads()));
   return std::move(backend);
 }
 
@@ -79,51 +106,36 @@ struct Backend::EigenThreadPoolWrapper {
 Backend::CreateDefaultBackend() {
   TF_ASSIGN_OR_RETURN(se::Platform * platform,
                       PlatformUtil::GetDefaultPlatform());
-  return CreateBackend(platform);
+  BackendOptions backend_options;
+  backend_options.set_platform(platform);
+  return CreateBackend(backend_options);
 }
 
-tensorflow::Status Backend::PoolStreams(int n, se::StreamExecutor* executor) {
-  std::vector<std::unique_ptr<se::Stream>> primed;
-  for (int i = 0; i < n; ++i) {
-    TF_ASSIGN_OR_RETURN(auto stream, AcquireStream(executor));
-    primed.emplace_back(std::move(stream));
-  }
-  for (int i = 0; i < n; ++i) {
-    ReleaseStream(std::move(primed.back()));
-    primed.pop_back();
-  }
-  return tensorflow::Status::OK();
+StatusOr<Backend::StreamPtr> Backend::BorrowStream(int device_ordinal) {
+  TF_ASSIGN_OR_RETURN(auto exec, stream_executor(device_ordinal));
+  return BorrowStream(exec);
 }
 
-StatusOr<std::unique_ptr<perftools::gputools::Stream>> Backend::AcquireStream(
-    perftools::gputools::StreamExecutor* executor) {
-  tensorflow::mutex_lock lock(mutex_);
-  auto& cached_streams = cached_streams_[executor];
-  if (!cached_streams.empty()) {
-    auto result = std::move(cached_streams.back());
-    cached_streams.pop_back();
-    return std::move(result);
+StatusOr<Backend::StreamPtr> Backend::BorrowStream(
+    se::StreamExecutor* executor) {
+  tensorflow::mutex_lock l(mu_);
+  if (0 == stream_pools_.count(executor)) {
+    stream_pools_.emplace(std::piecewise_construct,
+                          std::forward_as_tuple(executor),
+                          std::forward_as_tuple([executor]() {
+                            auto stream = MakeUnique<se::Stream>(executor);
+                            stream->Init();
+                            return stream;
+                          }));
   }
-
-  auto stream = MakeUnique<se::Stream>(executor);
-  if (!stream->Init().ok()) {
-    return InternalError("failed to initialize stream");
-  }
-  return std::move(stream);
-}
-
-void Backend::ReleaseStream(
-    std::unique_ptr<perftools::gputools::Stream> stream) {
-  tensorflow::mutex_lock lock(mutex_);
-  auto& streams = cached_streams_[stream->parent()];
-  streams.emplace_back(std::move(stream));
+  return stream_pools_.at(executor).Allocate();
 }
 
 Backend::Backend(
     int64 replica_count, perftools::gputools::Platform* platform,
     Compiler* compiler,
     tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
         stream_executors,
-    TransferManager* transfer_manager)
+    TransferManager* transfer_manager, int intra_op_parallelism_threads)
     : platform_(platform),
       compiler_(compiler),
       transfer_manager_(transfer_manager),
@@ -153,7 +165,11 @@ Backend::Backend(
     inter_op_thread_pool_.reset(new tensorflow::thread::ThreadPool(
         tensorflow::Env::Default(), "xla_inter_op",
         tensorflow::port::NumSchedulableCPUs()));
-    intra_op_thread_pool_wrapper_.reset(new EigenThreadPoolWrapper());
+    const int num_threads = intra_op_parallelism_threads > 0
+                                ? intra_op_parallelism_threads
+                                : tensorflow::port::NumSchedulableCPUs();
+    intra_op_thread_pool_wrapper_.reset(
+        new EigenThreadPoolWrapper(num_threads));
   }
 }
 
@@ -199,10 +215,19 @@ tensorflow::thread::ThreadPool* Backend::inter_op_thread_pool() const {
 
 const Eigen::ThreadPoolDevice* Backend::eigen_intra_op_thread_pool_device()
     const {
-  if (intra_op_thread_pool_wrapper_ == nullptr) return nullptr;
+  if (intra_op_thread_pool_wrapper_ == nullptr) {
+    return nullptr;
+  }
   return intra_op_thread_pool_wrapper_->device.get();
 }
 
+tensorflow::thread::ThreadPool* Backend::eigen_intra_op_thread_pool() const {
+  if (intra_op_thread_pool_wrapper_ == nullptr) {
+    return nullptr;
+  }
+  return intra_op_thread_pool_wrapper_->pool.get();
+}
+
 StatusOr<perftools::gputools::StreamExecutor*> Backend::stream_executor(
     int device_ordinal) const {
   if (device_ordinal < 0 ||
diff --git a/tensorflow/compiler/xla/service/backend.h b/tensorflow/compiler/xla/service/backend.h
index db482c09ae2..e0b15dc43f2 100644
--- a/tensorflow/compiler/xla/service/backend.h
+++ b/tensorflow/compiler/xla/service/backend.h
@@ -23,6 +23,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
+#include "tensorflow/compiler/xla/service/pool.h"
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -33,29 +34,50 @@ limitations under the License.
 #include "tensorflow/core/platform/thread_annotations.h"
 
 namespace Eigen {
-class ThreadPoolDevice;
+struct ThreadPoolDevice;
 }
 
 namespace xla {
 
+// Options to configure the backend when it is created.
+class BackendOptions {
+ public:
+  // Set the platform backing the backend, or nullptr for the default platform.
+  BackendOptions& set_platform(perftools::gputools::Platform* platform);
+  perftools::gputools::Platform* platform() const;
+
+  // Set the number of replicas to use when compiling replicated
+  // programs. The default is -1 meaning that the value is read from
+  // the xla_replicas flag.
+  BackendOptions& set_number_of_replicas(int number_of_replicas);
+  int number_of_replicas() const;
+
+  // Sets the thread pool size for parallel execution of an individual
+  // operator. The default value of -1 will result in initializing the thread
+  // pool with the number of threads equal to the number of cores in the
+  // system.
+  BackendOptions& set_intra_op_parallelism_threads(int num_threads);
+  int intra_op_parallelism_threads() const;
+
+ private:
+  perftools::gputools::Platform* platform_ = nullptr;
+  int number_of_replicas_ = -1;
+  int intra_op_parallelism_threads_ = -1;
+};
+
 // Class which encapsulates an XLA backend. It includes everything necessary
 // to compile and execute computations on a particular platform.
 //
 // It also offers a pooling API for creation/use of initialized streams:
 //
-//    std::unique_ptr<se::Stream> stream =
-//        backend->AcquireStream().ConsumeValueOrDie();
-//    // ... use stream ...
-//    backend->ReleaseStream(std::move(stream));
+//    StreamPtr stream = backend->BorrowStream().ConsumeValueOrDie();
class Backend {
  public:
-  // The number of streams we create for the pool at initialization time.
-  static constexpr int kInitialStreamsToPool = 8;
+  using StreamPtr = Pool<perftools::gputools::Stream>::SmartPtr;
 
   // Creates a new backend for the given platform with the given number of
-  // replicas. A value of -1 means to use the flag value.
+  // replicas.
   static StatusOr<std::unique_ptr<Backend>> CreateBackend(
-      perftools::gputools::Platform* platform, int64 replica_count = -1);
+      const BackendOptions& options);
 
   // Creates a backend for the default platform. The default platform is
   // defined in PlatformUtil.
@@ -108,22 +130,19 @@ class Backend {
     return stream_executors_[0];
   }
 
-  // Primes the internal pool of streams for AcquireStream/ReleaseStream with n
-  // initialized stream instances.
-  tensorflow::Status PoolStreams(int n,
-                                 perftools::gputools::StreamExecutor* executor);
-
-  // Acquires a stream for use by the caller, either by grabbing it from an
+  // Borrows a stream for use by the caller, either by grabbing it from an
   // internal pool, or by constructing/initializating it, and returns the
   // result to the caller.
-  //
-  // TODO(b/32989582): Return std::unique_ptr with custom deleter.
-  StatusOr<std::unique_ptr<perftools::gputools::Stream>> AcquireStream(
+  StatusOr<StreamPtr> BorrowStream(int device_ordinal);
+  StatusOr<StreamPtr> BorrowStream(
      perftools::gputools::StreamExecutor* executor);
 
-  // Releases a stream from the caller to the internal pool, for use with the
-  // paired AcquireStream above.
-  void ReleaseStream(std::unique_ptr<perftools::gputools::Stream> stream);
+  // Returns a function to borrow a stream, as `BorrowStream` above does.
+  // Purely for convenience, the caller could rather make this anonymous
+  // function itself.
+  std::function<StatusOr<StreamPtr>(int)> StreamBorrower() {
+    return [this](int device_ordinal) { return BorrowStream(device_ordinal); };
+  }
 
   // Returns whether the given device ordinal of the backend is supported.
   bool device_ordinal_supported(int device_ordinal) const {
@@ -148,6 +167,7 @@ class Backend {
   // For the host platform, returns the configured eigen threadpool device to
   // be used for scheduling work. For other platforms, returns NULL.
   const Eigen::ThreadPoolDevice* eigen_intra_op_thread_pool_device() const;
+  tensorflow::thread::ThreadPool* eigen_intra_op_thread_pool() const;
 
   // Resets the devices associated with this backend.
   Status ResetDevices();
@@ -158,7 +178,7 @@ class Backend {
           Compiler* compiler,
           tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
               stream_executors,
-          TransferManager* transfer_manager);
+          TransferManager* transfer_manager, int intra_op_parallelism_threads);
   Backend(const Backend&) = delete;
   Backend& operator=(const Backend&) = delete;
@@ -170,14 +190,12 @@ class Backend {
   // Vector of stream executors. stream_executors_[0] is the default executor.
   std::vector<perftools::gputools::StreamExecutor*> stream_executors_;
 
-  // Guards the mutable state in the backend object.
-  tensorflow::mutex mutex_;
+  tensorflow::mutex mu_;
 
-  // Mapping from stream executor to cached streams, used by
-  // AcquireStream/ReleaseStream above.
+  // Mapping from stream executor to stream pools, used by `BorrowStream` above.
   std::map<perftools::gputools::StreamExecutor*,
-           std::vector<std::unique_ptr<perftools::gputools::Stream>>>
-      cached_streams_ GUARDED_BY(mutex_);
+           Pool<perftools::gputools::Stream>>
+      stream_pools_ GUARDED_BY(mu_);
 
   // The default memory allocator to use.
   std::unique_ptr<DeviceMemoryAllocator> memory_allocator_;
namespace xla { -void BufferAllocation::AddAssignment(const LogicalBuffer& buffer) { - DCHECK(std::find(assigned_buffers_.begin(), assigned_buffers_.end(), - &buffer) == assigned_buffers_.end()) - << "LogicalBuffer " << buffer.ToString() - << " already assigned to allocation " << index(); - assigned_buffers_.push_back(&buffer); +using ::tensorflow::gtl::FlatMap; +using ::tensorflow::gtl::FlatSet; +using ::tensorflow::strings::Appendf; +using ::tensorflow::strings::HumanReadableNumBytes; + +size_t BufferAllocation::Slice::Hasher::operator()(Slice s) const { + uint64 h = std::hash()(s.index()); + h = tensorflow::Hash64Combine(h, std::hash()(s.offset())); + h = tensorflow::Hash64Combine(h, std::hash()(s.size())); + return h; +} + +string BufferAllocation::Slice::ToString() const { + return tensorflow::strings::StrCat("{index:", index(), ", offset:", offset_, + ", size:", size_, "}"); +} + +BufferAllocation::Slice BufferAllocation::GetSlice( + const LogicalBuffer& buffer) const { + const OffsetSize os = FindOrDie(assigned_buffers_, &buffer); + return Slice(this, os.offset, os.size); +} + +void BufferAllocation::AddAssignment(const LogicalBuffer& buffer, int64 offset, + int64 size) { + CHECK(assigned_buffers_.count(&buffer) == 0) + << "LogicalBuffer " << buffer << " already assigned to allocation " + << index_; + CHECK_LE(offset, size_) << "LogicalBuffer " << buffer + << " offset out of range"; + CHECK_LE(offset + size, size_) + << "LogicalBuffer " << buffer << " size out of range"; + CHECK_EQ(buffer.color(), color()) + << "Buffer color " << buffer.color() + << " does not match allocation color " << color() << "."; + OffsetSize offset_size; + offset_size.offset = offset; + offset_size.size = size; + assigned_buffers_.emplace(&buffer, offset_size); +} + +BufferAllocationProto BufferAllocation::ToProto() const { + BufferAllocationProto proto; + proto.set_index(index_); + proto.set_size(size_); + proto.set_is_thread_local(is_thread_local_); + proto.set_is_reusable(is_reusable_); + proto.set_color(color_.value()); + if (is_entry_computation_parameter_) { + proto.set_is_entry_computation_parameter(true); + proto.set_parameter_number(parameter_number_); + } + proto.set_maybe_live_out(maybe_live_out_); + for (const auto& buffer_offset_size : assigned_buffers_) { + BufferAllocationProto::Assigned* proto_assigned = proto.add_assigned(); + proto_assigned->set_logical_buffer_id(buffer_offset_size.first->id()); + proto_assigned->set_offset(buffer_offset_size.second.offset); + proto_assigned->set_size(buffer_offset_size.second.size); + } + return proto; } string BufferAllocation::ToString() const { @@ -52,19 +108,38 @@ string BufferAllocation::ToString() const { tensorflow::strings::StrAppend( &output, tensorflow::strings::Printf("allocation %lld: %p, size %lld", index_, this, size())); + if (color().value() != 0) { + tensorflow::strings::StrAppend(&output, ", color ", color().value()); + } if (is_entry_computation_parameter()) { tensorflow::strings::StrAppend(&output, ", parameter ", parameter_number()); } if (is_thread_local()) { tensorflow::strings::StrAppend(&output, ", thread-local"); } + if (maybe_live_out()) { + tensorflow::strings::StrAppend(&output, ", maybe-live-out"); + } + if (IsPreallocatedTempBuffer()) { + tensorflow::strings::StrAppend(&output, ", preallocated-temp"); + } tensorflow::strings::StrAppend(&output, ":\n"); - for (const auto& buffer : assigned_buffers()) { + // Dump the assigned buffers ordered by id. 
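
The `Slice::Hasher` above folds the three identifying fields together with `tensorflow::Hash64Combine`. A self-contained sketch of the same pattern follows; the constant-and-shift mixing function is a stand-in, not TensorFlow's actual `Hash64Combine`:

```c++
#include <cstddef>
#include <cstdint>
#include <functional>

// Boost-style hash mixing; illustrative stand-in for Hash64Combine.
inline uint64_t HashCombine(uint64_t seed, uint64_t value) {
  return seed ^ (value + 0x9e3779b97f4a7c15ULL + (seed << 6) + (seed >> 2));
}

struct SliceKey {
  int64_t index;
  int64_t offset;
  int64_t size;
};

struct SliceKeyHasher {
  size_t operator()(const SliceKey& s) const {
    uint64_t h = std::hash<int64_t>()(s.index);
    h = HashCombine(h, std::hash<int64_t>()(s.offset));
    h = HashCombine(h, std::hash<int64_t>()(s.size));
    return static_cast<size_t>(h);
  }
};
```

Any two slices that compare equal (same allocation index, offset, and size) hash identically, which is what lets `Slice` serve as a key in hash-based containers.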
+ std::vector sorted_buffers; + for (const auto& buffer_offset_size : assigned_buffers_) { + sorted_buffers.push_back(buffer_offset_size.first); + } + std::sort(sorted_buffers.begin(), sorted_buffers.end(), + [](const LogicalBuffer* a, const LogicalBuffer* b) { + return a->id() < b->id(); + }); + for (const LogicalBuffer* buffer : sorted_buffers) { + const OffsetSize& offset_size = FindOrDie(assigned_buffers_, buffer); tensorflow::strings::StrAppend( &output, tensorflow::strings::Printf( - " %s::%s : %s\n", buffer->instruction()->parent()->name().c_str(), - buffer->ToString().c_str(), + " %s [%lld,%lld]: %s\n", buffer->ToString().c_str(), + offset_size.offset, offset_size.size, ShapeUtil::HumanStringWithLayout(buffer->shape()).c_str())); } return output; @@ -75,6 +150,11 @@ std::ostream& operator<<(std::ostream& out, const BufferAllocation& buffer) { return out; } +std::ostream& operator<<(std::ostream& out, const BufferAllocation::Slice& s) { + out << s.ToString(); + return out; +} + const PointsToSet& BufferAssignment::GetPointsToSet( const HloInstruction* instruction) const { return points_to_analysis().GetPointsToSet(instruction); @@ -96,22 +176,21 @@ BufferAllocation* BufferAssignment::GetMutableAssignedAllocation( return const_cast(&GetAssignedAllocation(buffer)); } -std::set BufferAssignment::GetAllocations( +std::set BufferAssignment::GetAllSlices( const HloInstruction* instruction, const ShapeIndex& index) const { - std::set allocations; + std::set result; for (const LogicalBuffer* buffer : GetSourceBuffers(instruction, index)) { - if (allocation_index_for_buffer_.count(buffer) > 0) { - allocations.insert( - GetAllocation(allocation_index_for_buffer_.at(buffer))); + if (HasAllocation(*buffer)) { + result.insert(GetAssignedAllocation(*buffer).GetSlice(*buffer)); } } - return allocations; + return result; } const BufferAllocation& BufferAssignment::GetAllocation( BufferAllocation::Index index) const { - CHECK(index >= 0 && index < allocations_.size()) - << "Allocation index " << index << " is out of range."; + CHECK_GE(index, 0); + CHECK_LT(index, allocations_.size()); return allocations_[index]; } @@ -131,71 +210,212 @@ bool BufferAssignment::HasTopLevelAllocation( return false; } -StatusOr BufferAssignment::GetUniqueAllocation( +StatusOr BufferAssignment::GetUniqueSlice( const HloInstruction* instruction, const ShapeIndex& index) const { - const BufferAllocation* allocation = nullptr; + BufferAllocation::Slice result; for (const LogicalBuffer* buffer : GetPointsToSet(instruction).element(index)) { if (HasAllocation(*buffer)) { - if (allocation != nullptr && - *allocation != GetAssignedAllocation(*buffer)) { + const BufferAllocation::Slice slice = + GetAssignedAllocation(*buffer).GetSlice(*buffer); + if (result.allocation() == nullptr) { + result = slice; + } else if (result != slice) { return FailedPrecondition( - "LogicalBuffer allocation for instruction %s at index {%s} cannot " + "BufferAllocation::Slice for instruction %s at index %s cannot " "be determined at compile-time.", - instruction->name().c_str(), - tensorflow::str_util::Join(index, ",").c_str()); + instruction->name().c_str(), index.ToString().c_str()); } - allocation = &GetAssignedAllocation(*buffer); } } - if (allocation == nullptr) { + if (result.allocation() == nullptr) { return FailedPrecondition( - "instruction %s has no buffer allocation at index {%s}", - instruction->name().c_str(), - tensorflow::str_util::Join(index, ",").c_str()); + "BufferAllocation::Slice not assigned for instruction %s at index %s", + 
instruction->name().c_str(), index.ToString().c_str()); } - return allocation; + return result; } -StatusOr BufferAssignment::GetUniqueTopLevelAllocation( +StatusOr BufferAssignment::GetUniqueTopLevelSlice( const HloInstruction* instruction) const { - return GetUniqueAllocation(instruction, /*index=*/{}); + return GetUniqueSlice(instruction, /*index=*/{}); } -StatusOr -BufferAssignment::GetUniqueTopLevelOutputAllocation() const { - return GetUniqueTopLevelAllocation( +bool BufferAssignment::SharesSliceAtIndex( + const HloInstruction* hlo_a, const ShapeIndex& shape_index_a, + const HloInstruction* hlo_b, const ShapeIndex& shape_index_b) const { + return GetUniqueSlice(hlo_a, shape_index_a).ConsumeValueOrDie() == + GetUniqueSlice(hlo_b, shape_index_b).ConsumeValueOrDie(); +} + +StatusOr +BufferAssignment::GetUniqueTopLevelOutputSlice() const { + return GetUniqueTopLevelSlice( module_->entry_computation()->root_instruction()); } +BufferAllocation* BufferAssignment::NewEmptyAllocation( + int64 size, bool is_thread_local, bool is_reusable, + LogicalBuffer::Color color) { + BufferAllocation::Index index = allocations_.size(); + allocations_.emplace_back(index, size, is_thread_local, is_reusable, color); + BufferAllocation* allocation = &allocations_.back(); + return allocation; +} + BufferAllocation* BufferAssignment::NewAllocation(const LogicalBuffer& buffer, int64 size, bool is_thread_local, bool is_reusable) { - BufferAllocation::Index index = allocations_.size(); - allocations_.emplace_back(index, size, is_thread_local, is_reusable); - BufferAllocation* allocation = &allocations_.back(); - AddAssignment(buffer, allocation, /*colocated_buffer=*/false); - allocation_index_for_buffer_[&buffer] = index; + BufferAllocation* allocation = + NewEmptyAllocation(size, is_thread_local, is_reusable, buffer.color()); + AddAssignment(allocation, buffer, /*offset=*/0, size); return allocation; } // Adds an instruction to the set assigned to the given buffer. -void BufferAssignment::AddAssignment(const LogicalBuffer& buffer, - BufferAllocation* allocation, - bool colocated_buffer) { +void BufferAssignment::AddAssignment(BufferAllocation* allocation, + const LogicalBuffer& buffer, int64 offset, + int64 size) { CHECK_EQ(0, allocation_index_for_buffer_.count(&buffer)) << "LogicalBuffer " << buffer << " already has an allocation."; - CHECK(allocation->is_reusable() || allocation->assigned_buffers().empty() || - colocated_buffer) + CHECK(allocation->is_reusable() || allocation->assigned_buffers().empty()) << "Non-reusable allocation already assigned a buffer"; TF_CHECK_OK(points_to_analysis().VerifyBuffer(buffer)); - allocation->AddAssignment(buffer); + allocation->AddAssignment(buffer, offset, size); allocation_index_for_buffer_[&buffer] = allocation->index(); } +// Combines allocations of temporary buffers of the same color into one big +// BufferAllocation. +void BufferAssignment::CombineTempAllocations() { + FlatMap + combined_allocation_map; + + // Move all temp allocations into a single run at the end of the allocations + // vector. + const auto first_temp_it = + std::partition(allocations_.begin(), allocations_.end(), + [](const BufferAllocation& allocation) { + return !allocation.IsPreallocatedTempBuffer(); + }); + + // Walk over the run of temp allocations, collecting the allocations belonging + // to the same color. 
+ if (first_temp_it != allocations_.end()) { + for (auto it = first_temp_it; it != allocations_.end(); ++it) { + const BufferAllocation& temp_allocation = *it; + LogicalBuffer::Color color = temp_allocation.color(); + auto combined_it = combined_allocation_map.find(color); + if (combined_it == combined_allocation_map.end()) { + // We have found the first temp allocation of this color. Collect + // the other temp allocations of the same color into it. + combined_allocation_map.emplace(color, temp_allocation); + continue; + } + + auto* combined_allocation = &combined_it->second; + // Each temp allocation is placed end-to-end, accounting for alignment. + // The offset of each buffer in the combined allocation is computed from + // the base offset of the allocation. + const int64 base = + RoundUpToNearest(combined_allocation->size(), alignment_); + combined_allocation->set_size(base + temp_allocation.size()); + for (const auto& buffer_offset_size : temp_allocation.assigned_buffers_) { + const LogicalBuffer* buffer = buffer_offset_size.first; + const int64 offset = buffer_offset_size.second.offset; + const int64 size = buffer_offset_size.second.size; + combined_allocation->AddAssignment(*buffer, base + offset, size); + } + } + // Replace all existing temporary allocations with the new combined + // allocations. + allocations_.erase(first_temp_it, allocations_.end()); + for (auto& combined : combined_allocation_map) { + allocations_.push_back(combined.second); + temp_allocation_total_size_ += combined.second.size(); + } + } + + // Update allocation indices to their new positions. + allocation_index_for_buffer_.clear_no_resize(); + for (size_t index = 0; index < allocations_.size(); ++index) { + BufferAllocation* allocation = &allocations_[index]; + allocation->set_index(index); + for (const auto& buffer_offset_size : allocation->assigned_buffers_) { + const LogicalBuffer* buffer = buffer_offset_size.first; + allocation_index_for_buffer_[buffer] = index; + } + } +} + +Status BufferAssignment::ComputeSummaryStats() { + for (auto& allocation : Allocations()) { + if (allocation.is_entry_computation_parameter()) { + stats_.parameter_allocation_count++; + stats_.parameter_allocation_bytes += allocation.size(); + } + if (allocation.maybe_live_out()) { + stats_.maybe_live_out_allocation_count++; + stats_.maybe_live_out_allocation_bytes += allocation.size(); + } + if (allocation.IsPreallocatedTempBuffer()) { + stats_.preallocated_temp_allocation_count++; + stats_.preallocated_temp_allocation_bytes += allocation.size(); + } + stats_.total_allocation_count++; + stats_.total_allocation_bytes += allocation.size(); + } + + // Only compute total fragmentation if all computations are sequential. 
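
`CombineTempAllocations` above places each temp allocation of a color end-to-end, rounding every base offset up to the buffer alignment before appending. A stand-alone sketch of that placement arithmetic, with hypothetical helper names:

```c++
#include <cstdint>
#include <vector>

// Rounds `value` up to the nearest multiple of a positive `alignment`.
inline int64_t RoundUpToNearestMultiple(int64_t value, int64_t alignment) {
  return ((value + alignment - 1) / alignment) * alignment;
}

// Returns the base offset of each allocation inside the combined buffer; the
// final element is the combined buffer's total size.
std::vector<int64_t> PackEndToEnd(const std::vector<int64_t>& sizes,
                                  int64_t alignment) {
  std::vector<int64_t> bases;
  int64_t combined_size = 0;
  for (int64_t size : sizes) {
    const int64_t base = RoundUpToNearestMultiple(combined_size, alignment);
    bases.push_back(base);
    combined_size = base + size;
  }
  bases.push_back(combined_size);
  return bases;
}
```

For example, sizes {100, 64} with alignment 64 yield bases {0, 128} and a combined size of 192, since the second allocation's base is rounded from 100 up to 128.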
+ SequentialHloOrdering::HloModuleSequence module_sequence; + for (const auto& computation : module_->computations()) { + const std::vector* sequence = + liveness_->hlo_ordering().SequentialOrder(*computation); + if (sequence != nullptr) { + module_sequence.emplace(computation.get(), *sequence); + } + } + if (module_sequence.size() == module_->computations().size()) { + TF_ASSIGN_OR_RETURN( + const int64 min_size, + MinimumMemoryForSequence(module_sequence, buffer_size_)); + stats_.total_fragmentation_bytes = stats_.total_allocation_bytes - min_size; + } + + return Status::OK(); +} + +string BufferAssignment::Stats::ToString() const { + string s; + Appendf(&s, "BufferAssignment stats:\n"); + Appendf(&s, " parameter allocation: %10s\n", + HumanReadableNumBytes(parameter_allocation_bytes).c_str()); + Appendf(&s, " maybe_live_out allocation: %10s\n", + HumanReadableNumBytes(maybe_live_out_allocation_bytes).c_str()); + Appendf(&s, " preallocated temp allocation: %10s\n", + HumanReadableNumBytes(preallocated_temp_allocation_bytes).c_str()); + if (preallocated_temp_fragmentation_bytes >= 0) { + const double percent = 100. * preallocated_temp_fragmentation_bytes / + preallocated_temp_allocation_bytes; + Appendf( + &s, " preallocated temp fragmentation: %10s (%.2f%%)\n", + HumanReadableNumBytes(preallocated_temp_fragmentation_bytes).c_str(), + percent); + } + Appendf(&s, " total allocation: %10s\n", + HumanReadableNumBytes(total_allocation_bytes).c_str()); + if (total_fragmentation_bytes >= 0) { + const double percent = + 100. * total_fragmentation_bytes / total_allocation_bytes; + Appendf(&s, " total fragmentation: %10s (%.2f%%)\n", + HumanReadableNumBytes(total_fragmentation_bytes).c_str(), percent); + } + return s; +} + string BufferAssignment::ToString() const { string output; tensorflow::strings::StrAppend(&output, "BufferAssignment:\n"); @@ -205,6 +425,44 @@ string BufferAssignment::ToString() const { return output; } +BufferAssignmentProto BufferAssignment::ToProto() const { + BufferAssignmentProto proto; + // NOTE: TuplePointsToAnalysis state is serialized here in BufferAssigment, + // because we need to do the HasAllocation check for each buffer. Otherwise + // the buffer_size_ call might fail for some backends. + const TuplePointsToAnalysis& points_to_analysis = + liveness_->points_to_analysis(); + for (const auto& buffer : points_to_analysis.logical_buffers()) { + if (HasAllocation(*buffer)) { + LogicalBufferProto proto_buffer = buffer->ToProto(buffer_size_); + proto.add_logical_buffers()->Swap(&proto_buffer); + + // Fill buffer aliases. 
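
The percentages printed by `Stats::ToString` above are plain ratios of fragmentation to allocated bytes, where `total_fragmentation_bytes` was computed earlier as `total_allocation_bytes` minus the `MinimumMemoryForSequence` lower bound. A worked example with hypothetical numbers:

```c++
#include <cstdint>
#include <cstdio>

int main() {
  // Suppose the heap-simulator lower bound says 80 MiB would suffice, but
  // 96 MiB were actually allocated.
  const int64_t total_allocation_bytes = 96LL << 20;
  const int64_t min_size = 80LL << 20;
  const int64_t total_fragmentation_bytes =
      total_allocation_bytes - min_size;  // 16 MiB
  const double percent =
      100. * total_fragmentation_bytes / total_allocation_bytes;
  std::printf("total fragmentation: %lld bytes (%.2f%%)\n",
              static_cast<long long>(total_fragmentation_bytes),
              percent);  // prints 16.67%
  return 0;
}
```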
+ for (const BufferAlias& alias : + points_to_analysis.GetBufferAliases(*buffer)) { + if (alias.instruction() == buffer->instruction() && + alias.index() == buffer->index()) { + continue; // skip self-aliases + } + BufferAssignmentProto::BufferAlias* proto_alias = + proto.add_buffer_aliases(); + LogicalBufferProto::Location proto_alias_location = + LogicalBuffer::ToLocationProto(*alias.instruction(), alias.index()); + proto_alias->set_source_buffer_id(buffer->id()); + proto_alias->mutable_location()->Swap(&proto_alias_location); + } + } + } + for (const BufferAllocation& allocation : Allocations()) { + BufferAllocationProto proto_allocation = allocation.ToProto(); + proto.add_buffer_allocations()->Swap(&proto_allocation); + } + for (const HeapSimulatorTrace& trace : heap_simulator_traces_) { + *proto.add_heap_simulator_traces() = trace; + } + return proto; +} + namespace { // Walk the call graph of the HLO module and place each computation into either @@ -213,7 +471,7 @@ namespace { // elements in thread_local_computations and global_computations are in post // order (if computation A has an instruction which calls computation B, then A // will appear after B in the vector). -tensorflow::Status GatherComputationsByAllocationType( +Status GatherComputationsByAllocationType( const HloModule* module, std::vector* thread_local_computations, std::vector* global_computations) { @@ -225,8 +483,8 @@ tensorflow::Status GatherComputationsByAllocationType( // Sets for quickly checking membership. Computations are returned in vectors // for stable iteration. - std::unordered_set thread_local_set; - std::unordered_set global_set; + FlatSet thread_local_set; + FlatSet global_set; while (!worklist.empty()) { auto worklist_front = worklist.front(); @@ -263,7 +521,8 @@ tensorflow::Status GatherComputationsByAllocationType( } for (auto& instruction : computation->instructions()) { - for (auto* subcomputation : instruction->MakeCalledComputationsSet()) { + for (HloComputation* subcomputation : + instruction->called_computations()) { switch (instruction->opcode()) { case HloOpcode::kCall: case HloOpcode::kWhile: @@ -308,7 +567,7 @@ tensorflow::Status GatherComputationsByAllocationType( // will not appear in either thread_local_set or global_set. We don't bother // assigning buffers for these. } - return tensorflow::Status::OK(); + return Status::OK(); } } // namespace @@ -316,39 +575,33 @@ tensorflow::Status GatherComputationsByAllocationType( /* static */ StatusOr> BufferAssigner::Run( const HloModule* module, std::unique_ptr hlo_ordering, - LogicalBuffer::SizeFunction buffer_size, bool colocate_related_buffers, - const std::vector* hlos_to_allocate) { - BufferAssigner assigner(std::move(buffer_size), colocate_related_buffers); + LogicalBuffer::SizeFunction buffer_size, int64 alignment, + bool allow_input_output_aliasing, TuplePointsToAnalysis::Colorer colorer) { + BufferAssigner assigner(alignment, allow_input_output_aliasing, + std::move(colorer)); return assigner.CreateAssignment(module, std::move(hlo_ordering), - hlos_to_allocate); -} - -/* static */ -StatusOr> BufferAssigner::Run( - const HloModule* module, std::unique_ptr hlo_ordering, - int64 pointer_size) { - return BufferAssigner::Run(module, std::move(hlo_ordering), - [pointer_size](const LogicalBuffer& buffer) { - return ShapeUtil::IsOpaque(buffer.shape()) - ? 
0 - : ShapeUtil::ByteSizeOf( - buffer.shape(), pointer_size); - }, - /*colocate_related_buffers=*/true); + std::move(buffer_size)); } bool BufferAssigner::MaybeAssignBuffer(BufferAllocation* allocation, const LogicalBuffer& buffer, BufferAssignment* assignment) { + const LogicalBuffer::SizeFunction& buffer_size = assignment->buffer_size_; + CHECK(!assignment->HasAllocation(buffer)) << "buffer " << buffer << " already has an allocation assigned."; - VLOG(4) << "Trying to assign " << buffer.ToString() - << " to allocation: " << allocation->ToString(); + VLOG(4) << "Trying to assign " << buffer << " to allocation: " << *allocation; - if (buffer_size_(buffer) > allocation->size()) { + if (buffer.color() != allocation->color()) { + VLOG(4) << "Can't assign: buffer has color" << buffer.color() + << " and allocation has color " << allocation->color() << "."; + return false; + } + + if (buffer_size(buffer) > allocation->size()) { VLOG(4) << "Can't assign: buffer is larger than allocation (" - << buffer_size_(buffer) << " > " << allocation->size() << ")"; + << buffer_size(buffer) << " > " << allocation->size() << ")"; return false; } @@ -362,139 +615,198 @@ bool BufferAssigner::MaybeAssignBuffer(BufferAllocation* allocation, return false; } - for (const LogicalBuffer* assigned_buffer : allocation->assigned_buffers()) { - if (assignment->liveness().MayInterfere(*assigned_buffer, buffer)) { - VLOG(4) << "Can't assign: assignee " << assigned_buffer->ToString() - << " may interfere with " << buffer.ToString(); + for (const auto& buffer_offset_size : allocation->assigned_buffers()) { + const LogicalBuffer& assigned_buffer = *buffer_offset_size.first; + if (assignment->liveness().MayInterfere(assigned_buffer, buffer)) { + VLOG(4) << "Can't assign: assignee " << assigned_buffer + << " may interfere with " << buffer; return false; } + // Copy instruction don't share a buffer with their input operand. + if (buffer.instruction()->IsUserOf(assigned_buffer.instruction()) && + buffer.instruction()->opcode() == HloOpcode::kCopy) { + VLOG(4) << "Can't assign: assignee " << assigned_buffer + << " is used at copy instruction " << buffer; + return false; + } + } + + if (allow_input_output_aliasing_ && allocation->maybe_live_out()) { + HloComputation* entry_computation = + assignment->module_->entry_computation(); + for (auto param : entry_computation->parameter_instructions()) { + for (auto& param_buffer : + assignment->points_to_analysis().GetBuffersDefinedByInstruction( + param)) { + if (assignment->liveness().MayInterfere(*param_buffer, buffer)) { + VLOG(4) << "Can't assign: Parameter interference with result"; + return false; + } + } + } } // If the buffer is live out of the computation then it should only be // assigned a buffer which exactly fits the result to avoid wasting memory // (result buffers can have arbitrary lifetimes). 
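
Collecting the checks in `MaybeAssignBuffer`, including the live-out exact-fit rule whose code follows just below: an allocation is only reused when colors match, the buffer fits, the allocation is reusable (or still empty), no already-assigned buffer's live range interferes, and a maybe-live-out buffer fits the allocation exactly. A distilled version of that predicate over simplified stand-in types (the kCopy-operand rule is folded into `may_interfere` here for brevity):

```c++
#include <cstdint>

struct BufferInfo {
  int color;
  int64_t size;
  bool maybe_live_out;
};

struct AllocationInfo {
  int color;
  int64_t size;
  bool is_reusable;
  bool has_assignments;
};

// Returns true iff `buffer` may share `alloc` under the rules above.
bool CanReuse(const AllocationInfo& alloc, const BufferInfo& buffer,
              bool may_interfere) {
  if (buffer.color != alloc.color) return false;
  if (buffer.size > alloc.size) return false;
  if (!alloc.is_reusable && alloc.has_assignments) return false;
  if (may_interfere) return false;
  // Live-out buffers can have arbitrary lifetimes, so they may only share an
  // allocation that fits them exactly.
  if (buffer.maybe_live_out && alloc.size != buffer.size) return false;
  return true;
}
```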
if (assignment->liveness().MaybeLiveOut(buffer) && - allocation->size() != buffer_size_(buffer)) { - VLOG(4) << "Can't assign: buffer " << buffer.ToString() + allocation->size() != buffer_size(buffer)) { + VLOG(4) << "Can't assign: buffer " << buffer << "is live out and size not the same as allocation"; return false; } - assignment->AddAssignment(buffer, allocation, /*colocated_buffer=*/false); + assignment->AddAssignment(allocation, buffer, /*offset=*/0, + buffer_size(buffer)); return true; } -tensorflow::Status BufferAssigner::AssignBuffersForComputation( +Status BufferAssigner::AssignBuffersForComputation( const HloComputation* computation, bool is_thread_local, - const tensorflow::gtl::FlatSet* hlos_to_allocate, - const tensorflow::gtl::FlatSet& colocated_buffers, - const tensorflow::gtl::FlatSet& - colocated_allocations, + const FlatSet& colocated_buffers, + const FlatSet& colocated_allocations, + FlatMap>* + buffers_to_assign_sequentially, BufferAssignment* assignment) { // Buffers are sorted and assigned to BufferAllocations in decreasing order of // size. std::vector sorted_buffers; for (auto& instruction : computation->instructions()) { - if (hlos_to_allocate == nullptr || - hlos_to_allocate->count(instruction.get()) > 0) { - // Add all buffers which this instruction defines. Instruction which don't - // define buffers (eg, bitcast which just forwards a pointer) don't need - // any allocations. - for (const LogicalBuffer* buffer : - assignment->points_to_analysis().GetBuffersDefinedByInstruction( - instruction.get())) { - sorted_buffers.push_back(buffer); - } + // Add all buffers which this instruction defines. Instruction which don't + // define buffers (eg, bitcast which just forwards a pointer) don't need + // any allocations. + for (const LogicalBuffer* buffer : + assignment->points_to_analysis().GetBuffersDefinedByInstruction( + instruction.get())) { + sorted_buffers.push_back(buffer); } } // Generate a post order sort of instructions for sorting of the // LogicalBuffers. - tensorflow::gtl::FlatMap post_order_position; + FlatMap post_order_position; int position = 0; for (auto* instruction : computation->MakeInstructionPostOrder()) { post_order_position.emplace(instruction, position); position++; } + // If there is a sequential instruction ordering, we'll delay assignment of + // temp buffers until after the main assignment loop. + const BufferLiveness& liveness = assignment->liveness(); + const bool has_sequential_order = + liveness.hlo_ordering().SequentialOrder(*computation) != nullptr; + if (has_sequential_order && buffers_to_assign_sequentially != nullptr) { + // Every sequential computation must get an entry in the + // buffers_to_assign_sequentially map, even if we end up with an empty set + // of buffers. This ensures we can correctly determine whether to run + // whole-module heap simulation. + buffers_to_assign_sequentially->emplace(computation, + FlatSet()); + } + // Sort the LogicalBuffers first by size. We assign the larger LogicalBuffers // first for simplicity. This means any previously created BufferAllocation is // necessarily large enough to hold the output of the current Buffer in // consideration. // - // As a secondary sorting criteria, use post order position of the HLO - // instruction which defines the buffer. 
This means an instruction will appear - // after its operands (assuming operands are the same/larger size) enabling - // the important reuse case where an elementwise instruction reuses one of its + // As a secondary sorting criteria, if the instructions are sequentially + // ordered, we assign live-out buffers before others. Note that for sequential + // computations, we'll take temp buffers that can't re-use any allocations and + // assign them via a heap scheduler. By assigning live-out buffers first, we + // increase the odds that temp buffers can re-use an allocation. + // + // As a final tiebreaker use post order position of the HLO instruction which + // defines the buffer. This means an instruction will appear after its + // operands (assuming operands are the same/larger size) enabling the + // important reuse case where an elementwise instruction reuses one of its // operand's buffer. This improves locality. std::sort(sorted_buffers.begin(), sorted_buffers.end(), - [this, &post_order_position](const LogicalBuffer* a, - const LogicalBuffer* b) { - int64 a_size = buffer_size_(*a); - int64 b_size = buffer_size_(*b); - if (a_size == b_size) { - // For instructions with the same size buffers, sort them in - // post order. - return post_order_position.at(a->instruction()) < - post_order_position.at(b->instruction()); - } else { - // We want the HLOs sorted in reverse order by size so use ">". - return a_size > b_size; + [this, has_sequential_order, &liveness, &post_order_position, + assignment](const LogicalBuffer* a, const LogicalBuffer* b) { + // Primary sort is by decreasing buffer size. + const int64 a_size = assignment->buffer_size_(*a); + const int64 b_size = assignment->buffer_size_(*b); + if (a_size != b_size) { + return a_size > b_size; // use ">" for decreasing size. } + // Otherwise live out buffers come before others, if the + // instructions are sequentially ordered. + if (has_sequential_order) { + const bool a_live_out = liveness.MaybeLiveOut(*a); + const bool b_live_out = liveness.MaybeLiveOut(*b); + if (a_live_out != b_live_out) { + return a_live_out; + } + } + // Final tiebreaker is in instruction post order. + return post_order_position.at(a->instruction()) < + post_order_position.at(b->instruction()); }); // BufferAllocations are necessarily created in decreasing size order. Keep // indices of previously created BufferAllocations in allocation_indices. std::vector allocation_indices; - for (const auto* buffer : sorted_buffers) { - VLOG(3) << "Assigning allocation to: " << buffer->ToString(); + for (const LogicalBuffer* buffer : sorted_buffers) { + VLOG(3) << "Assigning allocation to: " << *buffer; if (colocated_buffers.count(buffer) > 0) { // Colocated buffers are currently assigned in an earlier pass. + VLOG(3) << "Skipping colocated buffer: " << *buffer; continue; } TF_RET_CHECK(!assignment->HasAllocation(*buffer)); - if (buffer->instruction()->opcode() == HloOpcode::kConstant) { + const HloInstruction* instruction = buffer->instruction(); + if (instruction->opcode() == HloOpcode::kConstant) { // No BufferAllocations for constants. // TODO(b/32248867): For consistency, constants should get allocations. 
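
The three-level ordering used in the `std::sort` above, extracted into a stand-alone comparator over simplified keys (the struct and function names are illustrative): decreasing size first, then live-out buffers when the computation has a sequential order, then instruction post order as the tiebreaker.

```c++
#include <algorithm>
#include <cstdint>
#include <vector>

struct BufferSortKey {
  int64_t size;
  bool live_out;
  int post_order_position;
};

void SortForAssignment(std::vector<BufferSortKey>* buffers,
                       bool has_sequential_order) {
  std::sort(buffers->begin(), buffers->end(),
            [has_sequential_order](const BufferSortKey& a,
                                   const BufferSortKey& b) {
              // Primary: decreasing size.
              if (a.size != b.size) return a.size > b.size;
              // Secondary: live-out buffers first, if sequentially ordered.
              if (has_sequential_order && a.live_out != b.live_out) {
                return a.live_out;
              }
              // Tiebreaker: instruction post order.
              return a.post_order_position < b.post_order_position;
            });
}
```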
+ VLOG(3) << "Skipping constant: " << *buffer; continue; } - if (buffer->instruction()->opcode() == HloOpcode::kParameter && - computation == computation->parent()->entry_computation()) { + const int64 buffer_size = assignment->buffer_size_(*buffer); + + const bool is_entry_parameter = + instruction->opcode() == HloOpcode::kParameter && + computation == computation->parent()->entry_computation(); + if (is_entry_parameter) { // If the LogicalBuffer is part of an external parameter, creates a new // allocation and sets its parameter number. Parameters of non-entry // computations do not need special allocations because they live inside // callers. BufferAllocation* allocation = - assignment->NewAllocation(*buffer, buffer_size_(*buffer), + assignment->NewAllocation(*buffer, buffer_size, /*is_thread_local=*/false, /*is_reusable=*/false); allocation->set_entry_computation_parameter( - buffer->instruction()->parameter_number()); - VLOG(3) << "New allocation for entry computation parameter: " - << buffer->ToString(); + instruction->parameter_number()); + VLOG(3) << "New allocation #" << allocation->index() + << " for entry computation parameter: " << *buffer; continue; } legacy_flags::BufferAssignmentFlags* flags = legacy_flags::GetBufferAssignmentFlags(); if (!flags->xla_enable_buffer_reuse || is_thread_local || - buffer->instruction()->opcode() == HloOpcode::kCustomCall) { + instruction->opcode() == HloOpcode::kCustomCall) { // Custom call operations never have reusable buffers. Also we do not // reuse thread-local buffers for now, because they are dynamically // allocated and their lifetimes are hard to compute. - assignment->NewAllocation(*buffer, buffer_size_(*buffer), is_thread_local, - /*is_reusable=*/false); + BufferAllocation* allocation = assignment->NewAllocation( + *buffer, buffer_size, is_thread_local, /*is_reusable=*/false); + VLOG(3) << "New allocation #" << allocation->index() + << " for thread-local/CustomCall: " << *buffer; continue; } if (ShapeUtil::IsTuple(buffer->shape())) { // TODO(b/34669761): Don't reuse tuple buffers because the GPU backend // assumes longer buffer liveness than indicated by the analysis. - assignment->NewAllocation(*buffer, buffer_size_(*buffer), is_thread_local, - /*is_reusable=*/false); + BufferAllocation* allocation = assignment->NewAllocation( + *buffer, buffer_size, is_thread_local, /*is_reusable=*/false); + VLOG(3) << "New allocation #" << allocation->index() + << " for tuple-shaped buffer: " << *buffer; continue; } @@ -503,23 +815,23 @@ tensorflow::Status BufferAssigner::AssignBuffersForComputation( // (checked in liveness analysis) which are necessarily top-level // array-shaped buffers. if (buffer->IsTopLevel() && !buffer->IsTuple()) { - for (auto* operand : buffer->instruction()->operands()) { + for (auto* operand : instruction->operands()) { bool assigned_operand = false; - for (const auto& operand_allocation : - assignment->GetAllocations(operand, /*index=*/{})) { + for (const auto& operand_slice : + assignment->GetAllSlices(operand, /*index=*/{})) { BufferAllocation* allocation = - assignment->GetMutableAllocation(operand_allocation.index()); + assignment->GetMutableAllocation(operand_slice.index()); if (colocated_allocations.count(allocation->index()) == 0) { // TODO(b/32491382) Colocated buffers are currently assigned in an // earlier pass, and so can break the "increasing allocation size" // invariant in this function (causing this CHECK to fail). 
However, // the call to MaybeAssignBuffer is safe as it returns false if // allocation.size < buffer.size. - CHECK_GE(allocation->size(), buffer_size_(*buffer)); + CHECK_GE(allocation->size(), buffer_size); } if (MaybeAssignBuffer(allocation, *buffer, assignment)) { - VLOG(3) << "Reusing (operand) allocation for: " - << buffer->ToString(); + VLOG(3) << "Reusing (operand) allocation #" << allocation->index() + << " for: " << *buffer; assigned_operand = true; break; } @@ -546,24 +858,148 @@ tensorflow::Status BufferAssigner::AssignBuffersForComputation( // invariant in this function (causing this CHECK to fail). However, // the call to MaybeAssignBuffer is safe as it returns false if // allocation.size < buffer.size. - CHECK_GE(allocation->size(), buffer_size_(*buffer)); + CHECK_GE(allocation->size(), buffer_size); } if (MaybeAssignBuffer(allocation, *buffer, assignment)) { - VLOG(3) << "Reusing buffer for: " << buffer->ToString(); + VLOG(3) << "Reusing allocation #" << allocation->index() + << " for: " << *buffer; break; } } } + + if (!assignment->HasAllocation(*buffer) && has_sequential_order && + !liveness.MaybeLiveOut(*buffer)) { + // There is a sequential instruction ordering, so we delay assignment of + // temp buffers until after the loop. We do this right before we decide to + // create a new allocation, to ensure we've exhausted all the buffer + // re-use cases above. + // + // Entry parameters and thread local buffers were already handled earlier + // in this loop iteration. See BufferAllocation::IsPreallocatedTempBuffer + // for the definition of temp buffers. + CHECK(!is_entry_parameter) << *buffer; + CHECK(!is_thread_local) << *buffer; + (*buffers_to_assign_sequentially)[computation].insert(buffer); + VLOG(3) << "Delaying assignment of temp buffer: " << *buffer; + continue; + } + if (!assignment->HasAllocation(*buffer)) { - auto* allocation = - assignment->NewAllocation(*buffer, buffer_size_(*buffer), - is_thread_local, /*is_reusable=*/true); - VLOG(3) << "New allocation for: " << buffer->ToString(); + BufferAllocation* allocation = assignment->NewAllocation( + *buffer, buffer_size, is_thread_local, /*is_reusable=*/true); allocation_indices.push_back(allocation->index()); + VLOG(3) << "New allocation #" << allocation->index() + << " for: " << *buffer; } } - return tensorflow::Status::OK(); + + return Status::OK(); +} + +FlatMap, + LogicalBuffer::Color::Hasher> +BufferAssigner::SplitBuffersByColor( + const FlatSet& buffers) { + FlatMap, + LogicalBuffer::Color::Hasher> + color_map; + for (auto buffer : buffers) { + color_map[buffer->color()].insert(buffer); + } + return color_map; +} + +Status BufferAssigner::AssignBuffersWithSequentialOrdering( + const FlatMap>& + buffers_to_assign_sequentially, + bool run_whole_module_heap_simulation, BufferAssignment* assignment) { + // Run the sequence of instructions through the heap simulator. The heuristic + // that seems to give the best results is lazy-best-fit, with all runs of + // alloc / free calls sorted in decreasing size order. + const HloOrdering& hlo_ordering = assignment->liveness().hlo_ordering(); + if (run_whole_module_heap_simulation) { + // Run the heap simulation over the whole module. This reduces memory usage, + // since buffers for kCall and kWhile sub-computations are only live for the + // duration of their calling instructions. 
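
`SplitBuffersByColor` above is a straightforward group-by; the heap simulator then runs once per color class, so differently colored buffers never end up sharing an allocation. A minimal equivalent over plain ids (illustrative types; XLA keys `LogicalBuffer*` by `LogicalBuffer::Color` with its own hasher):

```c++
#include <cstdint>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

// Buffers are (id, color) pairs here, purely for illustration.
std::unordered_map<int, std::unordered_set<int64_t>> SplitByColor(
    const std::vector<std::pair<int64_t, int>>& buffers) {
  std::unordered_map<int, std::unordered_set<int64_t>> color_map;
  for (const auto& buffer : buffers) {
    color_map[buffer.second].insert(buffer.first);
  }
  return color_map;
}
```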
+ VLOG(1) << "Running whole-module heap simulation"; + SequentialHloOrdering::HloModuleSequence module_sequence; + FlatSet all_buffers_to_assign; + for (const auto& pair : buffers_to_assign_sequentially) { + const HloComputation* computation = pair.first; + const FlatSet& buffers_to_assign = pair.second; + const std::vector* instruction_sequence = + hlo_ordering.SequentialOrder(*computation); + CHECK(instruction_sequence != nullptr) << computation->name(); + module_sequence[computation] = *instruction_sequence; + all_buffers_to_assign.insert(buffers_to_assign.begin(), + buffers_to_assign.end()); + } + auto color_map = SplitBuffersByColor(all_buffers_to_assign); + for (auto& single_colored_set : color_map) { + VLOG(2) << "Simulating heap for color " << single_colored_set.first; + TF_ASSIGN_OR_RETURN( + const HeapSimulator::Result result, + HeapSimulator::Run(MakeUnique( + MakeUnique(alignment_)), + assignment->module(), module_sequence, + assignment->points_to_analysis(), + assignment->buffer_size_, + &single_colored_set.second)); + AssignBuffersFromHeapSimulator(result, assignment, + single_colored_set.first); + } + } else { + // Run the heap-simulation on a per-computation basis. Buffers for + // sub-computations are assigned disjoint BufferAllocations, assuming the + // worst-case that they may all be live concurrently. + VLOG(1) << "Running per-computation heap simulation"; + for (const auto& pair : buffers_to_assign_sequentially) { + const HloComputation* computation = pair.first; + const FlatSet& buffers_to_assign = pair.second; + const std::vector* instruction_sequence = + hlo_ordering.SequentialOrder(*computation); + CHECK(instruction_sequence != nullptr) << computation->name(); + auto color_map = SplitBuffersByColor(buffers_to_assign); + for (auto& single_colored_set : color_map) { + VLOG(2) << "Simulating heap for color " << single_colored_set.first; + TF_ASSIGN_OR_RETURN( + const HeapSimulator::Result result, + HeapSimulator::Run(MakeUnique( + MakeUnique(alignment_)), + *computation, *instruction_sequence, + assignment->points_to_analysis(), + assignment->buffer_size_, + &single_colored_set.second)); + AssignBuffersFromHeapSimulator(result, assignment, + single_colored_set.first); + } + } + } + return Status::OK(); +} + +void BufferAssigner::AssignBuffersFromHeapSimulator( + const HeapSimulator::Result& result, BufferAssignment* assignment, + LogicalBuffer::Color color) { + if (assignment->stats_.preallocated_temp_fragmentation_bytes == -1) { + assignment->stats_.preallocated_temp_fragmentation_bytes = + result.fragmentation_size; + } else { + assignment->stats_.preallocated_temp_fragmentation_bytes += + result.fragmentation_size; + } + + BufferAllocation* allocation = assignment->NewEmptyAllocation( + result.heap_size, /*is_thread_local=*/false, /*is_reusable=*/true, color); + for (const auto& buffer_chunk : result.chunk_map) { + const LogicalBuffer& buffer = *buffer_chunk.first; + const HeapSimulator::Chunk& chunk = buffer_chunk.second; + assignment->AddAssignment(allocation, buffer, chunk.offset, chunk.size); + } + + assignment->heap_simulator_traces_.push_back(result.debug_trace); } // Adds the 'colocated_set' of buffers to 'colocated_buffer_sets', maintaining @@ -586,12 +1022,14 @@ void BufferAssigner::AddSetToColocatedBufferSets( } // Find existing sets that overlap with at least one buffer from the - // colocated_set. + // colocated_set. The resulting 'overlap_set_indices' will have at most + // colocated_buffer_sets->size() entries, and will be in increasing order. 
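
The loop that follows performs the overlap scan; putting the scan and the subsequent merge together, the whole of `AddSetToColocatedBufferSets` reduces to the sketch below, with integer buffer ids standing in for `LogicalBuffer*`. Because the overlap indices are collected in increasing order, erasing merged sets from the highest index down keeps the remaining indices (and the reference to the first set) valid.

```c++
#include <cstdint>
#include <set>
#include <vector>

using ColocatedSet = std::set<int64_t>;  // buffer ids, for illustration

void AddToColocatedSets(const ColocatedSet& new_set,
                        std::vector<ColocatedSet>* sets) {
  // Record every existing set sharing at least one buffer with new_set.
  std::vector<size_t> overlap_indices;
  for (size_t i = 0; i < sets->size(); ++i) {
    for (int64_t buffer : new_set) {
      if ((*sets)[i].count(buffer) > 0) {
        overlap_indices.push_back(i);
        break;
      }
    }
  }
  if (overlap_indices.empty()) {
    sets->push_back(new_set);
    return;
  }
  // Merge new_set and all later overlapping sets into the first one.
  ColocatedSet& merged = (*sets)[overlap_indices[0]];
  merged.insert(new_set.begin(), new_set.end());
  for (size_t j = overlap_indices.size() - 1; j >= 1; --j) {
    const ColocatedSet& other = (*sets)[overlap_indices[j]];
    merged.insert(other.begin(), other.end());
    sets->erase(sets->begin() + overlap_indices[j]);
  }
}
```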
std::vector overlap_set_indices; - for (const LogicalBuffer* buffer : colocated_set) { - for (size_t index = 0; index < colocated_buffer_sets->size(); ++index) { + for (size_t index = 0; index < colocated_buffer_sets->size(); ++index) { + for (const LogicalBuffer* buffer : colocated_set) { if ((*colocated_buffer_sets)[index].count(buffer) > 0) { overlap_set_indices.push_back(index); + break; } } } @@ -622,40 +1060,154 @@ void BufferAssigner::AddSetToColocatedBufferSets( } } +// Conceptually the same as AddSetToColocatedBufferSets, but specific to the +// colocated buffers for while instructions. 'colocated_set' contains the +// buffers for a single while instruction that must be colocated. The idea here +// is to apply a memory-saving heuristic for separate while instructions whose +// buffers are disjoint in liveness, by using the colocation mechanism to force +// buffer sharing. This often reduces memory for multi-layer RNNs. +// +// TODO(b/32491382): We should be able to remove this heuristic after we +// implement module-level liveness analysis, which would let us directly detect +// buffer sharing opportunities between the while instruction buffer and the +// buffers from the predicate and body computation, as well as sharing across +// different while instructions. +void BufferAssigner::AddWhileSetToColocatedBufferSets( + const std::vector& colocated_set, + const LogicalBuffer* while_init_buffer, const HloInstruction* while_hlo, + const HloComputation& computation, const BufferLiveness& buffer_liveness, + const LogicalBuffer::SizeFunction& buffer_size, + std::vector* colocated_buffer_sets) { + CHECK(!colocated_set.empty()); + const TuplePointsToAnalysis& points_to_analysis = + buffer_liveness.points_to_analysis(); + + // Parallel while loops cannot safely share colocated buffer sets. + if (buffer_liveness.hlo_ordering().SequentialOrder(computation) == nullptr) { + AddSetToColocatedBufferSets(colocated_set, colocated_buffer_sets); + return; + } + + // Scan 'colocated_buffer_sets' in reverse order for locality; colocated sets + // are added in postorder over computations and instructions. + const int64 init_buffer_size = buffer_size(*while_init_buffer); + for (int i = colocated_buffer_sets->size() - 1; i >= 0; --i) { + const ColocatedBufferSet& predecessor_set = (*colocated_buffer_sets)[i]; + + // Skip predecessor sets not associated with while loops. + if (std::all_of(predecessor_set.begin(), predecessor_set.end(), + [](const LogicalBuffer* buffer) { + return buffer->instruction()->opcode() != + HloOpcode::kWhile; + })) { + continue; + } + + // Skip predecessor sets already associated with 'while_hlo'. + if (std::any_of(predecessor_set.begin(), predecessor_set.end(), + [&while_hlo](const LogicalBuffer* buffer) { + return buffer->instruction() == while_hlo; + })) { + continue; + } + + // Build vector of predecessor while result and init buffers, which are + // checked for liveness interference below. We must check both the result + // and init buffers because they're aliased together, but + // TuplePointsToAnalysis is unaware of this aliasing. 
+ std::vector predecessor_while_buffers; + for (const LogicalBuffer* buffer : predecessor_set) { + const HloInstruction* instruction = buffer->instruction(); + if (instruction->opcode() == HloOpcode::kWhile && + buffer_size(*buffer) == init_buffer_size && + instruction->parent() == &computation) { + predecessor_while_buffers.push_back(buffer); + // Add the init buffer at the same index, which must also exist in the + // predecessor set, and must be unambiguous. + const PointsToSet& init_points_to = + points_to_analysis.GetPointsToSet(instruction->operand(0)); + const std::vector& init_buffers = + init_points_to.element(buffer->index()); + CHECK_EQ(init_buffers.size(), 1); + CHECK_GT(predecessor_set.count(init_buffers[0]), 0); + predecessor_while_buffers.push_back(init_buffers[0]); + } + } + if (predecessor_while_buffers.empty()) { + continue; + } + + // Skip predecessor set if the live range of any predecessor buffers + // overlaps with 'while_init_buffer'. Note that tuple element buffer + // forwarding can cause the same buffer to appear on both sides of the + // interference comparison below. + if (std::any_of( + predecessor_while_buffers.begin(), predecessor_while_buffers.end(), + [while_init_buffer, &buffer_liveness](const LogicalBuffer* buffer) { + return while_init_buffer->id() != buffer->id() && + buffer_liveness.MayInterfere(*while_init_buffer, *buffer); + })) { + continue; + } + + // All our checks have passed; merge 'predecessor_set' with 'colocated_set', + // and add the merged set to 'colocated_buffer_sets'. This forces the + // colocation of buffers across different while instructions. + FlatSet unique; + unique.insert(predecessor_set.begin(), predecessor_set.end()); + unique.insert(colocated_set.begin(), colocated_set.end()); + std::vector merged_set(unique.begin(), unique.end()); + AddSetToColocatedBufferSets(merged_set, colocated_buffer_sets); + return; + } + + // Failed to merge into predecessor set; add 'colocated_set' as-is. + AddSetToColocatedBufferSets(colocated_set, colocated_buffer_sets); +} + namespace { + // Checks that points-to set of 'instruction' is unambiguous and distinct // (ensured by CopyInsertion), then adds the buffer from the points-to set at // 'index' to 'colocated_set'. -void AddBufferToColocatedSet(const HloInstruction* instruction, - const ShapeIndex& index, - const TuplePointsToAnalysis& points_to_analysis, - std::vector* colocated_set) { +const LogicalBuffer* AddBufferToColocatedSet( + const HloInstruction* instruction, const ShapeIndex& index, + const TuplePointsToAnalysis& points_to_analysis, + std::vector* colocated_set) { // CopyInsertion ensures root points-to set is unambiguous and distinct. const auto& points_to = points_to_analysis.GetPointsToSet(instruction); CHECK(!points_to.IsAmbiguous()); CHECK(points_to.IsDistinct()); colocated_set->push_back(points_to.element(index)[0]); + return colocated_set->back(); } + } // namespace // Builds sets of buffers in 'colocated_buffer_sets' which should be colocated // in the same allocation (currently just supports kWhile and kCall). 
void BufferAssigner::BuildColocatedBufferSets( - const HloModule* module, const TuplePointsToAnalysis& points_to_analysis, + const HloModule* module, const BufferLiveness& buffer_liveness, + const LogicalBuffer::SizeFunction& buffer_size, std::vector* colocated_buffer_sets) { - for (auto& computation : module->computations()) { - for (auto& instruction : computation->instructions()) { + const TuplePointsToAnalysis& points_to_analysis = + buffer_liveness.points_to_analysis(); + for (const HloComputation* computation : module->MakeComputationPostOrder()) { + for (const HloInstruction* instruction : + computation->MakeInstructionPostOrder()) { const HloOpcode opcode = instruction->opcode(); if (opcode == HloOpcode::kWhile) { - HloInstruction* while_hlo = instruction.get(); - TF_CHECK_OK(ShapeUtil::ForEachSubshape( + const HloInstruction* while_hlo = instruction; + ShapeUtil::ForEachSubshape( while_hlo->shape(), - [this, while_hlo, &points_to_analysis, colocated_buffer_sets]( + [this, while_hlo, &points_to_analysis, &buffer_liveness, + buffer_size, computation, colocated_buffer_sets]( const Shape& /*subshape*/, const ShapeIndex& index) { std::vector colocated_set; // Add while.init. - AddBufferToColocatedSet(while_hlo->operand(0), index, - points_to_analysis, &colocated_set); + auto* init_buffer = + AddBufferToColocatedSet(while_hlo->operand(0), index, + points_to_analysis, &colocated_set); // Add while.result. AddBufferToColocatedSet(while_hlo, index, points_to_analysis, &colocated_set); @@ -671,13 +1223,15 @@ void BufferAssigner::BuildColocatedBufferSets( AddBufferToColocatedSet( while_hlo->while_body()->root_instruction(), index, points_to_analysis, &colocated_set); - AddSetToColocatedBufferSets(colocated_set, colocated_buffer_sets); - return tensorflow::Status::OK(); - })); + AddWhileSetToColocatedBufferSets( + colocated_set, init_buffer, while_hlo, *computation, + buffer_liveness, buffer_size, colocated_buffer_sets); + }); } else if (opcode == HloOpcode::kCall) { - HloInstruction* call_hlo = instruction.get(); - HloInstruction* root_hlo = call_hlo->to_apply()->root_instruction(); - TF_CHECK_OK(ShapeUtil::ForEachSubshape( + const HloInstruction* call_hlo = instruction; + const HloInstruction* root_hlo = + call_hlo->to_apply()->root_instruction(); + ShapeUtil::ForEachSubshape( call_hlo->shape(), [this, call_hlo, root_hlo, &points_to_analysis, colocated_buffer_sets](const Shape& /*subshape*/, @@ -690,8 +1244,7 @@ void BufferAssigner::BuildColocatedBufferSets( AddBufferToColocatedSet(root_hlo, index, points_to_analysis, &colocated_set); AddSetToColocatedBufferSets(colocated_set, colocated_buffer_sets); - return tensorflow::Status::OK(); - })); + }); } } } @@ -702,23 +1255,43 @@ void BufferAssigner::BuildColocatedBufferSets( void BufferAssigner::AssignColocatedBufferSets( const std::vector& colocated_buffer_sets, BufferAssignment* assignment, - tensorflow::gtl::FlatSet* colocated_buffers, - tensorflow::gtl::FlatSet* colocated_allocations) { + FlatSet* colocated_buffers, + FlatSet* colocated_allocations) { for (const ColocatedBufferSet& colocated_buffer_set : colocated_buffer_sets) { BufferAllocation* allocation = nullptr; + // Set 'entry_parameter_number' if entry param in 'colocated_buffer_set'. 
+ int64 entry_parameter_number = -1; + for (const LogicalBuffer* buffer : colocated_buffer_set) { + const HloInstruction* instruction = buffer->instruction(); + const HloComputation* computation = instruction->parent(); + if (instruction->opcode() == HloOpcode::kParameter && + computation == computation->parent()->entry_computation()) { + entry_parameter_number = instruction->parameter_number(); + break; + } + } + for (const LogicalBuffer* buffer : colocated_buffer_set) { if (allocation == nullptr) { // TODO(b/32491382) Avoid current trivial solution of using new // allocations for each colocated buffer set. When liveness has // module-level scope, we can allow buffers to be shared across // computations (in some cases). - allocation = assignment->NewAllocation(*buffer, buffer_size_(*buffer), - /*is_thread_local=*/false, - /*is_reusable=*/true); + allocation = assignment->NewAllocation( + *buffer, assignment->buffer_size_(*buffer), + /*is_thread_local=*/false, /*is_reusable=*/true); + if (entry_parameter_number >= 0) { + // This colocated buffer set contains an entry parameter and other + // logical buffers which use the parameter as read-only in a while + // body computation (which updates in place). + // Set 'entry_computation_parameter' to indicate that it contains + // an entry parameter, and to prevent reuse in MaybeAssignBuffer. + allocation->set_entry_computation_parameter(entry_parameter_number); + } colocated_allocations->insert(allocation->index()); } else { - assignment->AddAssignment(*buffer, allocation, - /*colocated_buffer=*/true); + assignment->AddAssignment(allocation, *buffer, /*offset=*/0, + assignment->buffer_size_(*buffer)); } colocated_buffers->insert(buffer); } @@ -727,121 +1300,88 @@ void BufferAssigner::AssignColocatedBufferSets( StatusOr> BufferAssigner::CreateAssignment( const HloModule* module, std::unique_ptr hlo_ordering, - const std::vector* hlos_to_allocate) { + LogicalBuffer::SizeFunction buffer_size) { TF_ASSIGN_OR_RETURN(std::unique_ptr liveness, - BufferLiveness::Run(module, std::move(hlo_ordering))); + BufferLiveness::Run(module, std::move(hlo_ordering), + std::move(colorer_))); - std::vector thread_local_computations; - std::vector global_computations; VLOG(1) << "Assigning buffers to module " << module->name(); - if (hlos_to_allocate != nullptr) { - VLOG(3) << "LogicalBuffer assignment restricted to hlos: "; - for (auto hlo : *hlos_to_allocate) { - VLOG(3) << " " << hlo->parent()->name() << "::" << hlo->name(); - } - } - XLA_VLOG_LINES(3, module->ToString()); + XLA_VLOG_LINES(2, module->ToString()); XLA_VLOG_LINES(3, liveness->ToString()); XLA_VLOG_LINES(3, liveness->points_to_analysis().ToString()); - TF_RETURN_IF_ERROR(GatherComputationsByAllocationType( - module, &thread_local_computations, &global_computations)); - - // Set of HLO's to allocate if hlos_to_allocate is given. Passed as a set to - // AssignBuffersForComputation for fast membership testing. - std::unique_ptr> hlo_set; - if (hlos_to_allocate != nullptr) { - hlo_set = MakeUnique>( - hlos_to_allocate->begin(), hlos_to_allocate->end()); - } - // Can't use MakeUnique because BufferAssignment constructor is private. - std::unique_ptr assignment( - new BufferAssignment(module, std::move(liveness))); + std::unique_ptr assignment(new BufferAssignment( + module, std::move(liveness), alignment_, std::move(buffer_size))); // Assign buffers with the tightest constraints first (colocated buffer sets). 
  // Once b/32491382 enables module-level liveness analysis, we may be able
  // to assign colocated buffers (or at least reuse their allocation for
  // buffers outside of the set) in AssignBuffersForComputation.
-  tensorflow::gtl::FlatSet<const LogicalBuffer*> colocated_buffers;
-  tensorflow::gtl::FlatSet<BufferAllocation::Index> colocated_allocations;
-  if (colocate_related_buffers_) {
-    std::vector<ColocatedBufferSet> colocated_buffer_sets;
-    BuildColocatedBufferSets(module, assignment->points_to_analysis(),
-                             &colocated_buffer_sets);
-    AssignColocatedBufferSets(colocated_buffer_sets, assignment.get(),
-                              &colocated_buffers, &colocated_allocations);
-  }
+  FlatSet<const LogicalBuffer*> colocated_buffers;
+  FlatSet<BufferAllocation::Index> colocated_allocations;
+  std::vector<ColocatedBufferSet> colocated_buffer_sets;
+  BuildColocatedBufferSets(module, assignment->liveness(),
+                           assignment->buffer_size_, &colocated_buffer_sets);
+  AssignColocatedBufferSets(colocated_buffer_sets, assignment.get(),
+                            &colocated_buffers, &colocated_allocations);
 
+  std::vector<const HloComputation*> thread_local_computations;
+  std::vector<const HloComputation*> global_computations;
+  TF_RETURN_IF_ERROR(GatherComputationsByAllocationType(
+      module, &thread_local_computations, &global_computations));
+
+  // First assign buffers for global computations. Temporary buffers for
+  // sequential computations are collected in 'buffers_to_assign_sequentially'.
+  FlatMap<const HloComputation*, FlatSet<const LogicalBuffer*>>
+      buffers_to_assign_sequentially;
   for (auto* computation : global_computations) {
     TF_RETURN_IF_ERROR(AssignBuffersForComputation(
-        computation, /*is_thread_local=*/false, hlo_set.get(),
-        colocated_buffers, colocated_allocations, assignment.get()));
+        computation, /*is_thread_local=*/false, colocated_buffers,
+        colocated_allocations, &buffers_to_assign_sequentially,
+        assignment.get()));
   }
 
+  // Assign buffers with sequential ordering, if any. If all global
+  // computations are sequential, we can run heap simulation on the whole
+  // module, which reduces memory usage.
+  const bool run_whole_module_heap_simulation =
+      buffers_to_assign_sequentially.size() == global_computations.size();
+  TF_RETURN_IF_ERROR(AssignBuffersWithSequentialOrdering(
+      buffers_to_assign_sequentially, run_whole_module_heap_simulation,
+      assignment.get()));
+
+  // Now assign buffers for thread-local computations. All LogicalBuffers get
+  // their own BufferAllocation.
   for (auto* computation : thread_local_computations) {
     TF_RET_CHECK(computation != module->entry_computation());
     TF_RETURN_IF_ERROR(AssignBuffersForComputation(
-        computation, /*is_thread_local=*/true, hlo_set.get(), colocated_buffers,
-        colocated_allocations, assignment.get()));
+        computation, /*is_thread_local=*/true, colocated_buffers,
+        colocated_allocations, /*buffers_to_assign_sequentially=*/nullptr,
+        assignment.get()));
   }
 
   // Mark all buffers which may be live out of the entry computation as
   // "liveout".
- auto entry = module->entry_computation(); - auto root_instruction = entry->root_instruction(); - const PointsToSet& root_points_to = - assignment->GetPointsToSet(root_instruction); - TF_RETURN_IF_ERROR(root_points_to.ForEachElement( - [&assignment](const ShapeIndex& /*index*/, bool /*is_leaf*/, - const std::vector& buffers) { - for (const LogicalBuffer* buffer : buffers) { - VLOG(3) << "maybe_live_out LogicalBuffer: " << buffer->ToString(); - if (assignment->HasAllocation(*buffer)) { - BufferAllocation* alloc = - assignment->GetMutableAssignedAllocation(*buffer); - alloc->set_maybe_live_out(true); - VLOG(3) << "maybe_live_out BufferAllocation: " << alloc->ToString(); - } - } - return tensorflow::Status::OK(); - })); - - XLA_VLOG_LINES(2, assignment->ToString()); - - // Compute sizes of various kinds of buffers for logging. - int64 total_size = 0; - int64 parameter_size = 0; - for (auto& allocation : assignment->Allocations()) { - if (allocation.is_entry_computation_parameter()) { - parameter_size += allocation.size(); + for (const LogicalBuffer* buffer : + assignment->liveness().maybe_live_out_buffers()) { + VLOG(3) << "maybe_live_out LogicalBuffer: " << *buffer; + if (assignment->HasAllocation(*buffer)) { + BufferAllocation* alloc = + assignment->GetMutableAssignedAllocation(*buffer); + alloc->set_maybe_live_out(true); + VLOG(3) << "maybe_live_out BufferAllocation: " << *alloc; } - total_size += allocation.size(); } - // Compute the total size of the output. Iterate over the subshapes and sum up - // the sizes of the buffers for each subshape. - int64 output_size = 0; - HloInstruction* root = module->entry_computation()->root_instruction(); - TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshape( - root->shape(), [this, &output_size, root, &assignment]( - const Shape& /*subshape*/, const ShapeIndex& index) { - const auto& allocations = assignment->GetAllocations(root, index); - if (!allocations.empty()) { - output_size += allocations.begin()->size(); - } - return tensorflow::Status::OK(); - })); + // Combines allocations of temporary buffers into one big BufferAllocation. + // This can only be performed after all buffers have been assigned, and after + // maybe_live_out is marked, since it is used to determine whether an + // allocation contains temporary buffers or not. + assignment->CombineTempAllocations(); - VLOG(1) << "Allocation sizes for module " << module->name() << ":"; - VLOG(1) << " parameter allocation total size: " - << tensorflow::strings::HumanReadableNumBytes(parameter_size); - VLOG(1) << " output allocation total size: " - << tensorflow::strings::HumanReadableNumBytes(output_size); - VLOG(1) << " temp allocation total size: " - << tensorflow::strings::HumanReadableNumBytes( - total_size - parameter_size - output_size); - VLOG(1) << " total allocation size: " - << tensorflow::strings::HumanReadableNumBytes(total_size); + XLA_VLOG_LINES(2, assignment->ToString()); + TF_RETURN_IF_ERROR(assignment->ComputeSummaryStats()); + XLA_VLOG_LINES(1, assignment->GetStats().ToString()); return std::move(assignment); } diff --git a/tensorflow/compiler/xla/service/buffer_assignment.h b/tensorflow/compiler/xla/service/buffer_assignment.h index e7aeb35967e..b3933f11c1e 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment.h +++ b/tensorflow/compiler/xla/service/buffer_assignment.h @@ -20,19 +20,21 @@ limitations under the License. 
#include #include #include -#include -#include #include #include "tensorflow/compiler/xla/service/buffer_liveness.h" +#include "tensorflow/compiler/xla/service/heap_simulator.h" +#include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/logical_buffer.h" +#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/gtl/flatmap.h" #include "tensorflow/core/lib/gtl/flatset.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" @@ -41,12 +43,15 @@ limitations under the License. namespace xla { // This class abstracts an allocation of contiguous memory which can hold the -// values described by LogicalBuffers. A BufferAllocation may hold different -// LogicalBuffers at different times, but currently never more than one -// LogicalBuffer simultaneously. The abstraction includes information required -// by the backends for allocation, use, and deallocation of the buffer. This -// includes the LogicalBuffers which are held in this allocation through the -// execution of the computation. +// values described by LogicalBuffers. Each LogicalBuffer occupies a sub-range +// of the allocation, represented by a Slice. A single BufferAllocation may hold +// LogicalBuffers with disjoint liveness, which may have overlapping Slices. A +// single BufferAllocation may also hold LogicalBuffers with overlapping +// liveness, which must have disjoint Slices. +// +// The abstraction includes information required by the backends for allocation, +// use, and deallocation of the buffer. This includes the LogicalBuffers which +// are held in this allocation through the execution of the computation. class BufferAllocation { public: // Holds a unique identifier for each allocation. Values are assigned @@ -54,15 +59,16 @@ class BufferAllocation { using Index = int64; BufferAllocation(Index index, int64 size, bool is_thread_local, - bool is_reusable) + bool is_reusable, LogicalBuffer::Color color) : index_(index), size_(size), is_thread_local_(is_thread_local), - is_reusable_(is_reusable) {} + is_reusable_(is_reusable), + color_(color) {} ~BufferAllocation() {} - // Adds a LogicalBuffer to the set assigned to this buffer. - void AddAssignment(const LogicalBuffer& buffer); + // Returns the index of this allocation. + Index index() const { return index_; } // Whether this allocation is used in a parallel calling context such as // inside of a map or reduce computation. Such allocations need to be thread @@ -84,30 +90,83 @@ class BufferAllocation { CHECK(is_entry_computation_parameter_); return parameter_number_; } - // Sets that this allocation holds a LogicalBuffer from a parameter of the - // entry computation. - void set_entry_computation_parameter(int64 parameter_number) { - is_entry_computation_parameter_ = true; - parameter_number_ = parameter_number; - } - // Returns/sets whether this allocation is assigned a LogicalBuffer which may + // Returns whether this allocation is assigned a LogicalBuffer which may // be live out of the entry computation. 
bool maybe_live_out() const { return maybe_live_out_; } - void set_maybe_live_out(bool value) { maybe_live_out_ = value; } // Returns the size of the allocation. Necessarily this must be at least as // large as any LogicalBuffer assigned to this allocation. int64 size() const { return size_; } - // Access to the logical buffers assigned to this allocation. - const std::vector& assigned_buffers() const { + // Returns the color of the allocation. Only logical buffers with a matching + // color can reside in this allocation. + LogicalBuffer::Color color() const { return color_; } + + struct OffsetSize { + int64 offset = 0; + int64 size = 0; + }; + + // Access to the logical buffers assigned to this allocation, and their + // associated logical offsets and sizes. + const tensorflow::gtl::FlatMap& + assigned_buffers() const { return assigned_buffers_; } - Index index() const { return index_; } + // A Slice represents a contiguous portion of a memory allocation. It is used + // to identify the memory range that a LogicalBuffer corresponds to. + class Slice { + public: + Slice() {} + Slice(const BufferAllocation* allocation, int64 offset, int64 size) + : allocation_(allocation), offset_(offset), size_(size) {} + + const BufferAllocation* allocation() const { return allocation_; } + Index index() const { return allocation_->index(); } + int64 offset() const { return offset_; } + int64 size() const { return size_; } + + bool operator==(const Slice& other) const { + return index() == other.index() && offset_ == other.offset_ && + size_ == other.size_; + } + bool operator!=(const Slice& other) const { return !(*this == other); } + bool operator<(const Slice& other) const { + if (index() != other.index()) return index() < other.index(); + if (offset_ != other.offset_) return offset_ < other.offset_; + return size_ < other.size_; + } + + // Returns true iff this slice's memory range has a non-empty intersection + // with the other slice's memory range. + bool OverlapsWith(const Slice& other) const { + const int64 end = offset_ + size_; + const int64 other_end = other.offset_ + other.size_; + return index() == other.index() && offset_ < other_end && + end > other.offset_; + } + + struct Hasher { + size_t operator()(Slice s) const; + }; + + string ToString() const; + + private: + const BufferAllocation* allocation_ = nullptr; + int64 offset_ = 0; + int64 size_ = 0; + }; + + // GetSlice returns the Slice of contiguous memory that holds the value + // described by the given 'buffer'. + // REQUIRES: 'buffer' must be assigned to this allocation. + Slice GetSlice(const LogicalBuffer& buffer) const; string ToString() const; + BufferAllocationProto ToProto() const; // Whether the buffer is a parameter to or live out of the entry computation. bool IsInputOrOutput() const { @@ -137,6 +196,21 @@ class BufferAllocation { } private: + // Only BufferAssigner and BufferAssignment can modify BufferAllocation. + friend class BufferAssigner; + friend class BufferAssignment; + + // Adds a LogicalBuffer to the set assigned to this buffer. + void AddAssignment(const LogicalBuffer& buffer, int64 offset, int64 size); + + void set_entry_computation_parameter(int64 parameter_number) { + is_entry_computation_parameter_ = true; + parameter_number_ = parameter_number; + } + void set_maybe_live_out(bool value) { maybe_live_out_ = value; } + void set_index(Index index) { index_ = index; } + void set_size(int64 size) { size_ = size; } + // The index of the allocation in the BufferAssignment. 
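The `OverlapsWith` predicate above treats slices of different allocations as never overlapping; within one allocation it intersects the half-open byte ranges [offset, offset + size). Below is a tiny self-contained check of the same arithmetic (`RangesOverlap` is a hypothetical helper written for illustration, not part of this change); the class's private members resume immediately after it.

```c++
#include <cassert>
#include <cstdint>

// Mirrors Slice::OverlapsWith for two slices of the same allocation:
// half-open ranges intersect iff each one starts before the other ends.
bool RangesOverlap(int64_t offset_a, int64_t size_a, int64_t offset_b,
                   int64_t size_b) {
  return offset_a < offset_b + size_b && offset_b < offset_a + size_a;
}

int main() {
  assert(!RangesOverlap(0, 64, 64, 32));  // adjacent slices do not overlap
  assert(RangesOverlap(0, 64, 32, 16));   // [32, 48) lies inside [0, 64)
  assert(!RangesOverlap(0, 0, 0, 64));    // an empty slice overlaps nothing
  return 0;
}
```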
Index index_; @@ -149,6 +223,9 @@ class BufferAllocation { // Whether this buffer is usable by more than one logical buffer. bool is_reusable_; + // Color of the allocation. + LogicalBuffer::Color color_; + // Whether this allocation holds an entry computation parameter. Entry // computation parameters are special because they have lifetimes which may // outlast the computation. @@ -164,12 +241,14 @@ class BufferAllocation { // might not actually escape. bool maybe_live_out_ = false; - // The set of buffers assigned to this allocation. - std::vector<const LogicalBuffer*> assigned_buffers_; + // Mapping from the set of buffers assigned to this allocation to their + // logical offsets and sizes. + tensorflow::gtl::FlatMap<const LogicalBuffer*, OffsetSize> assigned_buffers_; }; -// Add stream operator for nicer output of CHECK/RET_CHECK failures. +// Add stream operators for nicer output of CHECK/RET_CHECK failures. std::ostream& operator<<(std::ostream& out, const BufferAllocation& s); +std::ostream& operator<<(std::ostream& out, const BufferAllocation::Slice& s); // This class encapsulates an assignment of the LogicalBuffers in an XLA // module to a set of BufferAllocations. @@ -180,6 +259,11 @@ class BufferAssignment { return allocations_; } + // Returns the total size of the allocation holding all temporary buffers. + int64 temp_allocation_total_size() const { + return temp_allocation_total_size_; + } + // Returns whether the given buffer has been assigned an allocation. bool HasAllocation(const LogicalBuffer& buffer) const; @@ -192,29 +276,28 @@ class BufferAssignment { // with the given index. const BufferAllocation& GetAllocation(BufferAllocation::Index index) const; - // Builds and returns a vector containing the allocations which might contain - // the subvalue at the given index of given instruction. - std::set<BufferAllocation> GetAllocations(const HloInstruction* instruction, - const ShapeIndex& index) const; + // Builds and returns a vector containing the slices which might contain the + // subvalue at the given index of the given instruction. + std::set<BufferAllocation::Slice> GetAllSlices( + const HloInstruction* instruction, const ShapeIndex& index) const; // Convenience function which returns whether the top-level buffer of the // instruction (index == {}) is assigned an allocation. bool HasTopLevelAllocation(const HloInstruction* instruction) const; - // Convenience function which returns the unique buffer allocation containing - // the buffer at the given index of the given instruction. If an allocation is - // not assigned or the allocation cannot be determined at compile time then an - // error is returned. - StatusOr<const BufferAllocation*> GetUniqueAllocation( + // Convenience function which returns the unique slice containing the buffer + // at the given index of the given instruction. If a slice is not assigned or + // the slice cannot be determined at compile time then an error is returned. + StatusOr<BufferAllocation::Slice> GetUniqueSlice( const HloInstruction* instruction, const ShapeIndex& index) const; - // Like GetUniqueAllocation but fixes the index to the top-level of the shape + // Like GetUniqueSlice but fixes the index to the top-level of the shape // (index = {}). - StatusOr<const BufferAllocation*> GetUniqueTopLevelAllocation( + StatusOr<BufferAllocation::Slice> GetUniqueTopLevelSlice( const HloInstruction* instruction) const; - // Like GetUniqueTopLevelAllocation but returns the allocation for the output - // of the entry computation of the HLO module (ie, the result of the XLA + // Like GetUniqueTopLevelSlice but returns the slice for the output of the + // entry computation of the HLO module (ie, the result of the XLA // computation).
- StatusOr<const BufferAllocation*> GetUniqueTopLevelOutputAllocation() const; + StatusOr<BufferAllocation::Slice> GetUniqueTopLevelOutputSlice() const; // Returns the set of LogicalBuffers which may be the source of the value at the // given index and instruction. @@ -223,36 +306,75 @@ class BufferAssignment { return GetPointsToSet(instruction).element(index); } + // Returns true if 'hlo_a{shape_index_a}' and 'hlo_b{shape_index_b}' + // share the same BufferAllocation::Slice. + // Returns false otherwise. + // REQUIRES: BufferAssignment assigned allocations to both instructions. + bool SharesSliceAtIndex(const HloInstruction* hlo_a, + const ShapeIndex& shape_index_a, + const HloInstruction* hlo_b, + const ShapeIndex& shape_index_b) const; + // Returns the underlying points-to analysis used for this assignment. const TuplePointsToAnalysis& points_to_analysis() const { return liveness_->points_to_analysis(); } + // Returns the BufferLiveness object used to construct this assignment. + const BufferLiveness& liveness() const { return *liveness_; } + string ToString() const; + BufferAssignmentProto ToProto() const; + + // Statistics for the assignment. Values initialized to -1 are not always + // collected; fragmentation is only collected for instructions that have a + // sequential total ordering. + struct Stats { + int64 parameter_allocation_count = 0; + int64 parameter_allocation_bytes = 0; + int64 maybe_live_out_allocation_count = 0; + int64 maybe_live_out_allocation_bytes = 0; + int64 preallocated_temp_allocation_count = 0; + int64 preallocated_temp_allocation_bytes = 0; + int64 preallocated_temp_fragmentation_bytes = -1; + int64 total_allocation_count = 0; + int64 total_allocation_bytes = 0; + int64 total_fragmentation_bytes = -1; + + string ToString() const; + }; + const Stats& GetStats() const { return stats_; } private: // Only BufferAssigner can build or modify BufferAssignments. friend class BufferAssigner; explicit BufferAssignment(const HloModule* module, - std::unique_ptr<BufferLiveness> liveness) - : module_(module), liveness_(std::move(liveness)) {} + std::unique_ptr<BufferLiveness> liveness, + int64 alignment, + LogicalBuffer::SizeFunction buffer_size) + : module_(module), + liveness_(std::move(liveness)), + alignment_(alignment), + buffer_size_(std::move(buffer_size)) {} - // Creates and returns a new BufferAllocation. Ownership is maintained - // internally. The allocation initially has only the given LogicalBuffer - // assigned to it. `is_thread_local` indicates whether this buffer needs to be - // thread-local. + // Creates and returns a new BufferAllocation, with no assigned + // LogicalBuffers. Ownership is maintained internally. + BufferAllocation* NewEmptyAllocation(int64 size, bool is_thread_local, + bool is_reusable, + LogicalBuffer::Color color); + + // Helper that calls NewEmptyAllocation and AddAssignment in one call, + // creating an allocation containing a single LogicalBuffer. BufferAllocation* NewAllocation(const LogicalBuffer& buffer, int64 size, bool is_thread_local, bool is_reusable); - // Adds a LogicalBuffer to the set assigned to the given allocation. If - // colocated_buffer is true, then the logical buffer is an alias of another - // buffer assigned to this allocation. - void AddAssignment(const LogicalBuffer& buffer, BufferAllocation* allocation, - bool colocated_buffer); + // Adds a LogicalBuffer to the set assigned to the given allocation. + void AddAssignment(BufferAllocation* allocation, const LogicalBuffer& buffer, + int64 offset, int64 size); - // Returns the BufferLiveness object used to construct this assignment.
- const BufferLiveness& liveness() { return *liveness_; } + // Returns the HloModule used to construct this assignment. + const HloModule& module() const { return *module_; } // Convenience function which returns the PointsToSet for the given // instruction. Extracted from the liveness object. @@ -262,15 +384,31 @@ class BufferAssignment { BufferAllocation* GetMutableAssignedAllocation(const LogicalBuffer& buffer); BufferAllocation* GetMutableAllocation(BufferAllocation::Index index); + // Combines allocations of temporary buffers into one big BufferAllocation. + void CombineTempAllocations(); + + // Computes stats for the assignment, to be retrieved by GetStats. + Status ComputeSummaryStats(); + // The vector of buffer allocations. Indexed by BufferAllocation::Index. std::vector<BufferAllocation> allocations_; + // The total size of all temporary buffers. + int64 temp_allocation_total_size_ = 0; + // Maps Buffers to the index of the BufferAllocation which holds the buffer. - std::map<const LogicalBuffer*, BufferAllocation::Index> + tensorflow::gtl::FlatMap<const LogicalBuffer*, BufferAllocation::Index> allocation_index_for_buffer_; const HloModule* module_; - std::unique_ptr<BufferLiveness> liveness_; + const std::unique_ptr<BufferLiveness> liveness_; + const int64 alignment_; + + // Function which returns the buffer size for a given logical buffer (shape). + LogicalBuffer::SizeFunction buffer_size_; + + Stats stats_; + std::vector<HeapSimulatorTrace> heap_simulator_traces_; TF_DISALLOW_COPY_AND_ASSIGN(BufferAssignment); }; @@ -280,50 +418,61 @@ class BufferAssigner { public: // Build and return a BufferAssignment for the given module. The given // HloOrdering is used to determine buffer liveness. buffer_size is a function - // which returns the size of a LogicalBuffer. If hlos_to_allocate is not null - // then only instructions in this vector are considered for buffer - // assignment. If hlos_to_allocate is null then all instructions are - // considered. If 'colocate_related_buffers' is true, related LogicalBuffers - // will be colocated in the same allocation (i.e buffers for while result - // will share an allocation with buffers related to that same while - // instruction: init operand, condition/body parameter and body result). + // which returns the size of a LogicalBuffer. Alignment is the minimum + // alignment of any buffer. allow_input_output_aliasing specifies whether + // input buffers are allowed to be reused as output buffers by the client code. static StatusOr<std::unique_ptr<BufferAssignment>> Run( const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering, - LogicalBuffer::SizeFunction buffer_size, bool colocate_related_buffers, - const std::vector<const HloInstruction*>* hlos_to_allocate = nullptr); - - // Overload of Run which uses ShapeUtil::ByteSizeOf to determine buffer size - // and assigns buffers to all HLO instructions in the module. - static StatusOr<std::unique_ptr<BufferAssignment>> Run( - const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering, - int64 pointer_size); + LogicalBuffer::SizeFunction buffer_size, int64 alignment, + bool allow_input_output_aliasing = false, + TuplePointsToAnalysis::Colorer colorer = + TuplePointsToAnalysis::DefaultColorer()); private: - explicit BufferAssigner(LogicalBuffer::SizeFunction buffer_size, - bool colocate_related_buffers) - : buffer_size_(std::move(buffer_size)), - colocate_related_buffers_(colocate_related_buffers) {} + BufferAssigner(int64 alignment, bool allow_input_output_aliasing, + TuplePointsToAnalysis::Colorer colorer) + : alignment_(alignment), + allow_input_output_aliasing_(allow_input_output_aliasing), + colorer_(colorer) {} virtual ~BufferAssigner() = default; // Create a buffer assignment.
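Before the private CreateAssignment machinery below, here is a hedged sketch of how a caller might invoke the public `Run` entry point declared above. It mirrors the test helpers later in this diff; the wrapper name and the byte-size lambda are illustrative, and `DependencyHloOrdering` is just one possible ordering.

```c++
#include <memory>

#include "tensorflow/compiler/xla/ptr_util.h"
#include "tensorflow/compiler/xla/service/buffer_assignment.h"
#include "tensorflow/compiler/xla/service/hlo_ordering.h"
#include "tensorflow/compiler/xla/shape_util.h"

// Illustrative wrapper: assign buffers using pointer-sized byte counts,
// a 1-byte alignment constraint, and the default colorer.
std::unique_ptr<xla::BufferAssignment> AssignWithDefaults(
    xla::HloModule* module) {
  return xla::BufferAssigner::Run(
             module, xla::MakeUnique<xla::DependencyHloOrdering>(module),
             [](const xla::LogicalBuffer& buffer) {
               return xla::ShapeUtil::ByteSizeOf(buffer.shape(),
                                                 sizeof(void*));
             },
             /*alignment=*/1)
      .ConsumeValueOrDie();
}
```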
StatusOr<std::unique_ptr<BufferAssignment>> CreateAssignment( const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering, - const std::vector<const HloInstruction*>* hlos_to_allocate = nullptr); + LogicalBuffer::SizeFunction buffer_size); // Assigns buffers to the instructions in the given computation. "assignment" // is modified to reflect the new buffer assignments. If is_thread_local is // true, then all assigned buffers have the is_thread_local flag set to - // true. If hlos_to_allocate is not null it indicates which HLOs to include in - // buffer assignment. If null, all instructions in the computation are - // included. - tensorflow::Status AssignBuffersForComputation( + // true. + Status AssignBuffersForComputation( const HloComputation* computation, bool is_thread_local, - const tensorflow::gtl::FlatSet<const HloInstruction*>* hlos_to_allocate, const tensorflow::gtl::FlatSet<const LogicalBuffer*>& colocated_buffers, const tensorflow::gtl::FlatSet<BufferAllocation::Index>& colocated_allocations, + tensorflow::gtl::FlatMap<const HloComputation*, + tensorflow::gtl::FlatSet<const LogicalBuffer*>>* + buffers_to_assign_sequentially, BufferAssignment* assignment); + // Assigns 'buffers_to_assign_sequentially' using heap simulation, assuming + // the HLO instructions will be executed in the sequential order given by + // assignment->liveness().hlo_ordering().SequentialOrder. If + // 'run_whole_module_heap_simulation' is true, the heap simulation will be run + // assuming all global computations are sequentially ordered. + Status AssignBuffersWithSequentialOrdering( + const tensorflow::gtl::FlatMap< + const HloComputation*, + tensorflow::gtl::FlatSet<const LogicalBuffer*>>& + buffers_to_assign_sequentially, + bool run_whole_module_heap_simulation, BufferAssignment* assignment); + + // Uses the results of the heap simulator to create a single allocation, with + // LogicalBuffers packed to specific offsets. + void AssignBuffersFromHeapSimulator(const HeapSimulator::Result& result, + BufferAssignment* assignment, + LogicalBuffer::Color color); + // Tries to assign the given instruction to the given buffer. Returns whether // the assignment was successful. bool MaybeAssignBuffer(BufferAllocation* allocation, @@ -340,7 +489,8 @@ class BufferAssigner { // ColocatedBufferSet aggregates a set of related LogicalBuffers from 'module' // which should be colocated in the same buffer allocation. void BuildColocatedBufferSets( - const HloModule* module, const TuplePointsToAnalysis& points_to_analysis, + const HloModule* module, const BufferLiveness& buffer_liveness, + const LogicalBuffer::SizeFunction& buffer_size, std::vector<ColocatedBufferSet>* colocated_buffer_sets); // For each buffer set in 'colocated_buffer_sets', assigns all buffers in the @@ -357,13 +507,32 @@ class BufferAssigner { const std::vector<const LogicalBuffer*>& colocated_set, std::vector<ColocatedBufferSet>* colocated_buffer_sets); - const HloModule* module_; + // Conceptually the same as AddSetToColocatedBufferSets, but specific to the + // colocated buffers for while instructions. + void AddWhileSetToColocatedBufferSets( + const std::vector<const LogicalBuffer*>& colocated_set, + const LogicalBuffer* while_init_buffer, const HloInstruction* while_hlo, + const HloComputation& computation, const BufferLiveness& buffer_liveness, + const LogicalBuffer::SizeFunction& buffer_size, + std::vector<ColocatedBufferSet>* colocated_buffer_sets); - // Function which returns the buffer size for a given shape. - LogicalBuffer::SizeFunction buffer_size_; + // Split a set of buffers into several sets, each of which contains buffers + // colored with the same color. + tensorflow::gtl::FlatMap<LogicalBuffer::Color, + tensorflow::gtl::FlatSet<const LogicalBuffer*>, + LogicalBuffer::Color::Hasher> + SplitBuffersByColor( + const tensorflow::gtl::FlatSet<const LogicalBuffer*>& buffers); - // Indicates whether related buffers should share the same buffer allocation.
- const bool colocate_related_buffers_; + // Minimum alignment of any buffer. + int64 alignment_; + + // If true, buffer assignment assumes that input parameter buffers and output + // buffers can be shared if their sizes match. + bool allow_input_output_aliasing_; + + // Functor used to assign colors to newly allocated logical buffers. + TuplePointsToAnalysis::Colorer colorer_; TF_DISALLOW_COPY_AND_ASSIGN(BufferAssigner); }; diff --git a/tensorflow/compiler/xla/service/buffer_assignment_test.cc b/tensorflow/compiler/xla/service/buffer_assignment_test.cc index b8841c35f68..892f67a8812 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment_test.cc +++ b/tensorflow/compiler/xla/service/buffer_assignment_test.cc @@ -18,16 +18,23 @@ limitations under the License. #include #include #include +#include #include #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/ptr_util.h" +#include "tensorflow/compiler/xla/service/call_graph.h" #include "tensorflow/compiler/xla/service/computation_tracker.h" +#include "tensorflow/compiler/xla/service/copy_insertion.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" +#include "tensorflow/compiler/xla/service/flatten_call_graph.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_ordering.h" #include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -74,6 +81,24 @@ class BufferAssignmentTest : public HloTestBase { BufferAssignmentTest() : computation_tracker_() {} ~BufferAssignmentTest() override {} + std::unique_ptr<BufferAssignment> RunBufferAssignment(HloModule* module, + int64 alignment = 1) { + return BufferAssigner::Run( + module, MakeUnique<DependencyHloOrdering>(module), + backend_->compiler()->BufferSizeBytesFunction(), alignment) + .ConsumeValueOrDie(); + } + + std::unique_ptr<BufferAssignment> RunColoredBufferAssignment( + HloModule* module, TuplePointsToAnalysis::Colorer colorer, + int64 alignment = 1) { + return BufferAssigner::Run(module, + MakeUnique<DependencyHloOrdering>(module), + backend_->compiler()->BufferSizeBytesFunction(), + alignment, false, std::move(colorer)) + .ConsumeValueOrDie(); + } + // Builds an x+1.0 computation to use in a Map.
std::unique_ptr<HloComputation> BuildMapComputationPlus1(const string& name) { auto builder = HloComputation::Builder(name); @@ -145,7 +170,7 @@ class BufferAssignmentTest : public HloTestBase { const BufferAssignment& buffers, HloInstruction* hlo) { LOG(INFO) << "Checking input: " << hlo->ToString(); const BufferAllocation& buffer = - *buffers.GetUniqueTopLevelAllocation(hlo).ConsumeValueOrDie(); + *buffers.GetUniqueTopLevelSlice(hlo).ConsumeValueOrDie().allocation(); EXPECT_EQ(hlo->parameter_number(), buffer.parameter_number()); return buffer; } @@ -163,11 +188,13 @@ class BufferAssignmentTest : public HloTestBase { const BufferAllocation& GetAllocation(const BufferAssignment& buffers, const HloInstruction* hlo, const ShapeIndex& index) { - return *buffers.GetUniqueAllocation(hlo, index).ConsumeValueOrDie(); + return *buffers.GetUniqueSlice(hlo, index).ConsumeValueOrDie().allocation(); } const BufferAllocation& GetTopLevelAllocation(const BufferAssignment& buffers, const HloInstruction* hlo) { - return *buffers.GetUniqueTopLevelAllocation(hlo).ConsumeValueOrDie(); + return *buffers.GetUniqueTopLevelSlice(hlo) + .ConsumeValueOrDie() + .allocation(); } // Verifies that all instructions in the given instruction list except @@ -195,32 +222,6 @@ class BufferAssignmentTest : public HloTestBase { return total_size; } - // Returns true if the buffers assigned to instructions in "a" are distinct - // from the buffers assigned to those in "b" (ie, intersection is empty). - bool BuffersDistinct(const std::vector<const HloInstruction*>& a, - const std::vector<const HloInstruction*>& b, - const BufferAssignment& assignment) { - std::set<BufferAllocation::Index> a_buffers; - for (const HloInstruction* instruction : a) { - if (assignment.HasTopLevelAllocation(instruction)) { - a_buffers.insert(assignment.GetUniqueTopLevelAllocation(instruction) - .ConsumeValueOrDie() - ->index()); - } - } - - for (const HloInstruction* instruction : b) { - if (assignment.HasTopLevelAllocation(instruction)) { - if (a_buffers.count(assignment.GetUniqueTopLevelAllocation(instruction) - .ConsumeValueOrDie() - ->index())) { - return false; - } - } - } - return true; - } - // Computation tracker for nested computations. ComputationTracker computation_tracker_; @@ -235,12 +236,28 @@ class BufferAssignmentTest : public HloTestBase { Shape t_s32_f32v10_ = ShapeUtil::MakeTupleShape({s32_, f32vec10_}); }; -namespace { -std::unique_ptr<BufferAssignment> RunBufferAssignment(HloModule* module) { - return BufferAssigner::Run(module, MakeUnique<DependencyHloOrdering>(module), - /*pointer_size=*/sizeof(void*)) - .ConsumeValueOrDie(); -} +// Returns true if the buffers assigned to instructions in "a" are distinct +// from the buffers assigned to those in "b" (i.e., intersection is empty). +static bool BuffersDistinct(const std::vector<const HloInstruction*>& a, + const std::vector<const HloInstruction*>& b, + const BufferAssignment& assignment) { + std::set<BufferAllocation::Slice> a_slices; + for (const HloInstruction* instruction : a) { + if (assignment.HasTopLevelAllocation(instruction)) { + a_slices.insert( + assignment.GetUniqueTopLevelSlice(instruction).ConsumeValueOrDie()); + } + } + + for (const HloInstruction* instruction : b) { + if (assignment.HasTopLevelAllocation(instruction)) { + if (a_slices.count(assignment.GetUniqueTopLevelSlice(instruction) + .ConsumeValueOrDie())) { + return false; + } + } + } + return true; } // Tests a computation consisting of a single scalar constant node.
@@ -248,7 +265,7 @@ TEST_F(BufferAssignmentTest, ScalarConstant) { auto builder = HloComputation::Builder(TestName()); auto const0 = builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); auto buffers = RunBufferAssignment(module.get()); @@ -266,7 +283,7 @@ TEST_F(BufferAssignmentTest, BufferForConst) { LiteralUtil::CreateR1({4.1f, 4.2f, 4.3f, 4.4f}))); auto add = builder.AddInstruction( HloInstruction::CreateBinary(f32vec4_, HloOpcode::kAdd, const0, const1)); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); auto buffers = RunBufferAssignment(module.get()); @@ -284,7 +301,7 @@ TEST_F(BufferAssignmentTest, BufferForOutputConst) { LiteralUtil::CreateR1({1.1f, 2.2f, 3.3f, 4.4f}))); auto copy = builder.AddInstruction( HloInstruction::CreateUnary(const0->shape(), HloOpcode::kCopy, const0)); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); auto buffers = RunBufferAssignment(module.get()); @@ -311,7 +328,7 @@ TEST_F(BufferAssignmentTest, Basic) { HloInstruction::CreateBinary(f32vec100_, HloOpcode::kAdd, mul, param1)); auto sub = builder.AddInstruction(HloInstruction::CreateBinary( f32vec100_, HloOpcode::kSubtract, add, param1)); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); auto buffers = RunBufferAssignment(module.get()); @@ -331,7 +348,113 @@ TEST_F(BufferAssignmentTest, Basic) { // The add node can reuse the mul node's buffer. const BufferAllocation& add_buffer = GetTopLevelAllocation(*buffers, add); - EXPECT_EQ(add_buffer.index(), add_buffer.index()); + EXPECT_EQ(add_buffer.index(), mul_buffer.index()); + + // The sub node has a valid output buffer assigned. + GetAssignedOutputAllocation(*buffers, sub); +} + +TEST_F(BufferAssignmentTest, BasicUniquelyColored) { + // paramscalar ------- (mul) -- (add) -- (sub) + // / / / + // param0[100] -------/ / / + // / / + // param1[100] --------------/--------/ + // The output of each op is colored with a different color, so we can not + // share anything. + auto builder = HloComputation::Builder(TestName()); + auto paramscalar = + builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, "")); + auto param0 = builder.AddInstruction( + HloInstruction::CreateParameter(1, f32vec100_, "")); + auto param1 = builder.AddInstruction( + HloInstruction::CreateParameter(2, f32vec100_, "")); + auto mul = builder.AddInstruction(HloInstruction::CreateBinary( + f32vec100_, HloOpcode::kMultiply, paramscalar, param0)); + auto add = builder.AddInstruction( + HloInstruction::CreateBinary(f32vec100_, HloOpcode::kAdd, mul, param1)); + auto sub = builder.AddInstruction(HloInstruction::CreateBinary( + f32vec100_, HloOpcode::kSubtract, add, param1)); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); + + auto buffers = RunColoredBufferAssignment( + module.get(), + [](const HloInstruction* instruction, const ShapeIndex& index) { + static int64 serial = 0; + return LogicalBuffer::Color(serial++); + }); + + // Distinct input buffers were assigned for parameters. 
+ BufferAllocation paramscalar_buffer = + GetAssignedInputAllocation(*buffers, paramscalar); + BufferAllocation param0_buffer = GetAssignedInputAllocation(*buffers, param0); + BufferAllocation param1_buffer = GetAssignedInputAllocation(*buffers, param1); + EXPECT_NE(paramscalar_buffer.index(), param0_buffer.index()); + EXPECT_NE(paramscalar_buffer.index(), param1_buffer.index()); + EXPECT_NE(param0_buffer.index(), param1_buffer.index()); + + // The mul node has a valid buffer assigned, doesn't share with input. + const BufferAllocation& mul_buffer = GetTopLevelAllocation(*buffers, mul); + EXPECT_NE(mul_buffer.index(), param0_buffer.index()); + + // The add node can not reuse the mul node's buffer due to coloring. + const BufferAllocation& add_buffer = GetTopLevelAllocation(*buffers, add); + EXPECT_NE(add_buffer.index(), mul_buffer.index()); + + // The sub node has a valid output buffer assigned. + GetAssignedOutputAllocation(*buffers, sub); +} + +TEST_F(BufferAssignmentTest, BasicPartiallyColored) { + // paramscalar ------- (mul) -- (add) -- (sub) + // / / / + // param0[100] -------/ / / + // / / + // param1[100] --------------/--------/ + // The output of the mul and the add have the color 1, and the other buffers + // have the color 0, which allows the mul and add to share buffers. + auto builder = HloComputation::Builder(TestName()); + auto paramscalar = + builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, "")); + auto param0 = builder.AddInstruction( + HloInstruction::CreateParameter(1, f32vec100_, "")); + auto param1 = builder.AddInstruction( + HloInstruction::CreateParameter(2, f32vec100_, "")); + auto mul = builder.AddInstruction(HloInstruction::CreateBinary( + f32vec100_, HloOpcode::kMultiply, paramscalar, param0)); + auto add = builder.AddInstruction( + HloInstruction::CreateBinary(f32vec100_, HloOpcode::kAdd, mul, param1)); + auto sub = builder.AddInstruction(HloInstruction::CreateBinary( + f32vec100_, HloOpcode::kSubtract, add, param1)); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); + + auto buffers = RunColoredBufferAssignment( + module.get(), + [](const HloInstruction* instruction, const ShapeIndex& index) { + return (instruction->opcode() == HloOpcode::kAdd || + instruction->opcode() == HloOpcode::kMultiply) + ? LogicalBuffer::Color(1) + : LogicalBuffer::Color(0); + }); + + // Distinct input buffers were assigned for parameters. + BufferAllocation paramscalar_buffer = + GetAssignedInputAllocation(*buffers, paramscalar); + BufferAllocation param0_buffer = GetAssignedInputAllocation(*buffers, param0); + BufferAllocation param1_buffer = GetAssignedInputAllocation(*buffers, param1); + EXPECT_NE(paramscalar_buffer.index(), param0_buffer.index()); + EXPECT_NE(paramscalar_buffer.index(), param1_buffer.index()); + EXPECT_NE(param0_buffer.index(), param1_buffer.index()); + + // The mul node has a valid buffer assigned, doesn't share with input. + const BufferAllocation& mul_buffer = GetTopLevelAllocation(*buffers, mul); + EXPECT_NE(mul_buffer.index(), param0_buffer.index()); + + // The add node can reuse the mul node's buffer. + const BufferAllocation& add_buffer = GetTopLevelAllocation(*buffers, add); + EXPECT_EQ(add_buffer.index(), mul_buffer.index()); // The sub node has a valid output buffer assigned. 
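The two colored tests above pin down the contract: logical buffers may share an allocation only when their colors match, so a colorer is simply a function from an instruction and shape index to a LogicalBuffer::Color. As one more hypothetical example in the same style (not part of this change), a colorer that isolates the root instruction's buffer from all reuse might look like the sketch below; the sub-node check mentioned above resumes right after it.

```c++
// Illustrative only: gives the root instruction a private color so no
// other buffer may share its allocation, and leaves every other buffer
// in the default color 0.
auto isolate_root_colorer = [](const xla::HloInstruction* instruction,
                               const xla::ShapeIndex& index) {
  const bool is_root =
      instruction == instruction->parent()->root_instruction();
  return xla::LogicalBuffer::Color(is_root ? 1 : 0);
};
```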
GetAssignedOutputAllocation(*buffers, sub); @@ -361,7 +484,7 @@ TEST_F(BufferAssignmentTest, MultipleUsersForNode) { HloInstruction::CreateBinary(f32vec100_, HloOpcode::kAdd, mul, param1)); auto sub = builder.AddInstruction( HloInstruction::CreateBinary(f32vec100_, HloOpcode::kSubtract, add, mul)); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); auto buffers = RunBufferAssignment(module.get()); @@ -396,7 +519,7 @@ TEST_F(BufferAssignmentTest, TrivialMap) { // param0[100x10] ---> (map x+1) // // Builds the map function. - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); auto map_computation = module->AddEmbeddedComputation(BuildMapComputationPlus1("f32+1")); auto inner_last = map_computation->root_instruction(); @@ -451,7 +574,7 @@ TEST_F(BufferAssignmentTest, CannotReuseInputBufferOfReduce) { // out-of-order reductions could overwrite an element before a use.) // // param0[100] --- (exp1) --- (exp2) --- (reduce x+1) --- (exp3) - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); auto reduce_computation = module->AddEmbeddedComputation(BuildMapComputationPlus1("f32+1")); @@ -502,7 +625,7 @@ TEST_F(BufferAssignmentTest, ExampleWhile) { // const4[f32[4]] --- tuple --- while[condition, body] // // Builds the nested condition and body. - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); auto condition_computation = module->AddEmbeddedComputation(BuildWhileConditionComputation("if<4")); auto body_computation = @@ -553,15 +676,14 @@ TEST_F(BufferAssignmentTest, ExampleWhile) { // Check that buffer for each subshape of 'while_op' shares allocation with // corresponding buffer from while body computation at same index. - TF_CHECK_OK(ShapeUtil::ForEachSubshape( + ShapeUtil::ForEachSubshape( while_op->shape(), [this, &buffers, while_op, body_root](const Shape& /*subshape*/, const ShapeIndex& index) { auto while_op_allocation = GetAllocation(*buffers, while_op, index); auto body_root_allocation = GetAllocation(*buffers, body_root, index); EXPECT_EQ(while_op_allocation.index(), body_root_allocation.index()); - return Status::OK(); - })); + }); // Log size information for inspection. 
LOG(INFO) << "LogicalBuffer count " << buffers->Allocations().size() @@ -583,7 +705,7 @@ TEST_F(BufferAssignmentTest, UnaryOpReuseChain) { auto neg = builder.AddInstruction( HloInstruction::CreateUnary(f32vec100_, HloOpcode::kNegate, exp2)); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); auto assignment = RunBufferAssignment(module.get()); @@ -608,11 +730,11 @@ TEST_F(BufferAssignmentTest, ReuseNonOperandBuffer) { auto negate = builder.AddInstruction( HloInstruction::CreateUnary(f32vec100_, HloOpcode::kNegate, param0)); auto slice = builder.AddInstruction( - HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10})); + HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10}, {1})); auto broadcast = builder.AddInstruction( HloInstruction::CreateBroadcast(f32a100x10_, slice, {1})); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); auto assignment = RunBufferAssignment(module.get()); @@ -640,12 +762,12 @@ TEST_F(BufferAssignmentTest, NoReuseLiveBuffer) { auto negate = builder.AddInstruction( HloInstruction::CreateUnary(f32vec100_, HloOpcode::kNegate, param0)); auto slice = builder.AddInstruction( - HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10})); + HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10}, {1})); auto broadcast = builder.AddInstruction( HloInstruction::CreateBroadcast(f32a100x10_, slice, {1})); builder.AddInstruction(HloInstruction::CreateTuple({negate, broadcast})); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); auto assignment = RunBufferAssignment(module.get()); @@ -677,12 +799,12 @@ TEST_F(BufferAssignmentTest, NoReuseAliasedBuffer) { auto tuple_element = builder.AddInstruction( HloInstruction::CreateGetTupleElement(f32vec100_, tuple, 0)); auto slice = builder.AddInstruction( - HloInstruction::CreateSlice(f32vec10_, tuple_element, {0}, {10})); + HloInstruction::CreateSlice(f32vec10_, tuple_element, {0}, {10}, {1})); auto broadcast = builder.AddInstruction( HloInstruction::CreateBroadcast(f32a100x10_, slice, {1})); builder.AddInstruction(HloInstruction::CreateTuple({tuple, broadcast})); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); auto assignment = RunBufferAssignment(module.get()); @@ -703,30 +825,29 @@ TEST_F(BufferAssignmentTest, DoNotReuseOversizedOutputBuffer) { // // param ---> (negate) ---> (slice) ---> (broadcast) // - // The negate should *not* share a buffer with broadcast. + // Neither negate nor slice may share a buffer with broadcast. auto builder = HloComputation::Builder(TestName()); auto param0 = builder.AddInstruction( HloInstruction::CreateParameter(0, f32vec100_, "param0")); // Negate output is 100 elements. auto negate = builder.AddInstruction( HloInstruction::CreateUnary(f32vec100_, HloOpcode::kNegate, param0)); + // Slice output is 10 elements. auto slice = builder.AddInstruction( - HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10})); + HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10}, {1})); // Broadcast output is 40 elements. 
auto broadcast = builder.AddInstruction(HloInstruction::CreateBroadcast( ShapeUtil::MakeShape(F32, {10, 4}), slice, {0})); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); auto assignment = RunBufferAssignment(module.get()); - // The instructions should not share buffers. + // The broadcast output buffer cannot be shared. EXPECT_NE(GetTopLevelAllocation(*assignment, broadcast), GetTopLevelAllocation(*assignment, negate)); EXPECT_NE(GetTopLevelAllocation(*assignment, broadcast), GetTopLevelAllocation(*assignment, slice)); - EXPECT_NE(GetTopLevelAllocation(*assignment, negate), - GetTopLevelAllocation(*assignment, slice)); } TEST_F(BufferAssignmentTest, ReuseOutputBufferIfExactlySized) { @@ -745,12 +866,12 @@ TEST_F(BufferAssignmentTest, ReuseOutputBufferIfExactlySized) { auto negate = builder.AddInstruction( HloInstruction::CreateUnary(f32vec100_, HloOpcode::kNegate, param0)); auto slice = builder.AddInstruction( - HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10})); + HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10}, {1})); // Broadcast output is 40 elements. auto broadcast = builder.AddInstruction(HloInstruction::CreateBroadcast( ShapeUtil::MakeShape(F32, {10, 10}), slice, {0})); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); auto assignment = RunBufferAssignment(module.get()); @@ -773,38 +894,37 @@ TEST_F(BufferAssignmentTest, DoNotReuseOversizedOutputBufferInTuple) { // // param ---> (negate) ---> (slice) ---> (broadcast) --> (tuple) // - // The negate should *not* share a buffer with broadcast. + // Neither negate nor slice may share a buffer with broadcast. auto builder = HloComputation::Builder(TestName()); auto param0 = builder.AddInstruction( HloInstruction::CreateParameter(0, f32vec100_, "param0")); // Negate output is 100 elements. auto negate = builder.AddInstruction( HloInstruction::CreateUnary(f32vec100_, HloOpcode::kNegate, param0)); + // Slice output is 10 elements. auto slice = builder.AddInstruction( - HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10})); + HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10}, {1})); // Broadcast output is 40 elements. auto broadcast = builder.AddInstruction(HloInstruction::CreateBroadcast( ShapeUtil::MakeShape(F32, {10, 4}), slice, {0})); builder.AddInstruction(HloInstruction::CreateTuple({broadcast})); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); auto assignment = RunBufferAssignment(module.get()); - // The instructions should not share buffers. + // The broadcast output buffer cannot be shared. EXPECT_NE(GetTopLevelAllocation(*assignment, broadcast), GetTopLevelAllocation(*assignment, negate)); EXPECT_NE(GetTopLevelAllocation(*assignment, broadcast), GetTopLevelAllocation(*assignment, slice)); - EXPECT_NE(GetTopLevelAllocation(*assignment, negate), - GetTopLevelAllocation(*assignment, slice)); } TEST_F(BufferAssignmentTest, EmbeddedComputationBuffers) { // Verify that buffers for embedded computations are properly marked as // thread-local and that embedded parameters are not marked as // is_entry_computation_parameter. 
- auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); auto vec_shape = ShapeUtil::MakeShape(F32, {42}); auto scalar_shape = ShapeUtil::MakeShape(F32, {}); @@ -849,8 +969,7 @@ TEST_F(BufferAssignmentTest, EmbeddedComputationBuffers) { EXPECT_FALSE(map_root_alloc.maybe_live_out()); EXPECT_TRUE(map_root_alloc.is_thread_local()); - // Allocations for the call computation should not be thread-local and not - // live-out. + // Allocations for the call computation should not be thread-local. auto& call_param_alloc = GetTopLevelAllocation(*assignment, call_param); EXPECT_FALSE(call_param_alloc.is_entry_computation_parameter()); EXPECT_FALSE(call_param_alloc.maybe_live_out()); @@ -858,7 +977,6 @@ TEST_F(BufferAssignmentTest, EmbeddedComputationBuffers) { auto& call_root_alloc = GetTopLevelAllocation(*assignment, call_root); EXPECT_FALSE(call_root_alloc.is_entry_computation_parameter()); - EXPECT_FALSE(call_root_alloc.maybe_live_out()); EXPECT_FALSE(call_root_alloc.is_thread_local()); // Entry computation allocations can be marked liveout and @@ -883,7 +1001,7 @@ TEST_F(BufferAssignmentTest, TupleParameterAsOutput) { ShapeUtil::MakeShape(S32, {42})}), "param0")); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); auto assignment = RunBufferAssignment(module.get()); @@ -893,7 +1011,7 @@ TEST_F(BufferAssignmentTest, TupleParameterAsOutput) { // Verify each buffer allocation is marked as an entry computation parameter // and is liveout. - TF_CHECK_OK(ShapeUtil::ForEachSubshape( + ShapeUtil::ForEachSubshape( tuple_param->shape(), [this, &assignment, tuple_param](const Shape& /*subshape*/, const ShapeIndex& index) { @@ -901,8 +1019,7 @@ TEST_F(BufferAssignmentTest, TupleParameterAsOutput) { EXPECT_TRUE(allocation.is_entry_computation_parameter()); EXPECT_EQ(0, allocation.parameter_number()); EXPECT_TRUE(allocation.maybe_live_out()); - return Status::OK(); - })); + }); } TEST_F(BufferAssignmentTest, ElementOfNestedTupleParameterAsOutput) { @@ -919,7 +1036,7 @@ TEST_F(BufferAssignmentTest, ElementOfNestedTupleParameterAsOutput) { builder.AddInstruction(HloInstruction::CreateGetTupleElement( ShapeUtil::GetSubshape(tuple_param->shape(), {1}), tuple_param, 1)); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); auto assignment = RunBufferAssignment(module.get()); @@ -962,7 +1079,7 @@ TEST_F(BufferAssignmentTest, DISABLED_TupleConstantAsOutput) { LiteralUtil::MakeTuple({LiteralUtil::CreateR0(0).get(), LiteralUtil::CreateR0(1).get()}))); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); auto assignment = RunBufferAssignment(module.get()); @@ -976,7 +1093,7 @@ TEST_F(BufferAssignmentTest, TupleCustomCallAsOutput) { ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(PRED, {1, 2, 3, 4}), ShapeUtil::MakeShape(S32, {101})}), /*operands=*/{}, /*custom_call_target=*/"foo_function")); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); auto assignment = RunBufferAssignment(module.get()); @@ -991,7 +1108,7 @@ TEST_F(BufferAssignmentTest, TupleCustomCallAsOutput) { TEST_F(BufferAssignmentTest, TupleCallAsOutput) { // Test a computation which returns a tuple call value. 
- auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); auto elem_shape = f32vec4_; auto tuple_shape = ShapeUtil::MakeTupleShape({elem_shape}); @@ -1024,6 +1141,75 @@ TEST_F(BufferAssignmentTest, TupleCallAsOutput) { GetTopLevelAllocation(*assignment, sub_param)); } +TEST_F(BufferAssignmentTest, TupleChainedCallAsOutput) { + // Test a chain of calls with tuple output. The chain looks like: + // A: call(B, tuple(param)) + // B: call(C, param) + // C: call(D, param) + // D: param + auto module = CreateNewModule(); + auto elem_shape = f32vec4_; + auto tuple_shape = ShapeUtil::MakeTupleShape({elem_shape}); + + auto d_builder = HloComputation::Builder(TestName() + "_d"); + auto d_param = d_builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape, "d_param")); + auto d_computation = d_builder.Build(); + + auto c_builder = HloComputation::Builder(TestName() + "_c"); + auto c_param = c_builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape, "c_param")); + auto c_call = c_builder.AddInstruction( + HloInstruction::CreateCall(tuple_shape, {c_param}, d_computation.get())); + auto c_computation = c_builder.Build(); + + auto b_builder = HloComputation::Builder(TestName() + "_b"); + auto b_param = b_builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape, "b_param")); + auto b_call = b_builder.AddInstruction( + HloInstruction::CreateCall(tuple_shape, {b_param}, c_computation.get())); + auto b_computation = b_builder.Build(); + + auto a_builder = HloComputation::Builder(TestName()); + auto a_param = a_builder.AddInstruction( + HloInstruction::CreateParameter(0, elem_shape, "param")); + auto a_tuple = + a_builder.AddInstruction(HloInstruction::CreateTuple({a_param})); + auto a_call = a_builder.AddInstruction( + HloInstruction::CreateCall(tuple_shape, {a_tuple}, b_computation.get())); + auto a_computation = a_builder.Build(); + + // Add the computations in an order that doesn't match the dependency + // post-order, to shake out more possible bugs. + module->AddEmbeddedComputation(std::move(d_computation)); + module->AddEmbeddedComputation(std::move(c_computation)); + module->AddEntryComputation(std::move(a_computation)); + module->AddEmbeddedComputation(std::move(b_computation)); + + auto assignment = RunBufferAssignment(module.get()); + + // Buffers for call are co-located with the sub-computations. + EXPECT_EQ(GetAllocation(*assignment, a_call, /*index=*/{}), + GetAllocation(*assignment, b_call, /*index=*/{})); + EXPECT_EQ(GetAllocation(*assignment, b_call, /*index=*/{}), + GetAllocation(*assignment, c_call, /*index=*/{})); + EXPECT_EQ(GetAllocation(*assignment, c_call, /*index=*/{}), + GetAllocation(*assignment, d_param, /*index=*/{})); + EXPECT_EQ(GetAllocation(*assignment, a_call, /*index=*/{0}), + GetAllocation(*assignment, b_call, /*index=*/{0})); + EXPECT_EQ(GetAllocation(*assignment, b_call, /*index=*/{0}), + GetAllocation(*assignment, c_call, /*index=*/{0})); + EXPECT_EQ(GetAllocation(*assignment, c_call, /*index=*/{0}), + GetAllocation(*assignment, d_param, /*index=*/{0})); + // The parameters aren't aliased with anything. 
+ EXPECT_TRUE(BuffersDistinct({a_param}, {b_param}, *assignment)); + EXPECT_TRUE(BuffersDistinct({a_param}, {c_param}, *assignment)); + EXPECT_TRUE(BuffersDistinct({a_param}, {d_param}, *assignment)); + EXPECT_TRUE(BuffersDistinct({b_param}, {c_param}, *assignment)); + EXPECT_TRUE(BuffersDistinct({b_param}, {d_param}, *assignment)); + EXPECT_TRUE(BuffersDistinct({c_param}, {d_param}, *assignment)); +} + TEST_F(BufferAssignmentTest, BitcastAsOutput) { // Test a computation which returns a bitcast value. auto builder = HloComputation::Builder(TestName()); @@ -1032,7 +1218,7 @@ TEST_F(BufferAssignmentTest, BitcastAsOutput) { auto bitcast = builder.AddInstruction( HloInstruction::CreateUnary(param->shape(), HloOpcode::kBitcast, param)); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); auto assignment = RunBufferAssignment(module.get()); @@ -1058,7 +1244,7 @@ TEST_F(BufferAssignmentTest, AmbiguousBufferAsOutput) { auto select = builder.AddInstruction(HloInstruction::CreateTernary( tuple_shape, HloOpcode::kSelect, pred_param, tuple_param0, tuple_param1)); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); auto assignment = RunBufferAssignment(module.get()); @@ -1066,19 +1252,20 @@ TEST_F(BufferAssignmentTest, AmbiguousBufferAsOutput) { // buffer and receives its own allocation. auto select_alloc = GetTopLevelAllocation(*assignment, select); EXPECT_EQ(1, select_alloc.assigned_buffers().size()); - EXPECT_EQ(select, select_alloc.assigned_buffers()[0]->instruction()); + EXPECT_EQ(select, + select_alloc.assigned_buffers().begin()->first->instruction()); // The buffer for the tuple element of the select is forwarded from one its - // operands which cannot be determined statically. Therefore its allocation - // should include the allocations of both of the elements in the parameters. - auto element_allocations = assignment->GetAllocations(select, /*index=*/{0}); - EXPECT_EQ(2, element_allocations.size()); - EXPECT_MATCH(testing::SetToVec(element_allocations), - testing::UnorderedMatcher( - *assignment->GetUniqueAllocation(tuple_param0, /*index=*/{0}) - .ConsumeValueOrDie(), - *assignment->GetUniqueAllocation(tuple_param1, /*index=*/{0}) - .ConsumeValueOrDie())); + // operands which cannot be determined statically. Therefore its slices + // should include the slices of both of the elements in the parameters. 
+ auto element_slices = assignment->GetAllSlices(select, /*index=*/{0}); + EXPECT_EQ(2, element_slices.size()); + EXPECT_THAT(element_slices, + ::testing::UnorderedElementsAre( + assignment->GetUniqueSlice(tuple_param0, /*index=*/{0}) + .ConsumeValueOrDie(), + assignment->GetUniqueSlice(tuple_param1, /*index=*/{0}) + .ConsumeValueOrDie())); } // TODO(b/34669761): Remove this test when buffers are allowed to share @@ -1095,7 +1282,7 @@ TEST_F(BufferAssignmentTest, TupleBufferNotReused) { auto copy = builder.AddInstruction(HloInstruction::CreateUnary( scalar_shape, HloOpcode::kCopy, tuple_element)); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); auto assignment = RunBufferAssignment(module.get()); @@ -1106,6 +1293,330 @@ TEST_F(BufferAssignmentTest, TupleBufferNotReused) { GetTopLevelAllocation(*assignment, copy)); } -} // namespace +TEST_F(BufferAssignmentTest, OneTempAllocation) { + // Test a computation that requires multiple temp buffers, and ensure they are + // combined into a single allocation. + auto builder = HloComputation::Builder(TestName()); + Shape shape_2x3 = ShapeUtil::MakeShape(F32, {2, 3}); + Shape shape_2x4 = ShapeUtil::MakeShape(F32, {2, 4}); + Shape shape_3x4 = ShapeUtil::MakeShape(F32, {3, 4}); + Shape shape_4x4 = ShapeUtil::MakeShape(F32, {4, 4}); + Shape shape_5x4 = ShapeUtil::MakeShape(F32, {5, 4}); + // There should be separate temp buffers for dot_ab and dot_bc. + auto param_a = builder.AddInstruction( + HloInstruction::CreateParameter(0, shape_2x3, "param_a")); + auto param_b = builder.AddInstruction( + HloInstruction::CreateParameter(1, shape_3x4, "param_b")); + auto param_c = builder.AddInstruction( + HloInstruction::CreateParameter(2, shape_4x4, "param_c")); + auto dot_ab = builder.AddInstruction(HloInstruction::CreateBinary( + shape_2x4, HloOpcode::kDot, param_a, param_b)); + auto dot_bc = builder.AddInstruction(HloInstruction::CreateBinary( + shape_3x4, HloOpcode::kDot, param_b, param_c)); + builder.AddInstruction( + HloInstruction::CreateConcatenate(shape_5x4, {dot_ab, dot_bc}, 1)); + + // Run buffer assignment with alignment=1. + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); + auto assignment = RunBufferAssignment(module.get(), /*alignment=*/1); + + // There are 5 allocations: 3 parameters, 1 output, and 1 temp. + EXPECT_EQ(5, assignment->Allocations().size()); + + // Ensure the temp buffers for dot_ab and dot_bc share a single allocation, + // and each occupies different slices of that allocation. + BufferAllocation::Slice slice_ab = + assignment->GetUniqueTopLevelSlice(dot_ab).ConsumeValueOrDie(); + BufferAllocation::Slice slice_bc = + assignment->GetUniqueTopLevelSlice(dot_bc).ConsumeValueOrDie(); + EXPECT_EQ(slice_ab.allocation(), slice_bc.allocation()); + EXPECT_NE(slice_ab, slice_bc); + EXPECT_EQ(32, slice_ab.size()); + EXPECT_EQ(48, slice_bc.size()); + EXPECT_EQ(80, slice_ab.allocation()->size()); + EXPECT_EQ(80, slice_bc.allocation()->size()); + + // Re-run buffer assignment with alignment=64. 
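Before the re-run below, it is worth spelling out the arithmetic the test expects: the two temp slices are 32 and 48 bytes, so with 64-byte alignment whichever slice is placed first sits at offset 0 and the other is rounded up to offset 64, giving a combined allocation of either 64 + 48 = 112 or 64 + 32 = 96 bytes. A self-contained sketch of that rounding (`RoundUpTo` here is a hypothetical helper written out for illustration):

```c++
#include <cassert>
#include <cstdint>

// Round n up to the next multiple of a positive alignment.
constexpr int64_t RoundUpTo(int64_t n, int64_t align) {
  return ((n + align - 1) / align) * align;
}

int main() {
  // 32-byte slice first: the 48-byte slice starts at 64, total 112.
  assert(RoundUpTo(32, 64) == 64);
  assert(64 + 48 == 112);
  // 48-byte slice first: the 32-byte slice starts at 64, total 96.
  assert(RoundUpTo(48, 64) == 64);
  assert(64 + 32 == 96);
  return 0;
}
```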
+ assignment = RunBufferAssignment(module.get(), /*alignment=*/64); + EXPECT_EQ(5, assignment->Allocations().size()); + slice_ab = assignment->GetUniqueTopLevelSlice(dot_ab).ConsumeValueOrDie(); + slice_bc = assignment->GetUniqueTopLevelSlice(dot_bc).ConsumeValueOrDie(); + EXPECT_EQ(slice_ab.allocation(), slice_bc.allocation()); + EXPECT_NE(slice_ab, slice_bc); + EXPECT_EQ(32, slice_ab.size()); + EXPECT_EQ(48, slice_bc.size()); + // Ensure the offsets and allocation size account for the alignment, without + // assuming which buffer gets assigned first. + if (slice_ab.offset() == 0) { + EXPECT_EQ(64, slice_bc.offset()); + EXPECT_EQ(64 + 48, slice_ab.allocation()->size()); + EXPECT_EQ(64 + 48, slice_bc.allocation()->size()); + } else { + EXPECT_EQ(64, slice_ab.offset()); + EXPECT_EQ(0, slice_bc.offset()); + EXPECT_EQ(64 + 32, slice_ab.allocation()->size()); + EXPECT_EQ(64 + 32, slice_bc.allocation()->size()); + } +} + +class WhileBufferAssignmentTest : public HloTestBase { + protected: + std::unique_ptr BuildWhileConditionComputation( + const string& name) { + auto builder = HloComputation::Builder(name); + builder.AddInstruction( + HloInstruction::CreateParameter(0, loop_state_shape_, "loop_state")); + auto zero = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0))); + auto ten = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(10))); + builder.AddInstruction(HloInstruction::CreateBinary( + ShapeUtil::MakeShape(PRED, {}), HloOpcode::kLt, zero, ten)); + return builder.Build(); + } + + std::unique_ptr BuildWhileBodyComputation( + const string& name) { + auto builder = HloComputation::Builder(name); + auto loop_state = builder.AddInstruction( + HloInstruction::CreateParameter(0, loop_state_shape_, "loop_state")); + auto input = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(data_shape_, loop_state, 0)); + auto weights = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(data_shape_, loop_state, 1)); + auto output = builder.AddInstruction(HloInstruction::CreateBinary( + data_shape_, HloOpcode::kMultiply, input, weights)); + builder.AddInstruction( + HloInstruction::CreateTuple({input, weights, output})); + return builder.Build(); + } + + std::unique_ptr RunBufferAssignment(HloModule* module, + int64 alignment = 1) { + auto sequence = + CreateMemoryMinimizingSequence(*module, ByteSizeOf).ConsumeValueOrDie(); + return BufferAssigner::Run( + module, MakeUnique(module, sequence), + ByteSizeOf, alignment) + .ConsumeValueOrDie(); + } + + static int64 ByteSizeOf(const LogicalBuffer& buffer) { + return ShapeUtil::ByteSizeOf(buffer.shape(), sizeof(void*)); + } + + Shape data_shape_ = ShapeUtil::MakeShape(F32, {4}); + Shape loop_state_shape_ = + ShapeUtil::MakeTupleShape({data_shape_, data_shape_, data_shape_}); +}; + +static void RunCopyInsertion(HloModule* module) { + CopyInsertion copy_insertion; + EXPECT_IS_OK(copy_insertion.Run(module).status()); +} + +TEST_F(WhileBufferAssignmentTest, TwoForwardWhileLoops) { + auto module = MakeUnique(TestName()); + auto builder = HloComputation::Builder("entry"); + + auto input0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, data_shape_, "input0")); + auto weights0 = builder.AddInstruction( + HloInstruction::CreateParameter(1, data_shape_, "weights0")); + auto weights1 = builder.AddInstruction( + HloInstruction::CreateParameter(2, data_shape_, "weights1")); + + auto zero = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0))); 
+
+class WhileBufferAssignmentTest : public HloTestBase {
+ protected:
+  std::unique_ptr<HloComputation> BuildWhileConditionComputation(
+      const string& name) {
+    auto builder = HloComputation::Builder(name);
+    builder.AddInstruction(
+        HloInstruction::CreateParameter(0, loop_state_shape_, "loop_state"));
+    auto zero = builder.AddInstruction(
+        HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(0)));
+    auto ten = builder.AddInstruction(
+        HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(10)));
+    builder.AddInstruction(HloInstruction::CreateBinary(
+        ShapeUtil::MakeShape(PRED, {}), HloOpcode::kLt, zero, ten));
+    return builder.Build();
+  }
+
+  std::unique_ptr<HloComputation> BuildWhileBodyComputation(
+      const string& name) {
+    auto builder = HloComputation::Builder(name);
+    auto loop_state = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, loop_state_shape_, "loop_state"));
+    auto input = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(data_shape_, loop_state, 0));
+    auto weights = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(data_shape_, loop_state, 1));
+    auto output = builder.AddInstruction(HloInstruction::CreateBinary(
+        data_shape_, HloOpcode::kMultiply, input, weights));
+    builder.AddInstruction(
+        HloInstruction::CreateTuple({input, weights, output}));
+    return builder.Build();
+  }
+
+  std::unique_ptr<BufferAssignment> RunBufferAssignment(HloModule* module,
+                                                        int64 alignment = 1) {
+    auto sequence =
+        CreateMemoryMinimizingSequence(*module, ByteSizeOf).ConsumeValueOrDie();
+    return BufferAssigner::Run(
+               module, MakeUnique<SequentialHloOrdering>(module, sequence),
+               ByteSizeOf, alignment)
+        .ConsumeValueOrDie();
+  }
+
+  static int64 ByteSizeOf(const LogicalBuffer& buffer) {
+    return ShapeUtil::ByteSizeOf(buffer.shape(), sizeof(void*));
+  }
+
+  Shape data_shape_ = ShapeUtil::MakeShape(F32, {4});
+  Shape loop_state_shape_ =
+      ShapeUtil::MakeTupleShape({data_shape_, data_shape_, data_shape_});
+};
+
+static void RunCopyInsertion(HloModule* module) {
+  CopyInsertion copy_insertion;
+  EXPECT_IS_OK(copy_insertion.Run(module).status());
+}
+
+TEST_F(WhileBufferAssignmentTest, TwoForwardWhileLoops) {
+  auto module = MakeUnique<HloModule>(TestName());
+  auto builder = HloComputation::Builder("entry");
+
+  auto input0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, data_shape_, "input0"));
+  auto weights0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, data_shape_, "weights0"));
+  auto weights1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, data_shape_, "weights1"));
+
+  auto zero = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0)));
+  auto output0 = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(data_shape_, zero, {1}));
+  auto output1 = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(data_shape_, zero, {1}));
+
+  auto cond0 =
+      module->AddEmbeddedComputation(BuildWhileConditionComputation("cond"));
+  auto body0 =
+      module->AddEmbeddedComputation(BuildWhileBodyComputation("body"));
+
+  auto tuple0 = builder.AddInstruction(
+      HloInstruction::CreateTuple({input0, weights0, output0}));
+  auto while0 = builder.AddInstruction(
+      HloInstruction::CreateWhile(loop_state_shape_, cond0, body0, tuple0));
+
+  auto cond1 =
+      module->AddEmbeddedComputation(BuildWhileConditionComputation("cond"));
+  auto body1 =
+      module->AddEmbeddedComputation(BuildWhileBodyComputation("body"));
+  auto input1 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(data_shape_, while0, 2));
+  auto tuple1 = builder.AddInstruction(
+      HloInstruction::CreateTuple({input1, weights1, output1}));
+  auto while1 = builder.AddInstruction(
+      HloInstruction::CreateWhile(loop_state_shape_, cond1, body1, tuple1));
+
+  module->AddEntryComputation(builder.Build());
+  RunCopyInsertion(module.get());
+  auto assignment = RunBufferAssignment(module.get());
+
+  // Verify 'input0' and read-only use while0{0} alias.
+  EXPECT_EQ(assignment->GetUniqueSlice(input0, {}).ConsumeValueOrDie(),
+            assignment->GetUniqueSlice(while0, {0}).ConsumeValueOrDie());
+  // Verify 'weights0' and read-only use while0{1} alias.
+  EXPECT_EQ(assignment->GetUniqueSlice(weights0, {}).ConsumeValueOrDie(),
+            assignment->GetUniqueSlice(while0, {1}).ConsumeValueOrDie());
+  // Verify 'while0{2}' and read-only use while1{0} alias.
+  EXPECT_EQ(assignment->GetUniqueSlice(while0, {2}).ConsumeValueOrDie(),
+            assignment->GetUniqueSlice(while1, {0}).ConsumeValueOrDie());
+  // Verify 'weights1' and read-only use while1{1} alias.
+  EXPECT_EQ(assignment->GetUniqueSlice(weights1, {}).ConsumeValueOrDie(),
+            assignment->GetUniqueSlice(while1, {1}).ConsumeValueOrDie());
+}
+
+TEST_F(WhileBufferAssignmentTest, OneForwardBackwardWhileLoopSet) {
+  auto module = MakeUnique<HloModule>(TestName());
+  auto builder = HloComputation::Builder("entry");
+
+  auto input0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, data_shape_, "input0"));
+  auto weights0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, data_shape_, "weights0"));
+
+  auto zero = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0)));
+  auto output0 = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(data_shape_, zero, {1}));
+  auto output1 = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(data_shape_, zero, {1}));
+
+  auto cond0 =
+      module->AddEmbeddedComputation(BuildWhileConditionComputation("cond"));
+  auto body0 =
+      module->AddEmbeddedComputation(BuildWhileBodyComputation("body"));
+
+  auto tuple0 = builder.AddInstruction(
+      HloInstruction::CreateTuple({input0, weights0, output0}));
+  auto while0 = builder.AddInstruction(
+      HloInstruction::CreateWhile(loop_state_shape_, cond0, body0, tuple0));
+
+  auto cond1 =
+      module->AddEmbeddedComputation(BuildWhileConditionComputation("cond"));
+  auto body1 =
+      module->AddEmbeddedComputation(BuildWhileBodyComputation("body"));
+
+  auto tuple1 = builder.AddInstruction(
+      HloInstruction::CreateTuple({input0, weights0, output1}));
+  auto while1 = builder.AddInstruction(
+      HloInstruction::CreateWhile(loop_state_shape_, cond1, body1, tuple1));
+
+  module->AddEntryComputation(builder.Build());
+  RunCopyInsertion(module.get());
+  auto assignment = RunBufferAssignment(module.get());
+
+  // while0 and while1 buffers should be completely aligned.
+  EXPECT_EQ(assignment->GetUniqueSlice(while0, {0}).ConsumeValueOrDie(),
+            assignment->GetUniqueSlice(while1, {0}).ConsumeValueOrDie());
+  EXPECT_EQ(assignment->GetUniqueSlice(while0, {1}).ConsumeValueOrDie(),
+            assignment->GetUniqueSlice(while1, {1}).ConsumeValueOrDie());
+  EXPECT_EQ(assignment->GetUniqueSlice(while0, {2}).ConsumeValueOrDie(),
+            assignment->GetUniqueSlice(while1, {2}).ConsumeValueOrDie());
+}
+
+TEST_F(BufferAssignmentTest, TwoCalls) {
+  auto module = MakeUnique<HloModule>(TestName());
+  Shape r0f32 = ShapeUtil::MakeShape(xla::F32, {});
+  HloComputation* sub_computation;
+  {
+    auto builder = HloComputation::Builder(TestName() + "_sub_comp");
+    auto param = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, r0f32, "param"));
+    auto constant1 = builder.AddInstruction(
+        HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+    auto add = builder.AddInstruction(
+        HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, param, constant1));
+    sub_computation = module->AddEmbeddedComputation(builder.Build(add));
+  }
+  auto builder = HloComputation::Builder(TestName());
+  auto constant2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+  auto constant3 = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(3.0)));
+  auto call1 = builder.AddInstruction(
+      HloInstruction::CreateCall(r0f32, {constant2}, sub_computation));
+  auto call2 = builder.AddInstruction(
+      HloInstruction::CreateCall(r0f32, {constant3}, sub_computation));
+  auto add1 = builder.AddInstruction(
+      HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, call1, constant2));
+  auto add2 = builder.AddInstruction(
+      HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, call2, add1));
+  module->AddEntryComputation(builder.Build(add2));
+
+  {
+    FlattenCallGraph flatten;
+    TF_ASSIGN_OR_ASSERT_OK(bool result, flatten.Run(module.get()));
+    EXPECT_TRUE(result);
+    std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
+  }
+
+  RunCopyInsertion(module.get());
+  auto assignment = RunBufferAssignment(module.get());
+
+  EXPECT_TRUE(BuffersDistinct({call1}, {call2}, *assignment));
+}
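A note on why the test above flattens the call graph first: with both `kCall` instructions targeting the same `sub_computation`, a per-computation buffer assignment would have to reuse one set of buffers for both calls. Here is a standalone sketch of the flattening idea (illustrative types only, not the actual `FlattenCallGraph` pass): clone a shared callee so that each call site owns a distinct computation.

```cpp
#include <cassert>
#include <memory>
#include <string>
#include <vector>

struct Computation {
  std::string name;
};

struct Call {
  Computation* callee;
};

int main() {
  auto shared = std::make_unique<Computation>(Computation{"sub"});
  std::vector<Call> calls = {{shared.get()}, {shared.get()}};

  // "Flatten": give every call site after the first its own clone.
  std::vector<std::unique_ptr<Computation>> clones;
  for (size_t i = 1; i < calls.size(); ++i) {
    clones.push_back(std::make_unique<Computation>(*calls[i].callee));
    calls[i].callee = clones.back().get();
  }
  // Each call site now targets a distinct computation, so a buffer assigner
  // can safely give the two calls non-overlapping buffers.
  assert(calls[0].callee != calls[1].callee);
  return 0;
}
```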
+
+// Test buffer assignment for while nodes with multiple uses.
+// TODO(b/37245345): Fix buffer assignment for this case.
+TEST_F(WhileBufferAssignmentTest, DISABLED_TwoWhiles) {
+  auto module = MakeUnique<HloModule>(TestName());
+  auto builder = HloComputation::Builder(TestName());
+
+  auto input0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, data_shape_, "input0"));
+  auto weights0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, data_shape_, "weights0"));
+
+  auto zero = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0)));
+  auto output0 = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(data_shape_, zero, {1}));
+
+  auto cond0 =
+      module->AddEmbeddedComputation(BuildWhileConditionComputation("cond"));
+  auto body0 =
+      module->AddEmbeddedComputation(BuildWhileBodyComputation("body"));
+
+  auto tuple0 = builder.AddInstruction(
+      HloInstruction::CreateTuple({input0, weights0, output0}));
+  auto while0 = builder.AddInstruction(
+      HloInstruction::CreateWhile(loop_state_shape_, cond0, body0, tuple0));
+  auto while1 = builder.AddInstruction(
+      HloInstruction::CreateWhile(loop_state_shape_, cond0, body0, while0));
+
+  auto get0 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(data_shape_, while0, 2));
+  auto get1 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(data_shape_, while1, 2));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(data_shape_, HloOpcode::kAdd, get0, get1));
+  module->AddEntryComputation(builder.Build());
+
+  RunCopyInsertion(module.get());
+
+  {
+    FlattenCallGraph flatten;
+    TF_ASSIGN_OR_ASSERT_OK(bool result, flatten.Run(module.get()));
+    EXPECT_TRUE(result);
+  }
+
+  auto assignment = RunBufferAssignment(module.get());
+
+  EXPECT_TRUE(BuffersDistinct({while0}, {while1}, *assignment));
+}
+
+}  // namespace
 }  // namespace xla
+
+int main(int argc, char** argv) {
+  return xla::ParseDebugOptionsFlagsAndRunTests(argc, argv);
+}
diff --git a/tensorflow/compiler/xla/service/buffer_liveness.cc b/tensorflow/compiler/xla/service/buffer_liveness.cc
index c788c643069..1b14c26340f 100644
--- a/tensorflow/compiler/xla/service/buffer_liveness.cc
+++ b/tensorflow/compiler/xla/service/buffer_liveness.cc
@@ -17,11 +17,11 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/buffer_liveness.h"
 
-#include <set>
 #include <utility>
 
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/liveness_util.h"
 #include "tensorflow/compiler/xla/service/logical_buffer.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -37,15 +37,17 @@ namespace xla {
 
 /* static */
 StatusOr<std::unique_ptr<BufferLiveness>> BufferLiveness::Run(
-    const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering) {
+    const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering,
+    TuplePointsToAnalysis::Colorer colorer) {
   std::unique_ptr<BufferLiveness> liveness(
-      new BufferLiveness(module, std::move(hlo_ordering)));
+      new BufferLiveness(module, std::move(hlo_ordering), std::move(colorer)));
   TF_RETURN_IF_ERROR(liveness->Analyze());
   return std::move(liveness);
 }
 
 tensorflow::Status BufferLiveness::Analyze() {
-  TF_ASSIGN_OR_RETURN(points_to_analysis_, TuplePointsToAnalysis::Run(module_));
+  TF_ASSIGN_OR_RETURN(points_to_analysis_,
+                      TuplePointsToAnalysis::Run(module_, colorer_));
   for (auto& computation : module_->computations()) {
     // Gather all instructions whose buffers might alias other instructions into
    // the set aliased_buffers_.
This includes those contained as a tuple
@@ -61,11 +63,9 @@ tensorflow::Status BufferLiveness::Analyze() {
     }
 
     if (computation.get() == module_->entry_computation()) {
-      for (const LogicalBuffer* live_out_buffer :
-           points_to_analysis_->GetPointsToSet(computation->root_instruction())
-               .CreateFlattenedSet()) {
-        maybe_live_out_buffers_.insert(live_out_buffer);
-      }
+      const HloInstruction* root = computation->root_instruction();
+      maybe_live_out_buffers_ =
+          points_to_analysis_->GetPointsToSet(root).CreateFlattenedSet();
     }
   }
 
@@ -92,19 +92,6 @@ string BufferLiveness::ToString() const {
   return tensorflow::str_util::Join(pieces, "\n");
 }
 
-// Returns false if 'user' cannot possibly use the buffer at 'index' in
-// 'operand'. Returns true otherwise.
-// Precondition: 'operand' is an operand of 'user'.
-bool MayUseBufferInOperand(HloInstruction* operand, const ShapeIndex& index,
-                           HloInstruction* user) {
-  if (user->opcode() == HloOpcode::kGetTupleElement && !index.empty()) {
-    // GetTupleElement instructions only access the top-level buffer of their
-    // operand.
-    return false;
-  }
-  return true;
-}
-
 bool BufferLiveness::live_range_strictly_before(const LogicalBuffer& a,
                                                 const LogicalBuffer& b) const {
   TF_CHECK_OK(points_to_analysis_->VerifyBuffer(a));
@@ -117,7 +104,8 @@ bool BufferLiveness::live_range_strictly_before(const LogicalBuffer& a,
   // Every user of 'a' must be a predecessor of 'b' or 'b' itself.
   for (const BufferAlias& alias : points_to_analysis_->GetBufferAliases(a)) {
     for (auto user : alias.instruction()->users()) {
-      if (!MayUseBufferInOperand(alias.instruction(), alias.index(), user)) {
+      if (DoesNotUseOperandBuffer(alias.instruction(), alias.index(), user,
+                                  points_to_analysis())) {
         continue;
       }
       if (user != b.instruction() &&
@@ -127,23 +115,44 @@ bool BufferLiveness::live_range_strictly_before(const LogicalBuffer& a,
     }
   }
 
-  // If 'b' is a user of 'a' then the buffers interfere if b is not an
-  // elementwise operation emitting the same shape/layout as 'a'.
+  // If 'b' is a user of 'a' then the buffers interfere unless 'a.instruction'
+  // and 'b.instruction' emit the same shape/layout, and 'b.instruction' meets
+  // the qualifications specified in CanShareOperandBufferWithUser.
   for (const BufferAlias& alias : points_to_analysis_->GetBufferAliases(a)) {
-    if (alias.instruction()->users().count(b.instruction()) > 0 &&
-        (!ShapeUtil::Equal(alias.instruction()->shape(),
-                           b.instruction()->shape()) ||
-         !b.instruction()->IsElementwise())) {
+    if (b.instruction()->IsUserOf(alias.instruction()) &&
+        !CanShareOperandBufferWithUser(alias.instruction(), alias.index(),
+                                       b.instruction(), b.index(),
+                                       points_to_analysis())) {
       return false;
     }
   }
   return true;
 }
 
+namespace {
+bool IsEntryParameter(const HloInstruction* instruction) {
+  const HloComputation* computation = instruction->parent();
+  return instruction->opcode() == HloOpcode::kParameter &&
+         computation == computation->parent()->entry_computation();
+}
+}  // namespace
+
 bool BufferLiveness::MayInterfere(const LogicalBuffer& a,
                                   const LogicalBuffer& b) const {
-  return (!live_range_strictly_before(a, b) &&
-          !live_range_strictly_before(b, a));
+  // Entry parameters live at the entry of the execution, thus always interfere
+  // with all other instructions executing before them in the ordering.
+  const HloInstruction* a_instruction = a.instruction();
+  const HloInstruction* b_instruction = b.instruction();
+  if (IsEntryParameter(a_instruction) &&
+      hlo_ordering_->ExecutesBefore(b_instruction, a_instruction)) {
+    return true;
+  }
+  if (IsEntryParameter(b_instruction) &&
+      hlo_ordering_->ExecutesBefore(a_instruction, b_instruction)) {
+    return true;
+  }
+  // Buffers without disjoint liveness may interfere.
+  return !live_range_strictly_before(a, b) &&
+         !live_range_strictly_before(b, a);
 }
 
 bool BufferLiveness::MaybeLiveOut(const LogicalBuffer& buffer) const {
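The `MayInterfere` change above reduces to interval reasoning: two buffers may interfere unless one live range ends strictly before the other begins, and entry parameters behave as if defined at position 0 of the ordering. A standalone sketch (illustrative only, with hand-picked positions mirroring the sequential-ordering test later in this diff):

```cpp
#include <cassert>

// Live range over positions in a sequential ordering, half-open [begin, end).
struct LiveRange {
  int begin;
  int end;
};

bool MayInterfere(const LiveRange& a, const LiveRange& b) {
  // Interfere unless one range ends before the other begins.
  return a.begin < b.end && b.begin < a.end;
}

int main() {
  // Ordering: param0, negate, param1, exp, add (as in the sequential test).
  LiveRange param0{0, 2};  // entry parameters are live from position 0
  LiveRange param1{0, 4};
  LiveRange negate{1, 5}, exp{3, 5};
  assert(MayInterfere(param0, param1));  // both pinned to the entry
  assert(MayInterfere(param1, negate));  // param1 outlives negate's definition
  assert(!MayInterfere(param0, exp));    // param0 dies before exp is defined
  return 0;
}
```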
diff --git a/tensorflow/compiler/xla/service/buffer_liveness.h b/tensorflow/compiler/xla/service/buffer_liveness.h
index b9e7a2a28db..9bb2564a831 100644
--- a/tensorflow/compiler/xla/service/buffer_liveness.h
+++ b/tensorflow/compiler/xla/service/buffer_liveness.h
@@ -39,7 +39,9 @@ class BufferLiveness {
   // Constructs a buffer liveness object for the given module assuming the given
   // HLO instruction ordering.
   static StatusOr<std::unique_ptr<BufferLiveness>> Run(
-      const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering);
+      const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering,
+      TuplePointsToAnalysis::Colorer colorer =
+          TuplePointsToAnalysis::DefaultColorer());
 
   // Returns true if the live range of the buffer containing the output of 'a'
   // may overlap with the live range of the buffer of 'b'. If instruction 'a'
@@ -51,17 +53,29 @@ class BufferLiveness {
   // the entry computation.
   bool MaybeLiveOut(const LogicalBuffer& buffer) const;
 
+  // Returns the complete set of buffers that may be live out of the module.
+  const tensorflow::gtl::FlatSet<const LogicalBuffer*>& maybe_live_out_buffers()
+      const {
+    return maybe_live_out_buffers_;
+  }
+
   // Returns the underlying points-to analysis used for this liveness analysis.
   const TuplePointsToAnalysis& points_to_analysis() const {
     return *points_to_analysis_;
   }
 
+  // Returns the underlying hlo ordering used for this liveness analysis.
+  const HloOrdering& hlo_ordering() const { return *hlo_ordering_; }
+
   string ToString() const;
 
  private:
   explicit BufferLiveness(const HloModule* module,
-                          std::unique_ptr<HloOrdering> hlo_ordering)
-      : module_(module), hlo_ordering_(std::move(hlo_ordering)) {}
+                          std::unique_ptr<HloOrdering> hlo_ordering,
+                          TuplePointsToAnalysis::Colorer colorer)
+      : module_(module),
+        hlo_ordering_(std::move(hlo_ordering)),
+        colorer_(colorer) {}
 
   // Perform buffer liveness analysis. This method must be called prior to
   // MayInterfere or MaybeLiveOut.
@@ -84,6 +98,8 @@ class BufferLiveness {
   tensorflow::gtl::FlatSet<const LogicalBuffer*> maybe_live_out_buffers_;
 
   std::unique_ptr<TuplePointsToAnalysis> points_to_analysis_;
+
+  TuplePointsToAnalysis::Colorer colorer_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/buffer_liveness_test.cc b/tensorflow/compiler/xla/service/buffer_liveness_test.cc
index 1ca5768dbe1..fda44ff4d2d 100644
--- a/tensorflow/compiler/xla/service/buffer_liveness_test.cc
+++ b/tensorflow/compiler/xla/service/buffer_liveness_test.cc
@@ -92,6 +92,12 @@ class BufferLivenessTest : public HloTestBase {
                      GetBuffer(liveness, instruction, /*index=*/{}));
   }
 
+  std::unique_ptr<HloComputation> BuildDummyComputation() {
+    auto builder = HloComputation::Builder(TestName() + "_dummy");
+    builder.AddInstruction(HloInstruction::CreateParameter(0, vec_, "param"));
+    return builder.Build();
+  }
+
   const Shape vec_ = ShapeUtil::MakeShape(xla::F32, {42});
 };
 
@@ -110,7 +116,7 @@ TEST_F(BufferLivenessTest, ElementwiseChain) {
   auto log = builder.AddInstruction(
       HloInstruction::CreateUnary(vec_, HloOpcode::kLog, exp));
 
-  auto module = MakeUnique<HloModule>(TestName());
+  auto module = CreateNewModule();
   module->AddEntryComputation(builder.Build());
 
   auto liveness =
@@ -118,12 +124,17 @@
                           MakeUnique<DependencyHloOrdering>(module.get()))
           .ConsumeValueOrDie();
 
-  // No buffers should interfere.
   EXPECT_FALSE(InstructionsMayInterfere(*liveness, param, negate));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, param, exp));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, param, log));
+
+  // No buffers should interfere.
   EXPECT_FALSE(InstructionsMayInterfere(*liveness, negate, exp));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, negate, log));
   EXPECT_FALSE(InstructionsMayInterfere(*liveness, exp, negate));
   EXPECT_FALSE(InstructionsMayInterfere(*liveness, exp, log));
-  EXPECT_FALSE(InstructionsMayInterfere(*liveness, param, log));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, log, negate));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, log, exp));
 
   // A buffer should interfere with itself.
   EXPECT_TRUE(InstructionsMayInterfere(*liveness, exp, exp));
@@ -135,22 +146,73 @@
   EXPECT_TRUE(InstructionMaybeLiveOut(*liveness, log));
 }
 
-TEST_F(BufferLivenessTest, NonElementwiseOperand) {
-  // A chain of operations with one elementwise and one non-elementwise. The
-  // elementwise op should not interfere with its operand, while the
-  // non-elementwise op should interfere.
+TEST_F(BufferLivenessTest, MultipleEntryParameters_Sequential) {
+  // Two entry params, which interfere with each other.
 //
-//   param --> negate -> reverse
+//   param0 --> negate ---------------\
+//   param1 --> exp --------------> add
   auto builder = HloComputation::Builder(TestName());
+  auto param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, vec_, "param0"));
+  auto param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, vec_, "param1"));
+  auto negate = builder.AddInstruction(
+      HloInstruction::CreateUnary(vec_, HloOpcode::kNegate, param0));
+  auto exp = builder.AddInstruction(
+      HloInstruction::CreateUnary(vec_, HloOpcode::kExp, param1));
+  auto add = builder.AddInstruction(
+      HloInstruction::CreateBinary(vec_, HloOpcode::kAdd, negate, exp));
+
+  auto module = CreateNewModule();
+  HloComputation* entry = module->AddEntryComputation(builder.Build());
+
+  SequentialHloOrdering::HloModuleSequence sequence;
+  sequence.insert({entry, {param0, negate, param1, exp, add}});
+  auto liveness = BufferLiveness::Run(
+                      module.get(),
+                      MakeUnique<SequentialHloOrdering>(module.get(), sequence))
+                      .ConsumeValueOrDie();
+
+  // Entry parameters interfere as if they are defined simultaneously at
+  // the very beginning.
+  EXPECT_TRUE(InstructionsMayInterfere(*liveness, param0, param1));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, param0, negate));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, param0, exp));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, param0, add));
+  EXPECT_TRUE(InstructionsMayInterfere(*liveness, param1, param0));
+  EXPECT_TRUE(InstructionsMayInterfere(*liveness, param1, negate));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, param1, exp));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, param1, add));
+
+  // Negate and exp still interfere.
+  EXPECT_TRUE(InstructionsMayInterfere(*liveness, negate, exp));
+  EXPECT_TRUE(InstructionsMayInterfere(*liveness, exp, negate));
+
+  // But {negate, add} and {exp, add} don't interfere.
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, negate, add));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, add, negate));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, exp, add));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, add, exp));
+}
+
+TEST_F(BufferLivenessTest, NonElementwiseOperand) {
+  // A chain of operations with two elementwise and one non-elementwise. The
+  // elementwise op should not interfere with its operand, while the
+  // non-elementwise op should interfere. Entry params always interfere.
+  //
+  // param --> exp -> negate -> reverse
+  //
   auto builder = HloComputation::Builder(TestName());
   auto param =
       builder.AddInstruction(HloInstruction::CreateParameter(0, vec_, "param"));
+  auto exp = builder.AddInstruction(
+      HloInstruction::CreateUnary(vec_, HloOpcode::kExp, param));
   auto negate = builder.AddInstruction(
-      HloInstruction::CreateUnary(vec_, HloOpcode::kNegate, param));
+      HloInstruction::CreateUnary(vec_, HloOpcode::kNegate, exp));
   auto reverse =
       builder.AddInstruction(HloInstruction::CreateReverse(vec_, negate, {0}));
 
-  auto module = MakeUnique<HloModule>(TestName());
+  auto module = CreateNewModule();
   module->AddEntryComputation(builder.Build());
 
   auto liveness =
@@ -158,10 +220,14 @@ TEST_F(BufferLivenessTest, NonElementwiseOperand) {
                           MakeUnique<DependencyHloOrdering>(module.get()))
           .ConsumeValueOrDie();
 
-  // No buffers should interfere.
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, param, exp));
   EXPECT_FALSE(InstructionsMayInterfere(*liveness, param, negate));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, param, reverse));
+
+  // Negate is elementwise, so doesn't interfere with its operand.
+  // Reverse is non-elementwise, so does interfere with its operand.
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, exp, negate));
   EXPECT_TRUE(InstructionsMayInterfere(*liveness, negate, reverse));
-  EXPECT_FALSE(InstructionsMayInterfere(*liveness, param, negate));
 }
 
 TEST_F(BufferLivenessTest, OverlappedBuffers) {
@@ -180,7 +246,7 @@
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(vec_, HloOpcode::kAdd, negate, exp));
 
-  auto module = MakeUnique<HloModule>(TestName());
+  auto module = CreateNewModule();
   module->AddEntryComputation(builder.Build());
 
   auto liveness =
@@ -190,8 +256,15 @@
   EXPECT_TRUE(InstructionsMayInterfere(*liveness, param, negate));
   EXPECT_TRUE(InstructionsMayInterfere(*liveness, param, exp));
-  EXPECT_TRUE(InstructionsMayInterfere(*liveness, negate, exp));
   EXPECT_FALSE(InstructionsMayInterfere(*liveness, param, add));
+
+  // Negate and exp interfere with each other, but not with add.
+  EXPECT_TRUE(InstructionsMayInterfere(*liveness, negate, exp));
+  EXPECT_TRUE(InstructionsMayInterfere(*liveness, exp, negate));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, negate, add));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, add, negate));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, exp, add));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, add, exp));
 }
 
 TEST_F(BufferLivenessTest, OverlappedBuffersSequentialOrder) {
@@ -204,8 +277,7 @@
   // Sequential order:
   //     param, negate, exp, add
   //
-  // Liveness is identical to the DependencyHloOrdering except that 'param' and
-  // exp no longer interfere.
+  // Liveness is identical to the DependencyHloOrdering.
   auto builder = HloComputation::Builder(TestName());
   auto param =
       builder.AddInstruction(HloInstruction::CreateParameter(0, vec_, "param"));
@@ -216,7 +288,7 @@
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(vec_, HloOpcode::kAdd, negate, exp));
 
-  auto module = MakeUnique<HloModule>(TestName());
+  auto module = CreateNewModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   SequentialHloOrdering::HloModuleSequence module_sequence;
@@ -229,8 +301,15 @@
   EXPECT_TRUE(InstructionsMayInterfere(*liveness, param, negate));
   EXPECT_FALSE(InstructionsMayInterfere(*liveness, param, exp));
-  EXPECT_TRUE(InstructionsMayInterfere(*liveness, negate, exp));
   EXPECT_FALSE(InstructionsMayInterfere(*liveness, param, add));
+
+  // Negate and exp interfere with each other, but not with add.
+  EXPECT_TRUE(InstructionsMayInterfere(*liveness, negate, exp));
+  EXPECT_TRUE(InstructionsMayInterfere(*liveness, exp, negate));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, negate, add));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, add, negate));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, exp, add));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, add, exp));
 }
 
 TEST_F(BufferLivenessTest, TupleLiveOut) {
@@ -251,7 +330,7 @@
   auto outer_tuple =
       builder.AddInstruction(HloInstruction::CreateTuple({inner_tuple, exp}));
 
-  auto module = MakeUnique<HloModule>(TestName());
+  auto module = CreateNewModule();
   module->AddEntryComputation(builder.Build());
 
   auto liveness =
@@ -271,7 +350,7 @@ TEST_F(BufferLivenessTest, TupleLiveOut) {
 
 TEST_F(BufferLivenessTest, EmbeddedComputation) {
   // Test MaybeLiveOut and MayInterfere for embedded computation.
-  auto module = MakeUnique<HloModule>(TestName());
+  auto module = CreateNewModule();
 
   auto embedded_builder = HloComputation::Builder(TestName() + "_embedded");
   auto embedded_param = embedded_builder.AddInstruction(
@@ -328,7 +407,7 @@ TEST_F(BufferLivenessTest, TupleConstantLiveOut) {
   builder.AddInstruction(HloInstruction::CreateGetTupleElement(
       inner_tuple0->shape(), tuple_constant, 0));
 
-  auto module = MakeUnique<HloModule>(TestName());
+  auto module = CreateNewModule();
   module->AddEntryComputation(builder.Build());
 
   auto liveness =
@@ -391,8 +470,9 @@ TEST_F(BufferLivenessTest, IndependentTupleElements) {
   auto tuple_root =
       builder.AddInstruction(HloInstruction::CreateTuple({add0, add1}));
 
-  auto module = MakeUnique<HloModule>(TestName());
-  module->AddEntryComputation(builder.Build());
+  auto module = CreateNewModule();
+  module->AddEntryComputation(BuildDummyComputation());
+  module->AddEmbeddedComputation(builder.Build());
 
   auto liveness =
       BufferLiveness::Run(module.get(),
@@ -451,8 +531,9 @@ TEST_F(BufferLivenessTest, DependentTupleElements) {
   auto tuple_root =
       builder.AddInstruction(HloInstruction::CreateTuple({add0, add1}));
 
-  auto module = MakeUnique<HloModule>(TestName());
-  module->AddEntryComputation(builder.Build());
+  auto module = CreateNewModule();
+  module->AddEntryComputation(BuildDummyComputation());
+  module->AddEmbeddedComputation(builder.Build());
 
   auto liveness =
       BufferLiveness::Run(module.get(),
@@ -482,6 +563,229 @@ TEST_F(BufferLivenessTest, DependentTupleElements) {
       TupleElementsMayInterfere(*liveness, tuple_param0, tuple_root, {1}));
 }
 
+class FusedDynamicUpdateSliceLivenessTest : public BufferLivenessTest {
+ protected:
+  // Builds and runs a computation (see test case computation graphs below).
+  // Runs BufferLiveness on this computation.
+  // Returns whether buffer interference is detected between tuple-shaped
+  // parameter and root instructions at tuple element 1.
+  bool Run(const bool update_uses_tuple_element1,
+           const bool fuse_gte0 = false) {
+    auto builder = HloComputation::Builder(TestName());
+    // Create param0 Tuple.
+    Shape data_shape = ShapeUtil::MakeShape(F32, {8});
+    Shape update_shape = ShapeUtil::MakeShape(F32, {3});
+    auto tuple_param0 = builder.AddInstruction(HloInstruction::CreateParameter(
+        0, ShapeUtil::MakeTupleShape({data_shape, data_shape}), "param0"));
+
+    auto gte0 = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(data_shape, tuple_param0, 0));
+
+    auto gte1 = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(data_shape, tuple_param0, 1));
+
+    auto update = builder.AddInstruction(HloInstruction::CreateConstant(
+        LiteralUtil::CreateR1<float>({2.f, 2.f, 2.f})));
+    HloInstruction* slice = nullptr;
+    if (update_uses_tuple_element1) {
+      // Create a slice instruction as an additional user of 'gte1'.
+      slice = builder.AddInstruction(
+          HloInstruction::CreateSlice(update_shape, gte1, {0}, {3}, {1}));
+      update = builder.AddInstruction(HloInstruction::CreateBinary(
+          update_shape, HloOpcode::kAdd, update, slice));
+    }
+    // Create a DynamicUpdateSlice instruction of tuple element 1 with 'update'.
+    auto starts = builder.AddInstruction(
+        HloInstruction::CreateConstant(LiteralUtil::CreateR1<uint32>({2})));
+    auto dynamic_update_slice =
+        builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
+            data_shape, gte1, update, starts));
+    // Create output tuple.
+    auto tuple_root = builder.AddInstruction(
+        HloInstruction::CreateTuple({gte0, dynamic_update_slice}));
+    // Build module and get reference to entry computation.
+    auto module = CreateNewModule();
+    module->AddEntryComputation(BuildDummyComputation());
+    auto* computation = module->AddEmbeddedComputation(builder.Build());
+    // Create fusion instruction based on number of tuple element 1 users.
+    if (update_uses_tuple_element1) {
+      computation->CreateFusionInstruction(
+          {dynamic_update_slice, starts, update, CHECK_NOTNULL(slice), gte1},
+          HloInstruction::FusionKind::kLoop);
+    } else {
+      computation->CreateFusionInstruction(
+          {dynamic_update_slice, starts, update, gte1},
+          HloInstruction::FusionKind::kLoop);
+    }
+    // Create fusion instruction for tuple element 0 (if requested).
+    if (fuse_gte0) {
+      computation->CreateFusionInstruction({gte0},
+                                           HloInstruction::FusionKind::kLoop);
+    }
+
+    // Run BufferLiveness on 'module'.
+    auto liveness =
+        BufferLiveness::Run(module.get(),
+                            MakeUnique<DependencyHloOrdering>(module.get()))
+            .ConsumeValueOrDie();
+    // Return whether or not buffer interference is detected between
+    // 'tuple_param0' and 'tuple_root' at shape index '{1}'.
+    return TupleElementsMayInterfere(*liveness, tuple_param0, tuple_root, {1});
+  }
+};
+
+// Tests that live ranges of buffers Param0[1] and Tuple[1] (which alias fusion)
+// do not overlap with the following computation:
+//
+//         Param0
+//        /      \
+//     GTE(0)   Fusion -----------> FusionParam
+//       |        |                     |
+//       |        |                  GTE(1) Const Const
+//       |        |                     \     |    /
+//       |        |              DynamicUpdateSlice  // fused root
+//        \      /
+//          Tuple  // computation root
+//
+TEST_F(FusedDynamicUpdateSliceLivenessTest, NoInterference) {
+  EXPECT_FALSE(Run(/*update_uses_tuple_element1=*/false));
+}
+
+// Tests that live ranges of buffers Param0[1] and Tuple[1] (which aliases
+// 'fusion1') do not overlap in the presence of another fusion instruction
+// (which is a user of 'param0' at a different tuple index).
+// BufferLiveness should detect no uses of Param0 at index {1} in Fusion0
+// (because Fusion0 only uses Param0 at index {0}).
+//
+//            Param0
+//           /      \
+//  FusionParam <----- Fusion0   Fusion1 ------> FusionParam
+//      |                 |        |                 |
+//   GTE(0)               |        |              GTE(1) Const Const
+//                        |        |                 \     |    /
+//                         \      /           DynamicUpdateSlice
+//                          Tuple
+//
+TEST_F(FusedDynamicUpdateSliceLivenessTest, NoInterferenceWithUnrelatedFusion) {
+  EXPECT_FALSE(Run(/*update_uses_tuple_element1=*/false, /*fuse_gte0=*/true));
+}
+
+// Tests that live ranges of buffers Param0[1] and Tuple[1] (which alias fusion)
+// do overlap because GTE(1) has two users:
+// 1) DynamicUpdateSlice at operand 0.
+// 2) Slice at operand 0.
+//
+//         Param0
+//        /      \   Const
+//       /        \    /
+//     GTE(0)   Fusion -----------> FusionParam FusionParam
+//       |        |                     |           |
+//       |        |                  GTE(1)        /
+//       |        |                     |  \      /
+//       |        |                     |  Slice /
+//       |        |                     |     \ /
+//       |        |                     |     Add    Const
+//       |        |                     |      |       |
+//       |        |              DynamicUpdateSlice  // fused root
+//        \      /
+//          Tuple  // computation root
+//
+TEST_F(FusedDynamicUpdateSliceLivenessTest, WithInterference) {
+  EXPECT_TRUE(Run(/*update_uses_tuple_element1=*/true));
+}
+
+class DynamicUpdateSliceLivenessTest : public BufferLivenessTest {
+ protected:
+  // Builds and runs a computation (see test case computation graphs below).
+  // Runs BufferLiveness on this computation.
+  // Returns whether buffer interference is detected between tuple-shaped
+  // parameter and root instructions at tuple element 1.
+  bool Run(const bool tuple_element1_has_two_uses) {
+    auto builder = HloComputation::Builder(TestName());
+    // Create param0 Tuple.
+    Shape data_shape = ShapeUtil::MakeShape(F32, {8});
+    Shape update_shape = ShapeUtil::MakeShape(F32, {3});
+    auto tuple_param0 = builder.AddInstruction(HloInstruction::CreateParameter(
+        0, ShapeUtil::MakeTupleShape({data_shape, data_shape}), "param0"));
+
+    auto gte0 = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(data_shape, tuple_param0, 0));
+
+    auto gte1 = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(data_shape, tuple_param0, 1));
+
+    auto update = builder.AddInstruction(HloInstruction::CreateConstant(
+        LiteralUtil::CreateR1<float>({2.f, 2.f, 2.f})));
+
+    if (tuple_element1_has_two_uses) {
+      // Add 'gte0' and 'gte1' to create another user of 'gte1'.
+      gte0 = builder.AddInstruction(HloInstruction::CreateBinary(
+          data_shape, HloOpcode::kAdd, gte0, gte1));
+    }
+    // Create a DynamicUpdateSlice instruction of tuple element 1 with 'update'.
+    auto starts = builder.AddInstruction(
+        HloInstruction::CreateConstant(LiteralUtil::CreateR1<uint32>({2})));
+    auto dynamic_update_slice =
+        builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
+            data_shape, gte1, update, starts));
+    // Create output tuple.
+    auto tuple_root = builder.AddInstruction(
+        HloInstruction::CreateTuple({gte0, dynamic_update_slice}));
+    // Build module and get reference to entry computation.
+    auto module = CreateNewModule();
+    module->AddEntryComputation(BuildDummyComputation());
+    module->AddEmbeddedComputation(builder.Build());
+    // Run BufferLiveness on 'module'.
+    auto liveness =
+        BufferLiveness::Run(module.get(),
+                            MakeUnique<DependencyHloOrdering>(module.get()))
+            .ConsumeValueOrDie();
+    // Return whether or not buffer interference is detected between
+    // 'tuple_param0' and 'tuple_root' at shape index '{1}'.
+    return TupleElementsMayInterfere(*liveness, tuple_param0, tuple_root, {1});
+  }
+};
+
+// Tests that live ranges of buffers Param0[1] and Tuple[1] do not overlap in
+// the following computation (because DynamicUpdateSlice (at operand 0) is the
+// unique user):
+//
+//     Parameter0
+//      |       |
+//    GTE(0)  GTE(1) Const Const
+//      |        \     |    /
+//      |     DynamicUpdateSlice
+//       \      /
+//        Tuple
+//
+TEST_F(DynamicUpdateSliceLivenessTest, NoInterference) {
+  EXPECT_FALSE(Run(/*tuple_element1_has_two_uses=*/false));
+}
+
+// Tests that live ranges of buffers Param0[1] and Tuple[1] do overlap because
+// GTE(1) has two users:
+// 1) DynamicUpdateSlice at operand 0.
+// 2) Add at operand 1.
+//
+//     Parameter0
+//      |       |
+//    GTE(0)  GTE(1)
+//      |     /   |
+//      |    /    |
+//     Add  |   Const Const
+//      |   |     |     |
+//      |  DynamicUpdateSlice
+//       \      /
+//        Tuple
+//
+TEST_F(DynamicUpdateSliceLivenessTest, WithInterference) {
+  EXPECT_TRUE(Run(/*tuple_element1_has_two_uses=*/true));
+}
+
+}  // namespace
 }  // namespace xla
+
+int main(int argc, char** argv) {
+  return xla::ParseDebugOptionsFlagsAndRunTests(argc, argv);
+}
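The common thread in the DynamicUpdateSlice tests above is a unique-user rule: updating the operand in place is only safe when the updating instruction is the operand's sole user, since any second user still needs the original value. A standalone sketch of that check (illustrative structs only, not `CanShareOperandBufferWithUser` itself):

```cpp
#include <cassert>
#include <vector>

struct Node {};

struct Value {
  std::vector<const Node*> users;
};

// In-place update is safe only when `updater` is the operand's unique user.
bool CanUpdateInPlace(const Value& operand, const Node* updater) {
  return operand.users.size() == 1 && operand.users[0] == updater;
}

int main() {
  Node dus, add;
  Value gte1_single{{&dus}};        // only DynamicUpdateSlice reads gte1
  Value gte1_shared{{&dus, &add}};  // an Add also reads gte1
  assert(CanUpdateInPlace(gte1_single, &dus));   // NoInterference case
  assert(!CanUpdateInPlace(gte1_shared, &dus));  // WithInterference case
  return 0;
}
```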
diff --git a/tensorflow/compiler/xla/service/call_graph.cc b/tensorflow/compiler/xla/service/call_graph.cc
new file mode 100644
index 00000000000..fa7b2a30952
--- /dev/null
+++ b/tensorflow/compiler/xla/service/call_graph.cc
@@ -0,0 +1,306 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/call_graph.h"
+
+#include <queue>
+
+#include "tensorflow/compiler/xla/map_util.h"
+#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+
+using ::tensorflow::strings::Appendf;
+using ::tensorflow::strings::StrCat;
+
+string CallContextToString(CallContext context) {
+  switch (context) {
+    case CallContext::kNone:
+      return "kNone";
+    case CallContext::kSequential:
+      return "kSequential";
+    case CallContext::kParallel:
+      return "kParallel";
+    case CallContext::kBoth:
+      return "kBoth";
+  }
+}
+
+std::ostream& operator<<(std::ostream& out, const CallContext& context) {
+  out << CallContextToString(context);
+  return out;
+}
+
+CallContext GetInstructionCallContext(const HloInstruction* instruction) {
+  switch (instruction->opcode()) {
+    case HloOpcode::kCall:
+    case HloOpcode::kWhile:
+      return CallContext::kSequential;
+    case HloOpcode::kMap:
+    case HloOpcode::kReduce:
+    case HloOpcode::kReduceWindow:
+    case HloOpcode::kSelectAndScatter:
+    case HloOpcode::kFusion:
+      return CallContext::kParallel;
+    default:
+      return CallContext::kNone;
+  }
+}
+
+string CallSite::ToString() const {
+  return StrCat(instruction()->name(), " calls in context ",
+                CallContextToString(context()), ": ",
+                tensorflow::str_util::Join(
+                    called_computations(), ", ",
+                    [](string* out, const HloComputation* computation) {
+                      out->append(computation->name());
+                    }));
+}
+
+CallGraphNode::CallGraphNode(HloComputation* computation)
+    : computation_(computation) {}
+
+const CallSite* CallGraphNode::GetCallSite(
+    const HloInstruction* instruction) const {
+  auto it = callsite_instructions_.find(instruction);
+  if (it == callsite_instructions_.end()) {
+    return nullptr;
+  }
+  return &callsites_[it->second];
+}
+
+void CallGraphNode::AddCallerCallSite(const CallSite& caller_callsite) {
+  caller_callsites_.push_back(caller_callsite);
+  HloComputation* caller = caller_callsite.instruction()->parent();
+  if (!ContainsKey(caller_set_, caller)) {
+    callers_.push_back(caller);
+    caller_set_.insert(caller);
+  }
+}
+
+void CallGraphNode::AddCallSiteForInstruction(HloInstruction* instruction) {
+  CHECK_EQ(instruction->parent(), computation());
+  const CallContext context = GetInstructionCallContext(instruction);
+  if (!instruction->called_computations().empty()) {
+    CHECK(context == CallContext::kSequential ||
+          context == CallContext::kParallel);
+    callsite_instructions_.insert({instruction, callsites_.size()});
+    callsites_.push_back(
+        CallSite(instruction, instruction->called_computations(), context));
+    // Update callee computations to include any new computations called by this
+    // instruction.
+    for (auto* callee : callsites_.back().called_computations()) {
+      if (!ContainsKey(callee_set_, callee)) {
+        callees_.push_back(callee);
+        callee_set_.insert(callee);
+      }
+    }
+  }
+}
+
+CallGraph::CallGraph(const HloModule* module) : module_(module) {}
+
+const CallGraphNode& CallGraph::GetNode(
+    const HloComputation* computation) const {
+  auto it = node_indices_.find(computation);
+  CHECK(it != node_indices_.end());
+  return nodes_[it->second];
+}
+
+CallGraphNode& CallGraph::GetNode(const HloComputation* computation) {
+  auto it = node_indices_.find(computation);
+  CHECK(it != node_indices_.end());
+  return nodes_[it->second];
+}
+
+namespace {
+
+// Returns the call context of a computation which is called from contexts 'a'
+// and 'b'.
+CallContext UnionContexts(CallContext a, CallContext b) {
+  if (a == CallContext::kNone) {
+    return b;
+  } else if (b == CallContext::kNone) {
+    return a;
+  } else if (a == b) {
+    return a;
+  } else {
+    // Contexts are different and neither is kNone, i.e., one is kSequential
+    // and the other is kParallel.
+    return CallContext::kBoth;
+  }
+}
+
+}  // namespace
+
+void CallGraph::SetCallContexts() {
+  std::queue<CallGraphNode*> worklist;
+
+  // Initialize worklist with all roots of the call graph (computations without
+  // callers).
+  for (const std::unique_ptr<HloComputation>& computation :
+       module_->computations()) {
+    CallGraphNode& node = GetNode(computation.get());
+    if (node.callers().empty()) {
+      node.set_context(CallContext::kSequential);
+      worklist.push(&node);
+    }
+  }
+
+  while (!worklist.empty()) {
+    CallGraphNode* node = worklist.front();
+    worklist.pop();
+
+    for (const CallSite& callsite : node->callsites()) {
+      for (const HloComputation* callee : callsite.called_computations()) {
+        CallGraphNode& callee_node = GetNode(callee);
+
+        // Update context of callee computation based on the callsite and its
+        // current context.
+        CallContext context_to_add;
+        if (callsite.context() == CallContext::kParallel) {
+          context_to_add = CallContext::kParallel;
+        } else {
+          CHECK_EQ(callsite.context(), CallContext::kSequential);
+          context_to_add = node->context();
+        }
+        CallContext new_context =
+            UnionContexts(context_to_add, callee_node.context());
+
+        if (new_context != callee_node.context()) {
+          // Context of computation has been changed so add node to worklist.
+          callee_node.set_context(new_context);
+          worklist.push(&callee_node);
+        }
+      }
+    }
+  }
+
+  // No node should have a kNone calling context.
+  for (const std::unique_ptr<HloComputation>& computation :
+       module_->computations()) {
+    CHECK_NE(GetNode(computation.get()).context(), CallContext::kNone);
+  }
+}
+
+/* static */
+std::unique_ptr<CallGraph> CallGraph::Build(const HloModule* module) {
+  // Constructor for CallGraph is private so MakeUnique can't be used.
+  auto call_graph = WrapUnique(new CallGraph(module));
+
+  VLOG(2) << "Building call graph for:";
+  XLA_VLOG_LINES(2, module->ToString());
+
+  // Construct nodes of the call graph and populate the callsites.
+  for (const std::unique_ptr<HloComputation>& computation :
+       module->computations()) {
+    auto it_added = call_graph->node_indices_.insert(
+        {computation.get(), call_graph->nodes_.size()});
+    // All computations should be unique, so the computation should not already
+    // exist in the map.
+    CHECK(it_added.second);
+    call_graph->nodes_.emplace_back(computation.get());
+
+    // Add all callsites in this computation.
+    for (const std::unique_ptr<HloInstruction>& instruction :
+         computation->instructions()) {
+      call_graph->nodes_.back().AddCallSiteForInstruction(instruction.get());
+    }
+  }
+
+  // Add caller callsites to each node.
+  for (const std::unique_ptr<HloComputation>& computation :
+       module->computations()) {
+    for (const CallSite& callsite :
+         call_graph->GetNode(computation.get()).callsites()) {
+      for (auto* callee : callsite.called_computations()) {
+        // Add caller callsites.
+        call_graph->GetNode(callee).AddCallerCallSite(callsite);
+      }
+    }
+  }
+
+  call_graph->SetCallContexts();
+  XLA_VLOG_LINES(1, call_graph->ToString());
+
+  return call_graph;
+}
+
+Status CallGraph::VisitNodesInternal(
+    const VisitorFunction& visitor_func, const CallGraphNode& node,
+    tensorflow::gtl::FlatSet<const CallGraphNode*>* visited) const {
+  auto pair = visited->insert(&node);
+  if (!pair.second) {
+    // Node was not inserted. Node has already been visited.
+    return Status::OK();
+  }
+
+  for (const HloComputation* computation : node.callees()) {
+    TF_RETURN_IF_ERROR(
+        VisitNodesInternal(visitor_func, GetNode(computation), visited));
+  }
+
+  return visitor_func(node);
+}
+
+Status CallGraph::VisitNodes(const VisitorFunction& visitor_func,
+                             bool visit_unreachable_nodes) const {
+  tensorflow::gtl::FlatSet<const CallGraphNode*> visited;
+  if (visit_unreachable_nodes) {
+    // Traverse from all roots in the call graph.
+    for (const CallGraphNode& node : nodes()) {
+      if (node.callers().empty()) {
+        TF_RETURN_IF_ERROR(VisitNodesInternal(visitor_func, node, &visited));
+      }
+    }
+  } else {
+    // Traverse only from the entry computation.
+    TF_RETURN_IF_ERROR(VisitNodesInternal(
+        visitor_func, GetNode(module_->entry_computation()), &visited));
+  }
+
+  return Status::OK();
+}
+
+string CallGraph::ToString() const {
+  string out;
+  Appendf(&out, "Call graph for module %s:\n", module_->name().c_str());
+  for (const CallGraphNode& node : nodes()) {
+    Appendf(&out, "Computation %s:\n", node.computation()->name().c_str());
+    Appendf(&out, "  calls:\n");
+    for (const HloComputation* callee : node.callees()) {
+      Appendf(&out, "    %s\n", callee->name().c_str());
+    }
+    Appendf(&out, "  called by:\n");
+    for (const HloComputation* caller : node.callers()) {
+      Appendf(&out, "    %s\n", caller->name().c_str());
+    }
+    Appendf(&out, "  callsites:\n");
+    for (const CallSite& callsite : node.callsites()) {
+      Appendf(&out, "    %s\n", callsite.ToString().c_str());
+    }
+  }
+  return out;
+}
+
+}  // namespace xla
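Before the header, a quick note on the join rule that `SetCallContexts` relies on: `UnionContexts` treats `kNone` as the identity, equal contexts join to themselves, and mixing sequential with parallel yields `kBoth`. A standalone sketch with the same case analysis (illustrative enum only):

```cpp
#include <cassert>

enum class Ctx { kNone, kSequential, kParallel, kBoth };

Ctx Union(Ctx a, Ctx b) {
  if (a == Ctx::kNone) return b;  // kNone is the identity element
  if (b == Ctx::kNone) return a;
  if (a == b) return a;           // equal contexts join to themselves
  return Ctx::kBoth;              // one sequential, one parallel
}

int main() {
  assert(Union(Ctx::kNone, Ctx::kParallel) == Ctx::kParallel);
  assert(Union(Ctx::kSequential, Ctx::kSequential) == Ctx::kSequential);
  assert(Union(Ctx::kSequential, Ctx::kParallel) == Ctx::kBoth);
  return 0;
}
```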
diff --git a/tensorflow/compiler/xla/service/call_graph.h b/tensorflow/compiler/xla/service/call_graph.h
new file mode 100644
index 00000000000..7f9990f06d4
--- /dev/null
+++ b/tensorflow/compiler/xla/service/call_graph.h
@@ -0,0 +1,221 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Call graph for an HLO module.
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_CALL_GRAPH_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_CALL_GRAPH_H_
+
+#include <ostream>
+
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
+
+namespace xla {
+
+// The context in which a computation is called by another computation.
+enum class CallContext {
+  // In a parallel context the computation is applied to each element of the
+  // array argument(s). kMap and kReduce instructions call computations in
+  // parallel context.
+  kParallel,
+
+  // In a sequential context the computation is applied to the entire argument
+  // shape(s). kCall and kWhile (body and condition) call computations in
+  // sequential context.
+  kSequential,
+
+  // A computation is called from both a parallel and sequential context.
+  kBoth,
+
+  // During call graph construction kNone is used to indicate that the context
+  // has not been determined. This is the top value for the context
+  // lattice. After construction, no call sites or call graph nodes should have
+  // this value.
+  kNone
+};
+
+string CallContextToString(CallContext context);
+std::ostream& operator<<(std::ostream& out, const CallContext& context);
+
+CallContext GetInstructionCallContext(const HloInstruction* instruction);
+
+// Represents an HLO instruction which calls one or more computations.
+class CallSite {
+ public:
+  CallSite(HloInstruction* instruction,
+           const std::vector<HloComputation*>& called_computations,
+           CallContext context)
+      : instruction_(CHECK_NOTNULL(instruction)),
+        called_computations_(called_computations),
+        context_(context) {}
+
+  // Returns the instruction associated with this call site.
+  HloInstruction* instruction() const { return instruction_; }
+
+  // Returns the computations called at this call site.
+  const std::vector<HloComputation*>& called_computations() const {
+    return called_computations_;
+  }
+
+  // Returns the context in which computations are called at this call site.
+  CallContext context() const { return context_; }
+
+  string ToString() const;
+
+ private:
+  // The calling instruction.
+  HloInstruction* instruction_;
+
+  // The computations called by this callsite.
+  const std::vector<HloComputation*> called_computations_;
+
+  // The context in which the computations are called.
+  const CallContext context_;
+};
+
+// A node in the call graph representing an HLO computation.
+class CallGraphNode {
+ public:
+  CallGraphNode(HloComputation* computation);
+
+  // Returns the computation represented by this call graph node.
+  HloComputation* computation() const { return computation_; }
+
+  // Returns the call sites in this computation. These are the instructions in
+  // this computation which call other computations.
+  const std::vector<CallSite>& callsites() const { return callsites_; }
+
+  // Returns the callsite associated with the given instruction. If this
+  // instruction calls no computations, nullptr is returned.
+  // Prerequisite: instruction is in the computation associated with this call
+  // graph node.
+  const CallSite* GetCallSite(const HloInstruction* instruction) const;
+
+  // Returns the computations called by this computation.
+  const std::vector<HloComputation*>& callees() const { return callees_; }
+
+  // Returns the call sites in other computations which call this computation.
+  const std::vector<CallSite>& caller_callsites() const {
+    return caller_callsites_;
+  }
+
+  // Returns the computations which call this computation.
+  const std::vector<HloComputation*>& callers() const { return callers_; }
+
+  // Returns the context in which this computation is called.
+  CallContext context() const { return context_; }
+
+  string ToString() const;
+
+ private:
+  // Only CallGraph can modify CallGraphNode.
+  friend class CallGraph;
+
+  // Sets the context in which this computation is called.
+  void set_context(CallContext value) { context_ = value; }
+
+  // Adds a callsite which calls this computation. Updates callers to include
+  // the calling computation.
+  void AddCallerCallSite(const CallSite& caller_callsite);
+
+  // If the instruction calls any computations, adds a call site for this
+  // instruction to the call graph node. If the instruction calls no
+  // computations then no call site is added.
+  void AddCallSiteForInstruction(HloInstruction* instruction);
+
+  // Computation represented by this call graph node.
+  HloComputation* computation_;
+
+  // The computations called by this computation. The vector is used for a
+  // stable ordering and the set enables fast membership testing.
+  std::vector<HloComputation*> callees_;
+  tensorflow::gtl::FlatSet<HloComputation*> callee_set_;
+
+  // The computations which call this computation. The vector is used for a
+  // stable ordering and the set enables fast membership testing.
+  std::vector<HloComputation*> callers_;
+  tensorflow::gtl::FlatSet<HloComputation*> caller_set_;
+
+  // The call sites in this computation.
+  std::vector<CallSite> callsites_;
+
+  // The map from instruction to index in callsites_ for looking up the
+  // callsite (if any) associated with a particular instruction in this
+  // computation.
+  tensorflow::gtl::FlatMap<const HloInstruction*, int64> callsite_instructions_;
+
+  // The call sites in other computations which call this computation.
+  std::vector<CallSite> caller_callsites_;
+
+  // The context in which this computation is called.
+  CallContext context_ = CallContext::kNone;
+};
+
+// The call graph for an HLO module. The graph includes a node for each
+// computation in the module.
+class CallGraph {
+ public:
+  using VisitorFunction = std::function<Status(const CallGraphNode&)>;
+
+  // Builds and returns a call graph for the given HLO module.
+  static std::unique_ptr<CallGraph> Build(const HloModule* module);
+
+  // Returns the node associated with the given computation.
+  const CallGraphNode& GetNode(const HloComputation* computation) const;
+  CallGraphNode& GetNode(const HloComputation* computation);
+
+  // Returns the vector of all nodes in the call graph.
+  const std::vector<CallGraphNode>& nodes() const { return nodes_; }
+
+  // Calls the given function on each node in the call graph. Nodes are visited
+  // in post order (callees before callers). If visit_unreachable_nodes is true
+  // then all nodes in the call graph are visited. Otherwise only those nodes
+  // reachable from the entry computation are visited.
+  Status VisitNodes(const VisitorFunction& visitor_func,
+                    bool visit_unreachable_nodes = true) const;
+
+  string ToString() const;
+
+ private:
+  CallGraph(const HloModule* module);
+
+  // Sets the call contexts for every node in the graph.
+  void SetCallContexts();
+
+  // Helper method for VisitNodes(). Traverses the call graph from 'node' in
+  // DFS post order (callee before caller) calling visitor_func on each node.
+  // Adds nodes to 'visited' as each node is visited. Skips nodes already in
+  // 'visited'.
+  Status VisitNodesInternal(
+      const VisitorFunction& visitor_func, const CallGraphNode& node,
+      tensorflow::gtl::FlatSet<const CallGraphNode*>* visited) const;
+
+  // The HLO module represented by this call graph.
+  const HloModule* module_ = nullptr;
+
+  // Vector of all nodes in the call graph.
+  std::vector<CallGraphNode> nodes_;
+
+  // Map from HLO computation to the index of the corresponding call graph node
+  // in nodes_.
+  tensorflow::gtl::FlatMap<const HloComputation*, int64> node_indices_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_CALL_GRAPH_H_
diff --git a/tensorflow/compiler/xla/service/call_graph_test.cc b/tensorflow/compiler/xla/service/call_graph_test.cc
new file mode 100644
index 00000000000..e276473c90a
--- /dev/null
+++ b/tensorflow/compiler/xla/service/call_graph_test.cc
@@ -0,0 +1,391 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/call_graph.h"
+
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+namespace {
+
+using ::testing::UnorderedElementsAre;
+
+class CallGraphTest : public HloTestBase {
+ protected:
+  // Build and return a trivial computation taking and returning a scalar.
+  std::unique_ptr<HloComputation> MakeScalarComputation() {
+    HloComputation::Builder builder(TestName() + ".ScalarComputation");
+    HloInstruction* param0 = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, kScalarShape, "param0"));
+    builder.AddInstruction(
+        HloInstruction::CreateUnary(kScalarShape, HloOpcode::kNegate, param0));
+    return builder.Build();
+  }
+
+  // Build and return a computation which takes a scalar and maps (kMap) the
+  // given computation to the value 'callsites' number of times.
+  std::unique_ptr<HloComputation> MakeMappingComputation(
+      HloComputation* map_computation, int64 callsites) {
+    HloComputation::Builder builder(TestName() + ".MappingComputation");
+    HloInstruction* param0 = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, kScalarShape, "param0"));
+    HloInstruction* last_value = param0;
+    for (int64 i = 0; i < callsites; ++i) {
+      last_value = builder.AddInstruction(HloInstruction::CreateMap(
+          kScalarShape, {last_value}, map_computation));
+    }
+    return builder.Build();
+  }
+
+  // Build and return a computation which takes a scalar and calls (kCall) the
+  // given computation with value 'callsites' number of times.
+  std::unique_ptr<HloComputation> MakeCallingComputation(
+      HloComputation* callee_computation, int64 callsites,
+      const string& suffix = ".CallingComputation") {
+    HloComputation::Builder builder(TestName() + suffix);
+    HloInstruction* param0 = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, kScalarShape, "param0"));
+    HloInstruction* last_value = param0;
+    for (int64 i = 0; i < callsites; ++i) {
+      last_value = builder.AddInstruction(HloInstruction::CreateCall(
+          kScalarShape, {last_value}, callee_computation));
+    }
+    return builder.Build();
+  }
+
+  // Build and return a computation which takes a scalar and returns a PRED
+  // value.
+  std::unique_ptr<HloComputation> MakeConditionComputation() {
+    HloComputation::Builder builder(TestName() + ".ConditionComputation");
+    HloInstruction* param0 = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, kScalarShape, "param0"));
+    HloInstruction* zero = builder.AddInstruction(
+        HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0f)));
+    builder.AddInstruction(HloInstruction::CreateBinary(
+        ShapeUtil::MakeShape(PRED, {}), HloOpcode::kGt, param0, zero));
+    return builder.Build();
+  }
+
+  const Shape kScalarShape = ShapeUtil::MakeShape(F32, {});
+};
+
+TEST_F(CallGraphTest, SingletonComputation) {
+  // Test the call graph of a module with a single computation.
+  auto module = CreateNewModule();
+  HloComputation* computation =
+      module->AddEntryComputation(MakeScalarComputation());
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
+  EXPECT_EQ(1, call_graph->nodes().size());
+  const CallGraphNode& node = call_graph->GetNode(computation);
+  EXPECT_EQ(computation, node.computation());
+  EXPECT_TRUE(node.callsites().empty());
+  EXPECT_TRUE(node.callees().empty());
+  EXPECT_TRUE(node.caller_callsites().empty());
+  EXPECT_TRUE(node.callers().empty());
+  EXPECT_EQ(CallContext::kSequential, node.context());
+}
+
+TEST_F(CallGraphTest, UnreachableComputation) {
+  // Test the call graph of a module with an entry computation and an
+  // unreachable computation.
+  auto module = CreateNewModule();
+  HloComputation* entry_computation =
+      module->AddEntryComputation(MakeScalarComputation());
+  HloComputation* unreachable_computation =
+      module->AddEmbeddedComputation(MakeScalarComputation());
+
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
+  EXPECT_EQ(2, call_graph->nodes().size());
+
+  const CallGraphNode& entry_node = call_graph->GetNode(entry_computation);
+  EXPECT_EQ(entry_computation, entry_node.computation());
+  EXPECT_EQ(CallContext::kSequential, entry_node.context());
+
+  const CallGraphNode& unreachable_node =
+      call_graph->GetNode(unreachable_computation);
+  EXPECT_EQ(unreachable_computation, unreachable_node.computation());
+  EXPECT_EQ(CallContext::kSequential, unreachable_node.context());
+}
+
+TEST_F(CallGraphTest, ParallelComputation) {
+  // Test a call graph of a module with an entry computation which calls
+  // another computation in a parallel context via kMap.
+TEST_F(CallGraphTest, ParallelComputation) {
+  // Test a call graph of a module with an entry computation which calls
+  // another computation in a parallel context via kMap.
+  auto module = CreateNewModule();
+  HloComputation* map_computation =
+      module->AddEmbeddedComputation(MakeScalarComputation());
+  HloComputation* entry_computation = module->AddEntryComputation(
+      MakeMappingComputation(map_computation, /*callsites=*/5));
+
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
+  EXPECT_EQ(2, call_graph->nodes().size());
+
+  const CallGraphNode& entry_node = call_graph->GetNode(entry_computation);
+  EXPECT_EQ(entry_computation, entry_node.computation());
+  EXPECT_EQ(CallContext::kSequential, entry_node.context());
+  EXPECT_EQ(5, entry_node.callsites().size());
+  EXPECT_EQ(1, entry_node.callees().size());
+  EXPECT_TRUE(entry_node.caller_callsites().empty());
+  EXPECT_TRUE(entry_node.callers().empty());
+
+  const CallGraphNode& map_node = call_graph->GetNode(map_computation);
+  EXPECT_EQ(map_computation, map_node.computation());
+  EXPECT_EQ(CallContext::kParallel, map_node.context());
+  EXPECT_TRUE(map_node.callsites().empty());
+  EXPECT_TRUE(map_node.callees().empty());
+  EXPECT_EQ(5, map_node.caller_callsites().size());
+  EXPECT_EQ(1, map_node.callers().size());
+}
+
+TEST_F(CallGraphTest, SequentialComputations) {
+  // Test a call graph of a module with an entry computation which calls
+  // another computation in a sequential context via kCall.
+  auto module = CreateNewModule();
+  HloComputation* called_computation =
+      module->AddEmbeddedComputation(MakeScalarComputation());
+  HloComputation* entry_computation = module->AddEntryComputation(
+      MakeCallingComputation(called_computation, /*callsites=*/3));
+
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
+  EXPECT_EQ(2, call_graph->nodes().size());
+
+  const CallGraphNode& entry_node = call_graph->GetNode(entry_computation);
+  EXPECT_EQ(entry_computation, entry_node.computation());
+  EXPECT_EQ(CallContext::kSequential, entry_node.context());
+  EXPECT_EQ(3, entry_node.callsites().size());
+  EXPECT_EQ(1, entry_node.callees().size());
+  EXPECT_TRUE(entry_node.caller_callsites().empty());
+  EXPECT_TRUE(entry_node.callers().empty());
+
+  const CallGraphNode& called_node = call_graph->GetNode(called_computation);
+  EXPECT_EQ(called_computation, called_node.computation());
+  EXPECT_EQ(CallContext::kSequential, called_node.context());
+  EXPECT_TRUE(called_node.callsites().empty());
+  EXPECT_TRUE(called_node.callees().empty());
+  EXPECT_EQ(3, called_node.caller_callsites().size());
+  EXPECT_EQ(1, called_node.callers().size());
+}
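The two tests above pin down the context rules one at a time: kCall (and kWhile) callsites are sequential, kMap callsites are parallel. A computation reached through both kinds of callsite is expected to report `kBoth`, as the next test verifies. One plausible way to express that merge; this helper is an assumption, not part of the diff:

```cpp
// Hypothetical merge rule for the context of a computation that is the
// target of multiple callsites; not taken from this change.
CallContext UnionContexts(CallContext a, CallContext b) {
  if (a == b) {
    return a;  // Same kind of callsite; the context is unchanged.
  }
  return CallContext::kBoth;  // Sequential + parallel => kBoth.
}
```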
+TEST_F(CallGraphTest, ContextBothComputations) {
+  // Test a call graph of a module with an entry computation which calls
+  // another computation in both a parallel and sequential context.
+  auto module = CreateNewModule();
+  HloComputation* subcomputation =
+      module->AddEmbeddedComputation(MakeScalarComputation());
+
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, kScalarShape, "param0"));
+  HloInstruction* call = builder.AddInstruction(
+      HloInstruction::CreateCall(kScalarShape, {param0}, subcomputation));
+  HloInstruction* map = builder.AddInstruction(
+      HloInstruction::CreateMap(kScalarShape, {call}, subcomputation));
+  HloComputation* entry_computation =
+      module->AddEntryComputation(builder.Build());
+
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
+  EXPECT_EQ(2, call_graph->nodes().size());
+
+  const CallGraphNode& entry_node = call_graph->GetNode(entry_computation);
+  EXPECT_EQ(entry_computation, entry_node.computation());
+  EXPECT_EQ(2, entry_node.callsites().size());
+
+  const CallSite& call_callsite = entry_node.callsites()[0];
+  EXPECT_EQ(call, call_callsite.instruction());
+  EXPECT_THAT(call_callsite.called_computations(),
+              UnorderedElementsAre(subcomputation));
+  EXPECT_EQ(CallContext::kSequential, call_callsite.context());
+  EXPECT_EQ(entry_node.GetCallSite(call), &call_callsite);
+
+  const CallSite& map_callsite = entry_node.callsites()[1];
+  EXPECT_EQ(map, map_callsite.instruction());
+  EXPECT_THAT(map_callsite.called_computations(),
+              UnorderedElementsAre(subcomputation));
+  EXPECT_EQ(CallContext::kParallel, map_callsite.context());
+  EXPECT_EQ(entry_node.GetCallSite(map), &map_callsite);
+
+  const CallGraphNode& sub_node = call_graph->GetNode(subcomputation);
+  EXPECT_EQ(CallContext::kBoth, sub_node.context());
+}
+
+TEST_F(CallGraphTest, ComplexGraph) {
+  // Test a call graph of a module with several computations called in various
+  // contexts. The call graph looks like:
+  //
+  //      entry
+  //      /  |
+  //     a   |
+  //   / | \ |
+  //  b  |  cond
+  //   \ |
+  //    c
+  //
+  // Calls are made via kCall, kWhile, and kMap instructions.
+  auto module = CreateNewModule();
+  HloComputation* cond_computation =
+      module->AddEmbeddedComputation(MakeConditionComputation());
+  HloComputation* c_computation =
+      module->AddEmbeddedComputation(MakeScalarComputation());
+  HloComputation* b_computation = module->AddEmbeddedComputation(
+      MakeMappingComputation(c_computation, /*callsites=*/1));
+
+  HloComputation* a_computation;
+  {
+    HloComputation::Builder builder(TestName() + ".a");
+    HloInstruction* param0 = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, kScalarShape, "param0"));
+    HloInstruction* call = builder.AddInstruction(
+        HloInstruction::CreateCall(kScalarShape, {param0}, c_computation));
+    builder.AddInstruction(HloInstruction::CreateWhile(
+        kScalarShape, cond_computation, b_computation, call));
+    a_computation = module->AddEmbeddedComputation(builder.Build());
+  }
+
+  HloComputation* entry_computation;
+  {
+    HloComputation::Builder builder(TestName() + ".entry");
+    HloInstruction* param0 = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, kScalarShape, "param0"));
+    builder.AddInstruction(HloInstruction::CreateWhile(
+        kScalarShape, cond_computation, a_computation, param0));
+    entry_computation = module->AddEntryComputation(builder.Build());
+  }
+
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
+  EXPECT_EQ(5, call_graph->nodes().size());
+
+  // Entry computation has one while instruction calling two computations
+  // (cond_computation and a_computation).
+  const CallGraphNode& entry_node = call_graph->GetNode(entry_computation);
+  ASSERT_EQ(1, entry_node.callsites().size());
+  const std::vector<HloComputation*>& called_computations =
+      entry_node.callsites()[0].called_computations();
+  EXPECT_THAT(called_computations,
+              UnorderedElementsAre(cond_computation, a_computation));
+  EXPECT_EQ(CallContext::kSequential, entry_node.context());
+
+  const CallGraphNode& c_node = call_graph->GetNode(c_computation);
+  EXPECT_TRUE(c_node.callsites().empty());
+  EXPECT_THAT(c_node.callers(),
+              UnorderedElementsAre(a_computation, b_computation));
+  EXPECT_EQ(CallContext::kBoth, c_node.context());
+
+  // Visit the graph and verify nodes were visited in callee-before-caller
+  // order.
+  std::vector<const HloComputation*> visited;
+  TF_ASSERT_OK(call_graph->VisitNodes([&visited](const CallGraphNode& node) {
+    visited.push_back(node.computation());
+    return Status::OK();
+  }));
+  EXPECT_EQ(visited.size(), 5);
+  // All values in visited should be unique.
+  EXPECT_EQ(
+      std::unordered_set<const HloComputation*>(visited.begin(), visited.end())
+          .size(),
+      5);
+
+  // Verify visitation order of some computations in the graph.
+  auto index_of = [&visited](const HloComputation* comp) {
+    auto it = std::find(visited.begin(), visited.end(), comp);
+    EXPECT_NE(it, visited.end());
+    return std::distance(visited.begin(), it);
+  };
+  EXPECT_EQ(4, index_of(entry_computation));
+  EXPECT_LT(index_of(cond_computation), index_of(a_computation));
+  EXPECT_LT(index_of(c_computation), index_of(b_computation));
+  EXPECT_LT(index_of(b_computation), index_of(a_computation));
+}
+
+TEST_F(CallGraphTest, VisitSingletonComputation) {
+  // Test the call graph visitor with a call graph with a single node.
+  auto module = CreateNewModule();
+  HloComputation* computation =
+      module->AddEntryComputation(MakeScalarComputation());
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
+
+  std::vector<const HloComputation*> visited;
+  TF_ASSERT_OK(call_graph->VisitNodes([&visited](const CallGraphNode& node) {
+    visited.push_back(node.computation());
+    return Status::OK();
+  }));
+  EXPECT_THAT(visited, UnorderedElementsAre(computation));
+}
+
+TEST_F(CallGraphTest, VisitUnreachableComputation) {
+  // Test the call graph visitor with a call graph with an unreachable node.
+  auto module = CreateNewModule();
+  HloComputation* entry_computation =
+      module->AddEntryComputation(MakeScalarComputation());
+  HloComputation* unreachable_computation =
+      module->AddEmbeddedComputation(MakeScalarComputation());
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
+
+  // Test visitation of only reachable nodes.
+  {
+    std::vector<const HloComputation*> visited;
+    TF_ASSERT_OK(call_graph->VisitNodes(
+        [&visited](const CallGraphNode& node) {
+          visited.push_back(node.computation());
+          return Status::OK();
+        },
+        /*visit_unreachable_nodes=*/false));
+    EXPECT_EQ(visited.size(), 1);
+    EXPECT_EQ(visited[0], entry_computation);
+  }
+
+  // Test visitation of all nodes (reachable and unreachable).
+  {
+    std::vector<const HloComputation*> visited;
+    TF_ASSERT_OK(call_graph->VisitNodes(
+        [&visited](const CallGraphNode& node) {
+          visited.push_back(node.computation());
+          return Status::OK();
+        },
+        /*visit_unreachable_nodes=*/true));
+    EXPECT_EQ(visited.size(), 2);
+    EXPECT_THAT(visited, UnorderedElementsAre(entry_computation,
+                                              unreachable_computation));
+  }
+}
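The ordering assertions in `ComplexGraph` (`cond` before `a`, `c` before `b`, `b` before `a`, entry last) fall out naturally if `VisitNodes` performs a depth-first traversal that emits a node only after all of its callees. The diff declares `VisitNodesInternal` but does not show its body; the following is a sketch under that post-order assumption, not the change's actual implementation.

```cpp
// Callee-before-caller emission via depth-first post-order. The visited set
// mirrors the FlatSet parameter declared for VisitNodesInternal above and
// prevents re-emitting computations shared by several callers.
Status VisitPostOrder(const CallGraph& call_graph, const CallGraphNode& node,
                      tensorflow::gtl::FlatSet<const CallGraphNode*>* visited,
                      const CallGraph::VisitorFunction& visitor_func) {
  if (!visited->insert(&node).second) {
    return Status::OK();  // Already emitted via another caller.
  }
  for (const HloComputation* callee : node.callees()) {
    TF_RETURN_IF_ERROR(VisitPostOrder(call_graph, call_graph.GetNode(callee),
                                      visited, visitor_func));
  }
  return visitor_func(node);  // All callees emitted; now emit this node.
}
```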
+TEST_F(CallGraphTest, VisitWithError) {
+  // Test that the call graph visitor properly propagates errors.
+  auto module = CreateNewModule();
+  module->AddEntryComputation(MakeScalarComputation());
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
+
+  Status status = call_graph->VisitNodes(
+      [](const CallGraphNode&) { return InternalError("Visitation failed"); });
+
+  ASSERT_FALSE(status.ok());
+  ASSERT_EQ(status.code(), tensorflow::error::INTERNAL);
+  ASSERT_THAT(status.error_message(),
+              ::testing::HasSubstr("Visitation failed"));
+}
+
+}  // namespace
+}  // namespace xla
+
+int main(int argc, char** argv) {
+  return xla::ParseDebugOptionsFlagsAndRunTests(argc, argv);
+}
diff --git a/tensorflow/compiler/xla/service/compile_only_service.cc b/tensorflow/compiler/xla/service/compile_only_service.cc
new file mode 100644
index 00000000000..0d1a439724a
--- /dev/null
+++ b/tensorflow/compiler/xla/service/compile_only_service.cc
@@ -0,0 +1,129 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/compile_only_service.h"
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/service/backend.h"
+#include "tensorflow/compiler/xla/service/computation_layout.h"
+#include "tensorflow/compiler/xla/service/computation_tracker.h"
+#include "tensorflow/compiler/xla/service/platform_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+
+namespace xla {
+
+/* static */ StatusOr<std::unique_ptr<CompileOnlyService>>
+CompileOnlyService::NewService(perftools::gputools::Platform* platform) {
+  ServiceOptions default_options;
+  default_options.set_platform(platform);
+  return NewService(default_options);
+}
+
+/* static */ StatusOr<std::unique_ptr<CompileOnlyService>>
+CompileOnlyService::NewService(const ServiceOptions& options) {
+  perftools::gputools::Platform* platform = options.platform();
+  if (platform == nullptr) {
+    TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform());
+  }
+
+  TF_ASSIGN_OR_RETURN(auto compiler, Compiler::GetForPlatform(platform));
+
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<Backend> compute_constant_backend,
+                      CreateComputeConstantBackend());
+  std::unique_ptr<CompileOnlyService> service(
+      new CompileOnlyService(compiler, std::move(compute_constant_backend)));
+  return std::move(service);
+}
+
+CompileOnlyService::CompileOnlyService(
+    Compiler* compiler, std::unique_ptr<Backend> compute_constant_backend)
+    : Service(/*backend=*/nullptr, std::move(compute_constant_backend)),
+      compiler_(compiler) {
+  runs_in_client_process_ = true;
+}
+
+StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+CompileOnlyService::CompileAheadOfTime(
+    const tensorflow::gtl::ArraySlice<AotComputationInstance> computations,
+    const AotCompilationOptions& options) {
+  std::vector<std::unique_ptr<HloModule>> hlo_modules;
+  for (const
AotComputationInstance& instance : computations) { + TF_ASSIGN_OR_RETURN(UserComputation * user_computation, + computation_tracker_.Resolve(instance.computation)); + VersionedComputationHandle versioned_handle = + user_computation->GetVersionedHandle(); + + // Dump computation proto state if flag is set. + legacy_flags::ServiceFlags* flags = legacy_flags::GetServiceFlags(); + const string& directory_path = flags->xla_dump_computations_to; + if (!directory_path.empty()) { + TF_ASSIGN_OR_RETURN( + std::unique_ptr session_module, + computation_tracker_.SnapshotComputation(versioned_handle.handle)); + string filename = tensorflow::strings::StrCat( + "computation_", versioned_handle.handle.handle(), "__", + session_module->entry().name(), "__version_", + versioned_handle.version); + TF_RETURN_IF_ERROR(Executable::DumpToDirectory(directory_path, filename, + *session_module)); + } + + TF_ASSIGN_OR_RETURN( + std::shared_ptr program_shape, + user_computation->ComputeProgramShape(versioned_handle.version)); + + HloModuleConfig hlo_module_config(*program_shape); + hlo_module_config.set_debug_options( + legacy_flags::GetDebugOptionsFromFlags()); + auto* computation_layout = + hlo_module_config.mutable_entry_computation_layout(); + if (flags->xla_hlo_profile) { + hlo_module_config.enable_hlo_profiling(true); + } + for (int i = 0; i < instance.argument_layouts.size(); ++i) { + const Shape& argument_layout = *instance.argument_layouts[i]; + if (ShapeUtil::IsTuple(argument_layout)) { + return Unimplemented("tuple arguments not supported yet"); + } + TF_RETURN_IF_ERROR( + computation_layout->mutable_parameter_layout(i)->CopyLayoutFromShape( + argument_layout)); + } + TF_RETURN_IF_ERROR( + computation_layout->mutable_result_layout()->CopyLayoutFromShape( + *instance.result_layout)); + + TF_ASSIGN_OR_RETURN(std::unique_ptr hlo_module, + computation_tracker_.BuildHloModule( + versioned_handle, hlo_module_config, + /*include_unreachable_instructions=*/true)); + hlo_modules.push_back(std::move(hlo_module)); + } + + return compiler_->CompileAheadOfTime(std::move(hlo_modules), + MakeHloDumper(), options); +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/compile_only_service.h b/tensorflow/compiler/xla/service/compile_only_service.h new file mode 100644 index 00000000000..3358305c03c --- /dev/null +++ b/tensorflow/compiler/xla/service/compile_only_service.h @@ -0,0 +1,118 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_COMPILE_ONLY_SERVICE_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_COMPILE_ONLY_SERVICE_H_
+
+#include "tensorflow/compiler/xla/service/backend.h"
+#include "tensorflow/compiler/xla/service/compiler.h"
+#include "tensorflow/compiler/xla/service/service.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+
+namespace xla {
+
+// An XLA Service specialization for ahead-of-time compilation. This only
+// instantiates a Compiler object for the relevant platform; it does not
+// instantiate or require an execution backend.
+class CompileOnlyService : public Service {
+ public:
+  // Factory for creating a CompileOnlyService. The parameter platform is the
+  // platform that the service should target. If platform is null then the
+  // default platform is used.
+  static StatusOr<std::unique_ptr<CompileOnlyService>> NewService(
+      perftools::gputools::Platform* platform);
+  static StatusOr<std::unique_ptr<CompileOnlyService>> NewService(
+      const ServiceOptions& options);
+
+  // A description of a computation to compile using CompileAheadOfTime.
+  struct AotComputationInstance {
+    ComputationHandle computation;
+    std::vector<const Shape*> argument_layouts;
+    const Shape* result_layout = nullptr;
+  };
+
+  // Compiles a list of computations for ahead-of-time execution. This is
+  // intended for use in static compilation. See
+  // |CompileOnlyClient::CompileAheadOfTime| for additional details.
+  StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+  CompileAheadOfTime(
+      const tensorflow::gtl::ArraySlice<AotComputationInstance> computations,
+      const AotCompilationOptions& options);
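To see how the pieces declared above fit together, here is a hypothetical AOT call sequence. Every name below (`computation_handle`, `arg_shape`, `result_shape`, `service_options`, `aot_options`) is a placeholder for values the caller would already have; none are defined in this change, and the snippet assumes an enclosing function whose return type lets `TF_ASSIGN_OR_RETURN` propagate errors.

```cpp
// Hypothetical ahead-of-time compilation flow against the interface above.
CompileOnlyService::AotComputationInstance instance;
instance.computation = computation_handle;       // placeholder handle
instance.argument_layouts = {&arg_shape};        // placeholder argument shape
instance.result_layout = &result_shape;          // placeholder result shape

TF_ASSIGN_OR_RETURN(std::unique_ptr<CompileOnlyService> service,
                    CompileOnlyService::NewService(service_options));
TF_ASSIGN_OR_RETURN(auto aot_results,
                    service->CompileAheadOfTime({instance}, aot_options));
// aot_results holds one AotCompilationResult per requested computation.
```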
+  // Override Service methods that require or imply the existence of an
+  // execute backend. Note that this does not include TransferToClient, as
+  // computing constants produces global data that we may wish to transfer.
+  tensorflow::Status Execute(const ExecuteRequest* arg,
+                             ExecuteResponse* result) override {
+    return Unimplemented("CompileOnlyService does not support execution.");
+  }
+  tensorflow::Status ExecuteParallel(const ExecuteParallelRequest* arg,
+                                     ExecuteParallelResponse* result) override {
+    return Unimplemented("CompileOnlyService does not support execution.");
+  }
+  tensorflow::Status GetDeviceHandles(
+      const GetDeviceHandlesRequest* arg,
+      GetDeviceHandlesResponse* result) override {
+    return Unimplemented("CompileOnlyService does not support devices.");
+  }
+  tensorflow::Status ExecuteAsync(const ExecuteAsyncRequest* arg,
+                                  ExecuteAsyncResponse* result) override {
+    return Unimplemented("CompileOnlyService does not support execution.");
+  }
+  tensorflow::Status WaitForExecution(
+      const WaitForExecutionRequest* arg,
+      WaitForExecutionResponse* result) override {
+    return Unimplemented("CompileOnlyService does not support execution.");
+  }
+  tensorflow::Status TransferToServer(
+      const TransferToServerRequest* arg,
+      TransferToServerResponse* result) override {
+    return Unimplemented(
+        "CompileOnlyService does not support device data transfers.");
+  }
+  tensorflow::Status TransferToInfeed(
+      const TransferToInfeedRequest* arg,
+      TransferToInfeedResponse* result) override {
+    return Unimplemented(
+        "CompileOnlyService does not support device data transfers.");
+  }
+  tensorflow::Status TransferFromOutfeed(
+      const TransferFromOutfeedRequest* arg,
+      TransferFromOutfeedResponse* result) override {
+    return Unimplemented(
+        "CompileOnlyService does not support device data transfers.");
+  }
+  tensorflow::Status ResetDevice(const ResetDeviceRequest* arg,
+                                 ResetDeviceResponse* result) override {
+    return Unimplemented("CompileOnlyService does not support devices.");
+  }
+
+ private:
+  explicit CompileOnlyService(
+      Compiler* compiler, std::unique_ptr<Backend> compute_constant_backend);
+  CompileOnlyService(const CompileOnlyService&) = delete;
+  void operator=(const CompileOnlyService&) = delete;
+
+  // The compiler for the target platform. This is included in place of
+  // the Service::execute_backend_'s compiler, since execute_backend_ is a
+  // nullptr in CompileOnlyService.
+  Compiler* compiler_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_COMPILE_ONLY_SERVICE_H_
diff --git a/tensorflow/compiler/xla/service/compiler.h b/tensorflow/compiler/xla/service/compiler.h
index 85c2d03e1bc..7ae285170e4 100644
--- a/tensorflow/compiler/xla/service/compiler.h
+++ b/tensorflow/compiler/xla/service/compiler.h
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/executable.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_module_config.h"
+#include "tensorflow/compiler/xla/service/logical_buffer.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
@@ -110,29 +111,24 @@ class Compiler {
   // The compiler may optionally specialize to the individual device
   // (not just type of device) indicated by the executor.
   //
-  // TODO(leary) will need to update this API when a single computation can run
-  // across multiple devices simultaneously.
+  // Use the overload below to compile computations that run in parallel.
   virtual StatusOr<std::unique_ptr<Executable>> Compile(
-      std::unique_ptr<HloModule> module,
-      std::unique_ptr<HloModuleConfig> module_config, HloDumper dump_hlo,
+      std::unique_ptr<HloModule> module, HloDumper dump_hlo,
       perftools::gputools::StreamExecutor* executor) = 0;
 
   // Compiles a set of HLO modules that can run in parallel, potentially
   // communicating data between the modules, and returns a corresponding
   // sequence of executable objects.
   virtual StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
-      std::vector<std::unique_ptr<HloModule>> hlo_module,
-      std::vector<std::unique_ptr<HloModuleConfig>> module_config,
-      HloDumper dump_hlo,
+      std::vector<std::unique_ptr<HloModule>> modules, HloDumper dump_hlo,
       std::vector<perftools::gputools::StreamExecutor*> stream_exec) = 0;
 
   // Compiles the HLO module for ahead-of-time execution. This is intended for
   // use in static compilation.
   virtual StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
-  CompileAheadOfTime(
-      std::vector<std::unique_ptr<HloModule>> module,
-      std::vector<std::unique_ptr<HloModuleConfig>> module_config,
-      HloDumper dump_hlo, const AotCompilationOptions& options) = 0;
+  CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
+                     HloDumper dump_hlo,
+                     const AotCompilationOptions& options) = 0;
 
   /////
   // The Compiler class also serves as a point to register compiler objects
@@ -153,6 +149,19 @@ class Compiler {
   static StatusOr<Compiler*> GetForPlatform(
       const perftools::gputools::Platform* platform);
 
+  // Returns a function that computes the size in bytes of the logical
+  // buffer that contains a shape.
+  virtual HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const = 0;
+
+  // Returns a function that computes the size in bytes of a given
+  // logical buffer.
+  std::function<int64(const LogicalBuffer&)> BufferSizeBytesFunction() {
+    HloCostAnalysis::ShapeSizeFunction shape_size = ShapeSizeBytesFunction();
+    return [shape_size](const LogicalBuffer& buffer) {
+      return shape_size(buffer.shape());
+    };
+  }
+
  private:
   // Mutex that guards the platform-compiler map.
   static tensorflow::mutex* platform_compiler_mutex_;
diff --git a/tensorflow/compiler/xla/service/computation_tracker.cc b/tensorflow/compiler/xla/service/computation_tracker.cc
index 281277bed57..9aa32a1fb76 100644
--- a/tensorflow/compiler/xla/service/computation_tracker.cc
+++ b/tensorflow/compiler/xla/service/computation_tracker.cc
@@ -26,8 +26,11 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 
+using ::tensorflow::strings::Appendf;
+
 namespace xla {
 
 ComputationTracker::ComputationTracker() : next_computation_(1) {}
@@ -50,12 +53,28 @@ StatusOr<ComputationHandle> ComputationTracker::LoadSessionModule(
   // For each embedded computation, create a new computation based on its
   // serialized data, and place the mapping from the old computation handle to
   // the new computation handle.
+
+  // Build a mapping from old embedded computation handles to new computation
+  // handles. We build the ID mapping first since the embedded computations are
+  // in no particular order and may refer to each other.
   std::map<int64, ComputationHandle> old_to_new;
   for (const SessionComputation& computation :
        session_module.embedded_computations()) {
     const int64 old_handle = computation.computation_handle().handle();
-    TF_ASSIGN_OR_RETURN(old_to_new[old_handle],
-                        LoadSessionComputation(computation, &old_to_new));
+    if (!old_to_new.emplace(old_handle, AllocateHandle()).second) {
+      return InvalidArgument("Duplicate embedded computation handle %lld",
+                             old_handle);
+    }
+  }
+
+  // Create a new computation from each serialized embedded computation.
+ for (const SessionComputation& computation : + session_module.embedded_computations()) { + const int64 old_handle = computation.computation_handle().handle(); + const ComputationHandle& new_handle = old_to_new[old_handle]; + TF_ASSIGN_OR_RETURN(opaque_to_computation_[new_handle.handle()], + UserComputation::MakeWithRemapping( + computation, new_handle, old_to_new)); } // Finally, place the entry computation in the tracker with all of the @@ -130,7 +149,7 @@ void ComputationTracker::ComputeComputationPostOrder( std::set* visited, std::list* post_order) const { if (visited->count(versioned_handle) > 0) { - DCHECK_EQ(1, visited->count(versioned_handle)); + CHECK_EQ(1, visited->count(versioned_handle)); return; } @@ -145,14 +164,19 @@ void ComputationTracker::ComputeComputationPostOrder( visited->insert(versioned_handle); post_order->push_back(versioned_handle); - return; } StatusOr> ComputationTracker::BuildHloModule( const VersionedComputationHandle& entry_handle, - bool include_unused_parameters) const { + const HloModuleConfig& config, + bool include_unreachable_instructions) const { tensorflow::mutex_lock lock(computation_mutex_); + VLOG(1) << "BuildHloModule(" << entry_handle + << ", include_unreachable_instructions=" + << include_unreachable_instructions << ")"; + XLA_VLOG_LINES(1, ToStringInternal()); + TF_ASSIGN_OR_RETURN(UserComputation * entry_computation, ResolveInternal(entry_handle.handle)); @@ -174,9 +198,17 @@ StatusOr> ComputationTracker::BuildHloModule( return hlo_computations.at(versioned_handle); }; + // Print the post-order list for this entry computation. + if (VLOG_IS_ON(2)) { + VLOG(2) << "Visiting UserComputations in post order:"; + for (const VersionedComputationHandle& versioned_handle : post_order) { + VLOG(2) << " " << versioned_handle; + } + } + string module_name = tensorflow::strings::StrCat(entry_computation->name(), "_module"); - auto module = MakeUnique(module_name, entry_handle); + auto module = MakeUnique(module_name, entry_handle, config); for (auto versioned_handle : post_order) { UserComputation* computation = ResolveInternal(versioned_handle.handle).ValueOrDie(); @@ -184,7 +216,7 @@ StatusOr> ComputationTracker::BuildHloModule( TF_ASSIGN_OR_RETURN( std::unique_ptr hlo_computation, computation->BuildHloComputation(versioned_handle.version, resolver, - include_unused_parameters)); + include_unreachable_instructions)); // Add the newly created computation to VersionedHandle-to-HloComputation // map. @@ -201,4 +233,23 @@ StatusOr> ComputationTracker::BuildHloModule( return std::move(module); } +string ComputationTracker::ToString() const { + tensorflow::mutex_lock lock(computation_mutex_); + return ToStringInternal(); +} + +string ComputationTracker::ToStringInternal() const { + string out; + Appendf(&out, "ComputationTracker(%p):\n", this); + for (const auto& handle_computation : opaque_to_computation_) { + int64 handle = handle_computation.first; + const std::unique_ptr& computation = + handle_computation.second; + Appendf(&out, " %4lld : %s \"%s\"\n", handle, + computation->GetVersionedHandle().ToString().c_str(), + computation->name().c_str()); + } + return out; +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/computation_tracker.h b/tensorflow/compiler/xla/service/computation_tracker.h index 7d0660d7f6d..d42d66adefe 100644 --- a/tensorflow/compiler/xla/service/computation_tracker.h +++ b/tensorflow/compiler/xla/service/computation_tracker.h @@ -23,6 +23,7 @@ limitations under the License. 
#include #include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_module_config.h" #include "tensorflow/compiler/xla/service/session.pb.h" #include "tensorflow/compiler/xla/service/user_computation.h" #include "tensorflow/compiler/xla/service/versioned_computation_handle.h" @@ -72,13 +73,18 @@ class ComputationTracker { // Builds an HLO module using the specified computation as the entry. The // module will include the entry computation as well as all computations which // are called directly or indirectly from the entry computation via operations - // like "map". If include_unused_parameters is true, then all parameters are - // lowered to HLO instructions even if they are not used. This ensures the - // entry HloComputation has the same program shape (ProgramShape) as the entry - // UserComputation. + // like "map". config is the HLO module configuration to use for the + // constructed module. + // If include_unreachable_instructions is true, then instructions + // which are not reachable from the root are lowered into HloInstructions + // including unreachable parameters. This ensures the entry HloComputation has + // the same program shape (ProgramShape) as the entry UserComputation. StatusOr> BuildHloModule( const VersionedComputationHandle& entry_handle, - bool include_unused_parameters = true) const; + const HloModuleConfig& config, + bool include_unreachable_instructions = true) const; + + string ToString() const; private: // Bumps the next_computation_ number and returns the allocated number wrapped @@ -117,6 +123,8 @@ class ComputationTracker { std::list* post_order) const EXCLUSIVE_LOCKS_REQUIRED(computation_mutex_); + string ToStringInternal() const EXCLUSIVE_LOCKS_REQUIRED(computation_mutex_); + // Guards the computation mapping. Marked mutable so that the Resolve method // can remain const; Resolve does't really modify the tracker in any way, but // it has to lock the mutex for safety. diff --git a/tensorflow/compiler/xla/service/copy_insertion.cc b/tensorflow/compiler/xla/service/copy_insertion.cc index 81f54c26ec5..a3803c34ba7 100644 --- a/tensorflow/compiler/xla/service/copy_insertion.cc +++ b/tensorflow/compiler/xla/service/copy_insertion.cc @@ -16,19 +16,20 @@ limitations under the License. #include "tensorflow/compiler/xla/service/copy_insertion.h" #include -#include -#include #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/liveness_util.h" #include "tensorflow/compiler/xla/service/logical_buffer.h" #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/lib/gtl/flatmap.h" +#include "tensorflow/core/lib/gtl/flatset.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/platform/logging.h" @@ -36,6 +37,9 @@ namespace xla { namespace { +using tensorflow::gtl::FlatMap; +using tensorflow::gtl::FlatSet; + // InstructionCopier encapsulates indices at which to copy 'instruction'. // All 'instruction' users in 'copy_users' are updated to use the copy. 
// @@ -52,7 +56,7 @@ namespace { // // Example two-element tuple with one element that needs a copy: // -// Tuple // instruction +// original-instruction // / \ // GTE(0) GTE(1) // | | @@ -60,23 +64,54 @@ namespace { // \ / // Tuple // copied-instruction // +// As an optimization, if the original instruction is itself a Tuple +// instruction, we elide the unnecessary extra GTE and Tuple instructions, +// and just insert the copy into a new Tuple instruction, with control +// dependencies to ensure the copy occurs after any possible interference. class InstructionCopier { public: - InstructionCopier(const bool init_value, HloInstruction* instruction, - const std::vector& copy_users); + InstructionCopier(HloInstruction* instruction, + const std::vector& copy_users) + : instruction_(instruction), + copy_users_(copy_users), + indices_to_copy_(instruction->shape()), + control_predecessors_(instruction->shape()) {} + + // Sets indices that are read-only, and thus do not need to be copied. + void SetReadOnlyIndices(const ShapeTree& read_only_indices) { + read_only_indices_ = read_only_indices; + } + + // Sets copy overrides, which are copy instructions to use at each index. This + // is used to share a single copy of read-only entry parameters and constants + // between multiple While loops. + void SetCopyOverrides(const ShapeTree& copy_overrides) { + copy_overrides_ = copy_overrides; + } // Returns true if all recorded indices are false (returns true otherwise). bool HasAllIndicesFalse() const; // Records instruction buffer indices which point-to a Parameter or Constant. - tensorflow::Status RecordIndicesWhichPointToParamOrConstant( + Status RecordIndicesWhichPointToParamOrConstant( const TuplePointsToAnalysis& points_to_analysis); // Records instruction buffer indices to copy which are necessary to ensure: // *) PointsToSet of 'instruction_' is unambiguous and distinct. // *) No liveness interference between 'instruction_' and 'other_instruction'. - tensorflow::Status RecordIndicesToCopyForColocatingBuffers( - BufferLiveness* liveness, HloInstruction* other_instruction); + // + // If 'read_only_indices_out' is non-null, read-only indices are set to true. + Status RecordIndicesToCopyForColocatingBuffers( + const BufferLiveness& liveness, const HloInstruction* other_instruction, + ShapeTree* read_only_indices_out); + + // Records control predecessors to add for inserted copy instructions. + // 'parameter' must have the same shape as the instruction that will be + // copied, and must define all buffers in the shape. Control predecessors are + // only recorded for indices that have already been marked for copying. + Status RecordControlPredecessors( + const TuplePointsToAnalysis& points_to_analysis, + HloInstruction* parameter); // Inserts copies of 'instruction' buffers at indices in 'indices_to_copy', // and replaces all uses for instructions in 'copy_users_' with copy. @@ -88,15 +123,29 @@ class InstructionCopier { const std::vector& copy_users() const { return copy_users_; } private: + // Does the given index represent a read-only buffer? + bool IsReadOnlyIndex(const ShapeIndex& index) const { + return !ShapeUtil::IsNil(read_only_indices_.shape()) && + read_only_indices_.element(index); + } + + // Returns the copy override at the given index, or nullptr. + HloInstruction* GetCopyOverride(const ShapeIndex& index) const { + return ShapeUtil::IsNil(copy_overrides_.shape()) + ? 
nullptr + : copy_overrides_.element(index); + } + // Records instruction buffer indices which have ambiguous or non-distinct // points-to sets. - tensorflow::Status RecordAmbiguousOrNonDistinctIndices( + Status RecordAmbiguousOrNonDistinctIndices( const TuplePointsToAnalysis& points_to_analysis); - // Records instruction buffer indices which have interferring live ranges + // Records instruction buffer indices which have interfering live ranges // with 'other_instruction' buffers at same index. - tensorflow::Status RecordIndicesWhichInterfereWithOtherInstruction( - BufferLiveness* liveness, HloInstruction* other_instruction); + Status RecordIndicesWhichInterfereWithOtherInstruction( + const BufferLiveness& liveness, const HloInstruction* other_instruction, + ShapeTree* read_only_indices_out); // Recursively inserts copies of 'instruction' tuple elements at indices // specified in 'indices_to_copy', and returns the copy of 'instruction'. @@ -107,28 +156,25 @@ class InstructionCopier { } HloInstruction* instruction_; - std::vector copy_users_; + const std::vector copy_users_; ShapeTree indices_to_copy_; + ShapeTree> control_predecessors_; + ShapeTree read_only_indices_; + ShapeTree copy_overrides_; }; -InstructionCopier::InstructionCopier( - const bool init_value, HloInstruction* instruction, - const std::vector& copy_users) - : instruction_(instruction), - copy_users_(copy_users), - indices_to_copy_(instruction->shape(), init_value) {} - bool InstructionCopier::HasAllIndicesFalse() const { bool all_indices_false = true; - TF_CHECK_OK(indices_to_copy_.ForEachElement([&all_indices_false]( - const ShapeIndex& /*index*/, bool /*is_leaf*/, const bool& data) { - if (data) all_indices_false = false; - return tensorflow::Status::OK(); - })); + indices_to_copy_.ForEachElement( + [&all_indices_false](const ShapeIndex& /*index*/, bool data) { + if (data) { + all_indices_false = false; + } + }); return all_indices_false; } -tensorflow::Status InstructionCopier::RecordIndicesWhichPointToParamOrConstant( +Status InstructionCopier::RecordIndicesWhichPointToParamOrConstant( const TuplePointsToAnalysis& points_to_analysis) { const PointsToSet& points_to = points_to_analysis.GetPointsToSet(instruction_); @@ -141,72 +187,73 @@ tensorflow::Status InstructionCopier::RecordIndicesWhichPointToParamOrConstant( // Multiple buffers within a parameter/constant may be live out, so collect // a set of indices at which to copy first. - TF_RETURN_IF_ERROR(points_to.ForEachElement([this]( - const ShapeIndex& index, bool /*is_leaf*/, - const std::vector& buffers) { - for (auto buffer : buffers) { - // pointee is the HloInstruction producing the buffer which may be - // liveout. - HloInstruction* pointee = buffer->instruction(); - if (pointee->opcode() == HloOpcode::kParameter || - pointee->opcode() == HloOpcode::kConstant) { - VLOG(2) << "Parameter or constant buffer " << buffer->ToString() - << " index: " << tensorflow::str_util::Join(index, ",") - << " may be live out of computation: " << pointee->ToString(); - RecordIndex(index); - } - } - return tensorflow::Status::OK(); - })); - return tensorflow::Status::OK(); + points_to.ForEachElement( + [this](const ShapeIndex& index, + const std::vector& buffers) { + if (IsReadOnlyIndex(index)) { + return; + } + for (const LogicalBuffer* buffer : buffers) { + // pointee is the HloInstruction producing the buffer which may be + // liveout. 
+ HloInstruction* pointee = buffer->instruction(); + if (pointee->opcode() == HloOpcode::kParameter || + pointee->opcode() == HloOpcode::kConstant) { + VLOG(2) << "Parameter or constant buffer " << buffer->ToString() + << " index: " << tensorflow::str_util::Join(index, ",") + << " may be live out of computation: " + << pointee->ToString(); + RecordIndex(index); + break; + } + } + }); + return Status::OK(); } -tensorflow::Status InstructionCopier::RecordIndicesToCopyForColocatingBuffers( - BufferLiveness* liveness, HloInstruction* other_instruction) { +Status InstructionCopier::RecordIndicesToCopyForColocatingBuffers( + const BufferLiveness& liveness, const HloInstruction* other_instruction, + ShapeTree* read_only_indices_out) { TF_RETURN_IF_ERROR( - RecordAmbiguousOrNonDistinctIndices(liveness->points_to_analysis())); + RecordAmbiguousOrNonDistinctIndices(liveness.points_to_analysis())); TF_RETURN_IF_ERROR(RecordIndicesWhichInterfereWithOtherInstruction( - liveness, other_instruction)); - return tensorflow::Status::OK(); + liveness, other_instruction, read_only_indices_out)); + return Status::OK(); } -tensorflow::Status InstructionCopier::RecordAmbiguousOrNonDistinctIndices( +Status InstructionCopier::RecordAmbiguousOrNonDistinctIndices( const TuplePointsToAnalysis& points_to_analysis) { const PointsToSet& points_to = points_to_analysis.GetPointsToSet(instruction_); // Mapping from LogicalBuffer to index (used to detect non-distinct indices). - std::unordered_map> + FlatMap> buffer_to_source_indices; - TF_RETURN_IF_ERROR(points_to.ForEachElement([this, &buffer_to_source_indices]( - const ShapeIndex& index, bool /*is_leaf*/, - const std::vector& buffers) { - if (buffers.size() > 1) { - // Record ambiguous points-to set at 'index'. - if (!indices_to_copy_.element(index)) { - VLOG(2) << "Adding copy of buffer for instruction: " - << instruction_->name() - << " at index: " << tensorflow::str_util::Join(index, ",") - << " with ambiguous points-to set."; - RecordIndex(index); - } - } - // For each 'buffer': record a mapping from 'buffer' to 'index'. - for (auto& buffer : buffers) { - auto it = buffer_to_source_indices.find(buffer); - if (it == buffer_to_source_indices.end()) { - buffer_to_source_indices.insert({buffer, std::vector()}); - } - buffer_to_source_indices[buffer].push_back(index); - } - return tensorflow::Status::OK(); - })); + points_to.ForEachElement( + [this, &buffer_to_source_indices]( + const ShapeIndex& index, + const std::vector& buffers) { + if (buffers.size() > 1) { + // Record ambiguous points-to set at 'index'. + if (!indices_to_copy_.element(index)) { + VLOG(2) << "Adding copy of buffer for instruction: " + << instruction_->name() + << " at index: " << tensorflow::str_util::Join(index, ",") + << " with ambiguous points-to set."; + RecordIndex(index); + } + } + // For each 'buffer': record a mapping from 'buffer' to 'index'. + for (const LogicalBuffer* buffer : buffers) { + buffer_to_source_indices[buffer].push_back(index); + } + }); // Record all non-distinct indices detected in 'buffer_to_source_indices'. - for (auto& buff_to_src : buffer_to_source_indices) { + for (const auto& buff_to_src : buffer_to_source_indices) { if (buff_to_src.second.size() == 1) { continue; } - for (auto& src_index : buff_to_src.second) { + for (const ShapeIndex& src_index : buff_to_src.second) { // Record non-distinct points-to set at 'src_index'. 
if (!indices_to_copy_.element(src_index)) { VLOG(2) << "Adding copy of buffer for instruction: " @@ -217,23 +264,26 @@ tensorflow::Status InstructionCopier::RecordAmbiguousOrNonDistinctIndices( } } } - return tensorflow::Status::OK(); + return Status::OK(); } -tensorflow::Status -InstructionCopier::RecordIndicesWhichInterfereWithOtherInstruction( - BufferLiveness* liveness, HloInstruction* other_instruction) { +Status InstructionCopier::RecordIndicesWhichInterfereWithOtherInstruction( + const BufferLiveness& liveness, const HloInstruction* other_instruction, + ShapeTree* read_only_indices_out) { // Record all buffer indices for 'instruction_', which interfere with // 'other_instruction' at the same index. - TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshape( + ShapeUtil::ForEachSubshape( instruction_->shape(), - [this, &liveness, &other_instruction](const Shape& /*subshape*/, - const ShapeIndex& index) { + [this, &liveness, other_instruction, read_only_indices_out]( + const Shape& /*subshape*/, const ShapeIndex& index) { + if (IsReadOnlyIndex(index)) { + return; + } if (indices_to_copy_.element(index)) { // Return if previous pass already set index. - return tensorflow::Status::OK(); + return; } - auto& points_to_analysis = liveness->points_to_analysis(); + const auto& points_to_analysis = liveness.points_to_analysis(); // Lookup buffers for 'instruction_' and 'other_instruction'. const std::vector instruction_buffers = points_to_analysis.GetPointsToSet(instruction_).element(index); @@ -252,20 +302,24 @@ InstructionCopier::RecordIndicesWhichInterfereWithOtherInstruction( // then that buffer is not updated on the path between the two // instructions. Therefore, any other (possibly interference-causing) // users of that buffer from 'other_instruction' will see the same data, - // irrespecive of whether we insert a copy of this buffer at + // irrespective of whether we insert a copy of this buffer at // 'instruction_' or not. if (other_instruction_buffers.size() == 1 && other_instruction_buffers[0]->id() == instruction_buffer->id()) { - return tensorflow::Status::OK(); + if (read_only_indices_out != nullptr) { + *read_only_indices_out->mutable_element(index) = true; + } + return; } - // We cant say anything about the ambiguity of 'other_instruction' at + // We can't say anything about the ambiguity of 'other_instruction' at // this point, so we need to check interference between the single // buffer in the points-to set of 'instruction_' and all buffers in // 'other_instruction_buffers'. - for (auto& other_buffer : other_instruction_buffers) { - if (liveness->MayInterfere(*instruction_buffer, *other_buffer)) { + for (const LogicalBuffer* other_buffer : other_instruction_buffers) { + if (liveness.MayInterfere(*instruction_buffer, *other_buffer)) { VLOG(2) << "Adding copy of buffer for instruction: " << instruction_->name() + << " instruction_buffer: " << instruction_buffer->ToString() << " at index: " << tensorflow::str_util::Join(index, ",") << " because of interference with buffer: " << other_buffer->ToString(); @@ -273,40 +327,88 @@ InstructionCopier::RecordIndicesWhichInterfereWithOtherInstruction( break; } } - return tensorflow::Status::OK(); - })); - return tensorflow::Status::OK(); + }); + return Status::OK(); +} + +// This is called when 'instruction_' is a while body root, and 'parameter' is +// the while body parameter. We record all users of all aliases of 'parameter' +// as control predecessors, so that when we add a copy of 'instruction_', we can +// mark the control dependencies. 
This is necessary because points-to and +// liveness analysis doesn't know about the aliasing between the while body root +// and param. Without these control dependencies, the copy might get scheduled +// to run at a point that interferes with users of the buffer. +Status InstructionCopier::RecordControlPredecessors( + const TuplePointsToAnalysis& points_to_analysis, + HloInstruction* parameter) { + return indices_to_copy_.ForEachElementWithStatus( + [this, &points_to_analysis, parameter](const ShapeIndex& index, + bool will_copy) { + if (will_copy) { + TF_ASSIGN_OR_RETURN( + const LogicalBuffer* buffer, + points_to_analysis.GetBufferDefinedAt(parameter, index)); + for (const BufferAlias& alias : + points_to_analysis.GetBufferAliases(*buffer)) { + for (HloInstruction* user : alias.instruction()->users()) { + if (DoesNotUseOperandBuffer(alias.instruction(), alias.index(), + user, points_to_analysis)) { + continue; + } + + if (user != instruction_) { + control_predecessors_.mutable_element(index)->push_back(user); + } + } + } + } + return Status::OK(); + }); } // Recursively inserts copies of 'instruction' tuple element buffers at // indices in 'indices_to_copy_', expanding tuples as needed. -// TODO(b/31159897) Remove superfluous Tuple->GTE->Tuple expressions. HloInstruction* InstructionCopier::CopyTuple(HloInstruction* instruction, ShapeIndex* index) { - std::vector element_copies; const int64 num_tuple_elements = ShapeUtil::TupleElementCount(instruction->shape()); + std::vector elem_copies(num_tuple_elements); for (int64 i = 0; i < num_tuple_elements; ++i) { - HloInstruction* gte = instruction->parent()->AddInstruction( - HloInstruction::CreateGetTupleElement( - ShapeUtil::GetSubshape(instruction->shape(), {i}), instruction, i)); - HloInstruction* element_copy; - index->push_back(i); - if (ShapeUtil::IsTuple(gte->shape())) { - element_copy = CopyTuple(gte, index); + HloInstruction* elem; + if (instruction->opcode() == HloOpcode::kTuple) { + // If the instruction is already a Tuple instruction, we know that the + // element buffers are aliased, so we can just grab the operand directly. + elem = instruction->mutable_operand(i); } else { - if (indices_to_copy_.element(*index)) { - element_copy = gte->parent()->AddInstruction( - HloInstruction::CreateUnary(gte->shape(), HloOpcode::kCopy, gte)); - } else { - element_copy = gte; + // Otherwise we need to add a GTE to unpack the element out of the tuple. 
+ elem = instruction->parent()->AddInstruction( + HloInstruction::CreateGetTupleElement( + ShapeUtil::GetSubshape(instruction->shape(), {i}), instruction, + i)); + } + index->push_back(i); + if (ShapeUtil::IsTuple(elem->shape())) { + elem_copies[i] = CopyTuple(elem, index); + } else if (!indices_to_copy_.element(*index)) { + elem_copies[i] = elem; + } else if (HloInstruction* copy_override = GetCopyOverride(*index)) { + elem_copies[i] = copy_override; + } else { + HloInstruction* elem_copy = elem->parent()->AddInstruction( + HloInstruction::CreateUnary(elem->shape(), HloOpcode::kCopy, elem)); + for (HloInstruction* control_predecessor : + control_predecessors_.element(*index)) { + VLOG(2) << "Adding control dependency from " + << control_predecessor->ToString() << " to " + << elem_copy->ToString(); + TF_CHECK_OK(control_predecessor->AddControlDependencyTo(elem_copy)); } + elem_copies[i] = elem_copy; } index->pop_back(); - element_copies.push_back(element_copy); } return instruction->parent()->AddInstruction( - HloInstruction::CreateTuple(element_copies)); + HloInstruction::CreateTuple(elem_copies)); } // Inserts copies of 'instruction_' buffers at indices in 'indices_to_copy_'. @@ -327,8 +429,88 @@ HloInstruction* InstructionCopier::Copy() { return copy; } +// The 'read_only_indices' are initialized based on points-to analysis on the +// while body corresponding to 'while_hlo'. If the init buffer corresponding to +// a read-only index aliases with a constant, it cannot be considered read-only, +// and must be copied. This is necessary because BufferAssignment does not +// currently assign an allocation for constants (b/32248867). +// This function performs this fix-up of 'read_only_indices'. +// +// Returns a ShapeTree of copy_overrides, which implements an optimization to +// allow multiple while loops that share the same read-only constants to +// share a single copy. +StatusOr> RevertReadOnlyIndicesForConstants( + const HloInstruction* while_hlo, + const TuplePointsToAnalysis& points_to_analysis, + ShapeTree* read_only_indices, + FlatMap* shared_copies) { + const HloInstruction* init_hlo = while_hlo->operand(0); + const PointsToSet& points_to = points_to_analysis.GetPointsToSet(init_hlo); + + // Mapping from LogicalBuffer to index (used to detect non-distinct indices). + FlatSet buffer_set; + + ShapeTree copy_overrides(init_hlo->shape()); + points_to.ForEachElement( + [init_hlo, read_only_indices, shared_copies, &buffer_set, + ©_overrides](const ShapeIndex& index, + const std::vector& buffers) { + // Look for read-only entry parameters. + if (!read_only_indices->element(index)) { + return; + } + for (const LogicalBuffer* buffer : buffers) { + HloInstruction* pointee = buffer->instruction(); + const bool is_constant = pointee->opcode() == HloOpcode::kConstant; + if (!is_constant) { + continue; + } + + // We have found an constant that is read-only in + // the while body. These buffers are managed by the caller, and cannot + // be aliased with HLO buffers. Revert this read-only index, + // to allow it to be copied. + *read_only_indices->mutable_element(index) = false; + + // Optimization to allow multiple while loops that share the same + // read-only entry constants to share a single copy. + // Only unambiguous and distinct array-shaped buffers are allowed, to + // reduce code complexity. 
The shape of the entry parameter must be + // identical to the shape of the init_hlo at this index, to ensure + // there were no intervening bitcast or GTE instructions, which are + // also hard to handle. + const Shape& pointee_shape = pointee->shape(); + const Shape& init_shape = + ShapeUtil::GetSubshape(init_hlo->shape(), index); + if (buffers.size() == 1 && ShapeUtil::IsArray(pointee_shape) && + ShapeUtil::Equal(pointee_shape, init_shape) && + buffer_set.count(buffer) < 1) { + HloInstruction** copy = &(*shared_copies)[pointee]; + if (*copy == nullptr) { + *copy = + pointee->parent()->AddInstruction(HloInstruction::CreateUnary( + pointee_shape, HloOpcode::kCopy, pointee)); + } + // Add the copy as an override. + *copy_overrides.mutable_element(index) = *copy; + } + + // Tracks whether this current buffer is distinct. + buffer_set.insert(buffer); + + // We've already reverted the read-only index and handled the + // single-copy optimization above, so there's nothing more to do. + break; + } + }); + return copy_overrides; +} + } // anonymous namespace +// NOTE: This is only called by gpu::CopyInsertion. It's not called here in the +// base class, since the regular CopyInsertion logic above selectively copies +// tuple elements, while this method assumes all buffers need to be deep copied. StatusOr CopyInsertion::FindOrInsertCopy(HloInstruction* hlo) { auto copy_it = inserted_copies_.find(hlo); if (copy_it == inserted_copies_.end()) { @@ -347,85 +529,96 @@ StatusOr CopyInsertion::Run(HloModule* module) { TF_ASSIGN_OR_RETURN( std::unique_ptr liveness, BufferLiveness::Run(module, MakeUnique(module))); - auto& points_to_analysis = liveness->points_to_analysis(); + const auto& points_to_analysis = liveness->points_to_analysis(); XLA_VLOG_LINES(2, points_to_analysis.ToString()); XLA_VLOG_LINES(2, module->ToString()); - // Gather references to all while body computations in 'module'. - std::unordered_set while_body_computations; - // Gather references to all while instructions in 'module' by computation. - std::unordered_map> - while_instructions; + // Gather all while body computations and while instructions. + FlatSet while_body_computations; + std::vector while_instructions; for (auto& computation : module->computations()) { for (auto& instruction : computation->instructions()) { - if (instruction->opcode() != HloOpcode::kWhile) { - continue; + if (instruction->opcode() == HloOpcode::kWhile) { + while_body_computations.insert(instruction->while_body()); + while_instructions.push_back(instruction.get()); } - while_body_computations.insert(instruction->while_body()); - auto it = while_instructions.find(computation.get()); - if (it == while_instructions.end()) { - while_instructions.insert( - {computation.get(), std::vector()}); - } - while_instructions[computation.get()].emplace_back(instruction.get()); } } + // Collect instruction buffer indices to copy in 'instructions_to_copy'. + std::vector instructions_to_copy; + + // Add copies of computation root instructions, if needed. + FlatMap> while_body_read_only_indices; for (auto& computation : module->computations()) { VLOG(2) << "computation " << computation->name(); - - // Collect instruction buffer indices to copy in 'instructions_to_copy'. - std::vector instructions_to_copy; - - // Add copies of while 'init' operand instructions (if needed). - // TODO(b/33301720) Remove redundant while instruction copies. 
- auto it = while_instructions.find(computation.get()); - if (it != while_instructions.end()) { - for (auto& while_hlo : it->second) { - // Create InstructionCopier for init operand of while instruction. - HloInstruction* init_hlo = while_hlo->mutable_operand(0); - instructions_to_copy.push_back( - InstructionCopier(/*init_value=*/false, init_hlo, {while_hlo})); - InstructionCopier& init_copier = instructions_to_copy.back(); - // Record 'init' buffer indices which point-to a Constant or Parameter. - TF_RETURN_IF_ERROR(init_copier.RecordIndicesWhichPointToParamOrConstant( - liveness->points_to_analysis())); - // Record indices necessary to colocate while and init operand buffers. - TF_RETURN_IF_ERROR(init_copier.RecordIndicesToCopyForColocatingBuffers( - liveness.get(), while_hlo)); - } - } - - // Create InstructionCopier for computation root instruction. - instructions_to_copy.push_back(InstructionCopier( - /*init_value=*/false, computation->root_instruction(), {})); - InstructionCopier& root_copier = instructions_to_copy.back(); - + InstructionCopier root_copier(computation->root_instruction(), + /*copy_users=*/{}); if (while_body_computations.count(computation.get()) > 0) { - // Record root indices to copy for while body sub-computations. - // We do not need to call RecordIndicesWhichPointToParamOrConstant for - // the while root instruction here, because any neccessary copies needed - // to avoid constant or parameters in the output are handled by while.init - // operand copy insertion above (which will share an allocation). + // Record root indices to copy for while body sub-computations. We do not + // need to call RecordIndicesWhichPointToParamOrConstant for the while + // body root instruction here, because any necessary copies needed to + // avoid constants or parameters in the output are handled by while.init + // operand copy insertion below (which will share an allocation). + HloInstruction* while_body_param = computation->parameter_instruction(0); + ShapeTree read_only_indices(while_body_param->shape()); TF_RETURN_IF_ERROR(root_copier.RecordIndicesToCopyForColocatingBuffers( - liveness.get(), computation->parameter_instruction(0))); + *liveness, while_body_param, &read_only_indices)); + while_body_read_only_indices[computation.get()] = read_only_indices; + + // Mark control predecessors, based on the body param, for any copies + // we'll be inserting. This ensures the copy doesn't run too early. + TF_RETURN_IF_ERROR(root_copier.RecordControlPredecessors( + points_to_analysis, while_body_param)); } else { // Record root indices to copy for general computations. TF_RETURN_IF_ERROR(root_copier.RecordIndicesWhichPointToParamOrConstant( - liveness->points_to_analysis())); + points_to_analysis)); } + instructions_to_copy.push_back(root_copier); + } - for (auto& to_copy : instructions_to_copy) { - if (to_copy.HasAllIndicesFalse()) { - continue; - } - changed = true; + // Add copies of while 'init' operand instructions, if needed. 'shared_copies' + // is used to ensure that multiple while loops can share a single copy of the + // same entry parameter or constant, if all loops use it read-only. + // + // TODO(b/33301720) Remove redundant while instruction copies. + FlatMap shared_copies; + for (HloInstruction* while_hlo : while_instructions) { + // Fix read_only_indices to account for entry constants. Also + // initialize copy_overrides, which ensures a single copy for each read-only + // constant that is used in multiple while loops. 
+    ShapeTree<bool>* read_only_indices =
+        &while_body_read_only_indices[while_hlo->while_body()];
+    TF_ASSIGN_OR_RETURN(
+        const ShapeTree<HloInstruction*> copy_overrides,
+        RevertReadOnlyIndicesForConstants(while_hlo, points_to_analysis,
+                                          read_only_indices, &shared_copies));
+    // Create InstructionCopier for init operand of while instruction.
+    HloInstruction* init_hlo = while_hlo->mutable_operand(0);
+    InstructionCopier init_copier(init_hlo, {while_hlo});
+    init_copier.SetReadOnlyIndices(*read_only_indices);
+    init_copier.SetCopyOverrides(copy_overrides);
+    // Record 'init' buffer indices which point-to a Constant or Parameter.
+    TF_RETURN_IF_ERROR(init_copier.RecordIndicesWhichPointToParamOrConstant(
+        points_to_analysis));
+    // Record indices necessary to colocate while and init operand buffers.
+    TF_RETURN_IF_ERROR(init_copier.RecordIndicesToCopyForColocatingBuffers(
+        *liveness, while_hlo, /*read_only_indices_out=*/nullptr));
+    instructions_to_copy.push_back(init_copier);
+  }
 
-      // Copy instruction at recorded buffer indices.
-      HloInstruction* copy = to_copy.Copy();
-      if (to_copy.instruction() == computation->root_instruction()) {
-        computation->set_root_instruction(copy);
-      }
+  for (InstructionCopier& to_copy : instructions_to_copy) {
+    if (to_copy.HasAllIndicesFalse()) {
+      continue;
+    }
+    changed = true;
+
+    // Copy instruction at recorded buffer indices.
+    HloComputation* computation = to_copy.instruction()->parent();
+    HloInstruction* copy = to_copy.Copy();
+    if (to_copy.instruction() == computation->root_instruction()) {
+      computation->set_root_instruction(copy);
     }
   }
diff --git a/tensorflow/compiler/xla/service/copy_insertion.h b/tensorflow/compiler/xla/service/copy_insertion.h
index 5bf6f2501b1..28bb62e40c7 100644
--- a/tensorflow/compiler/xla/service/copy_insertion.h
+++ b/tensorflow/compiler/xla/service/copy_insertion.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
 
 namespace xla {
 
@@ -32,7 +33,6 @@ namespace xla {
 // different lifetimes than computation results.
 class CopyInsertion : public HloPassInterface {
  public:
-  ~CopyInsertion() override {}
   tensorflow::StringPiece name() const override { return "copy-insertion"; }
 
   // Run the pass on the given module. Returns whether the module was changed
@@ -46,7 +46,7 @@ class CopyInsertion : public HloPassInterface {
 
   // A map containing all copies inserted during the copy insertion pass. The
   // key is the copied instruction and the value is the copy.
-  std::unordered_map<HloInstruction*, HloInstruction*> inserted_copies_;
+  tensorflow::gtl::FlatMap<HloInstruction*, HloInstruction*> inserted_copies_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/copy_insertion_test.cc b/tensorflow/compiler/xla/service/copy_insertion_test.cc
index e64da58dc79..cc77339bb63 100644
--- a/tensorflow/compiler/xla/service/copy_insertion_test.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion_test.cc
@@ -20,18 +20,23 @@ limitations under the License.
#include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_matchers.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h" #include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/compiler/xla/test_helpers.h" +namespace op = xla::testing::opcode_matchers; namespace xla { namespace { +using ::testing::UnorderedElementsAre; + class CopyInsertionTest : public HloTestBase { protected: void InsertCopies(HloModule* module) { @@ -39,59 +44,27 @@ class CopyInsertionTest : public HloTestBase { EXPECT_IS_OK(copy_insertion.Run(module).status()); // Verify the points to set of the root of the computation after copy - // insertion contains no constants or parameters. + // insertion contains no constants or parameters, and is distinct and + // non-ambiguous. auto points_to_analysis = TuplePointsToAnalysis::Run(module).ConsumeValueOrDie(); - const std::set maybe_live_out_buffers = + const auto& points_to = points_to_analysis->GetPointsToSet( + module->entry_computation()->root_instruction()); + EXPECT_TRUE(points_to.IsDistinct()); + EXPECT_TRUE(!points_to.IsAmbiguous()); + + tensorflow::gtl::FlatSet maybe_live_out_buffers = points_to_analysis ->GetPointsToSet(module->entry_computation()->root_instruction()) .CreateFlattenedSet(); + for (const LogicalBuffer* buffer : maybe_live_out_buffers) { EXPECT_NE(buffer->instruction()->opcode(), HloOpcode::kConstant); EXPECT_NE(buffer->instruction()->opcode(), HloOpcode::kParameter); } } - - // OperandTree is a test helper class that simplifies the expression of - // an expected tree of operands (starting at some root instruction) in a - // unit test. - // Each HLO instruction is represented as a node in the OperandTree. - struct OperandTree { - // The expected opcode for this OperandTree node. - HloOpcode opcode; - // The set of operands expected for this OperandTree node. - std::vector operands; - // If non-null, a pointer to the expected HloInstruction at this node. - const HloInstruction* instruction = nullptr; - - // Returns a mutable reference to operand 'i' of this node. - OperandTree& op(int i) { - if (i >= operands.size()) { - operands.resize(i + 1); - } - return operands[i]; - } - - // Check that 'instruction' and its operands match expected values recorded - // in OperandTree. - void Check(const HloInstruction* instruction) { - EXPECT_EQ(opcode, instruction->opcode()); - if (instruction != nullptr) { - EXPECT_EQ(instruction, instruction); - } - if (operands.empty()) { - return; - } - EXPECT_EQ(operands.size(), instruction->operand_count()); - for (int i = 0; i < instruction->operand_count(); ++i) { - operands[i].Check(instruction->operand(i)); - } - } - }; }; -#define EXPECT_INST(A, E...) 
EXPECT_EQ(A, (std::set{E})) - TEST_F(CopyInsertionTest, SingleParameter) { auto builder = HloComputation::Builder(TestName()); HloInstruction* x = builder.AddInstruction( @@ -99,25 +72,16 @@ TEST_F(CopyInsertionTest, SingleParameter) { HloInstruction* tuple = builder.AddInstruction(HloInstruction::CreateTuple({x})); - EXPECT_INST(x->users(), tuple); + EXPECT_THAT(x->users(), UnorderedElementsAre(tuple)); - HloModule module(TestName()); - module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); - HloInstruction* old_root = module.entry_computation()->root_instruction(); - InsertCopies(&module); - HloInstruction* new_root = module.entry_computation()->root_instruction(); + HloInstruction* old_root = module->entry_computation()->root_instruction(); + InsertCopies(module.get()); - // Check path from 'new_root' to 'old_root'. - OperandTree op_tree; - op_tree.opcode = HloOpcode::kTuple; - - op_tree.op(0).opcode = HloOpcode::kCopy; - op_tree.op(0).op(0).opcode = HloOpcode::kGetTupleElement; - op_tree.op(0).op(0).op(0).opcode = HloOpcode::kTuple; - op_tree.op(0).op(0).op(0).instruction = old_root; - - op_tree.Check(new_root); + EXPECT_THAT(module->entry_computation()->root_instruction(), + op::Tuple(op::Copy(old_root->operand(0)))); } TEST_F(CopyInsertionTest, SingleConstant) { @@ -127,25 +91,16 @@ TEST_F(CopyInsertionTest, SingleConstant) { HloInstruction* tuple = builder.AddInstruction(HloInstruction::CreateTuple({constant})); - EXPECT_INST(constant->users(), tuple); + EXPECT_THAT(constant->users(), UnorderedElementsAre(tuple)); - HloModule module(TestName()); - module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); - HloInstruction* old_root = module.entry_computation()->root_instruction(); - InsertCopies(&module); - HloInstruction* new_root = module.entry_computation()->root_instruction(); + HloInstruction* old_root = module->entry_computation()->root_instruction(); + InsertCopies(module.get()); - // Check path from 'new_root' to 'old_root'. - OperandTree op_tree; - op_tree.opcode = HloOpcode::kTuple; - - op_tree.op(0).opcode = HloOpcode::kCopy; - op_tree.op(0).op(0).opcode = HloOpcode::kGetTupleElement; - op_tree.op(0).op(0).op(0).opcode = HloOpcode::kTuple; - op_tree.op(0).op(0).op(0).instruction = old_root; - - op_tree.Check(new_root); + EXPECT_THAT(module->entry_computation()->root_instruction(), + op::Tuple(op::Copy(old_root->operand(0)))); } TEST_F(CopyInsertionTest, MultipleConstantsAndParameters) { @@ -169,35 +124,15 @@ TEST_F(CopyInsertionTest, MultipleConstantsAndParameters) { builder.AddInstruction(HloInstruction::CreateTuple({constant2, x, add})); - HloModule module(TestName()); - module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); - HloInstruction* old_root = module.entry_computation()->root_instruction(); - InsertCopies(&module); - HloInstruction* new_root = module.entry_computation()->root_instruction(); + HloInstruction* old_root = module->entry_computation()->root_instruction(); + InsertCopies(module.get()); - // "constant2" and parameter "x" are pointed to by the tuple and should be - // copied. - - // Check all paths from 'new_root' to 'old_root'. 
- OperandTree op_tree; - op_tree.opcode = HloOpcode::kTuple; - - op_tree.op(0).opcode = HloOpcode::kCopy; - op_tree.op(0).op(0).opcode = HloOpcode::kGetTupleElement; - op_tree.op(0).op(0).op(0).opcode = HloOpcode::kTuple; - op_tree.op(0).op(0).op(0).instruction = old_root; - - op_tree.op(1).opcode = HloOpcode::kCopy; - op_tree.op(1).op(0).opcode = HloOpcode::kGetTupleElement; - op_tree.op(1).op(0).op(0).opcode = HloOpcode::kTuple; - op_tree.op(1).op(0).op(0).instruction = old_root; - - op_tree.op(2).opcode = HloOpcode::kGetTupleElement; - op_tree.op(2).op(0).opcode = HloOpcode::kTuple; - op_tree.op(2).op(0).instruction = old_root; - - op_tree.Check(new_root); + EXPECT_THAT(module->entry_computation()->root_instruction(), + op::Tuple(op::Copy(old_root->operand(0)), + op::Copy(old_root->operand(1)), old_root->operand(2))); } TEST_F(CopyInsertionTest, AmbiguousPointsToSet) { @@ -221,32 +156,19 @@ TEST_F(CopyInsertionTest, AmbiguousPointsToSet) { builder.AddInstruction(HloInstruction::CreateTernary( tuple1->shape(), HloOpcode::kSelect, pred, tuple1, tuple2)); - EXPECT_INST(constant1->users(), tuple1); - EXPECT_INST(constant2->users(), tuple1, tuple2); - EXPECT_INST(constant3->users(), tuple2); + EXPECT_THAT(constant1->users(), UnorderedElementsAre(tuple1)); + EXPECT_THAT(constant2->users(), UnorderedElementsAre(tuple1, tuple2)); + EXPECT_THAT(constant3->users(), UnorderedElementsAre(tuple2)); - HloModule module(TestName()); - module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); - HloInstruction* old_root = module.entry_computation()->root_instruction(); - InsertCopies(&module); - HloInstruction* new_root = module.entry_computation()->root_instruction(); + HloInstruction* old_root = module->entry_computation()->root_instruction(); + InsertCopies(module.get()); - // Check all paths from 'new_root' to 'old_root'. - OperandTree op_tree; - op_tree.opcode = HloOpcode::kTuple; - - op_tree.op(0).opcode = HloOpcode::kCopy; - op_tree.op(0).op(0).opcode = HloOpcode::kGetTupleElement; - op_tree.op(0).op(0).op(0).opcode = HloOpcode::kSelect; - op_tree.op(0).op(0).op(0).instruction = old_root; - - op_tree.op(1).opcode = HloOpcode::kCopy; - op_tree.op(1).op(0).opcode = HloOpcode::kGetTupleElement; - op_tree.op(1).op(0).op(0).opcode = HloOpcode::kSelect; - op_tree.op(1).op(0).op(0).instruction = old_root; - - op_tree.Check(new_root); + EXPECT_THAT(module->entry_computation()->root_instruction(), + op::Tuple(op::Copy(op::GetTupleElement(old_root)), + op::Copy(op::GetTupleElement(old_root)))); } TEST_F(CopyInsertionTest, BitcastParameter) { @@ -258,22 +180,16 @@ TEST_F(CopyInsertionTest, BitcastParameter) { HloInstruction* bitcast = builder.AddInstruction(HloInstruction::CreateUnary( ShapeUtil::MakeShape(F32, {2, 2}), HloOpcode::kBitcast, x)); - HloModule module(TestName()); - module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); - EXPECT_INST(x->users(), bitcast); + EXPECT_THAT(x->users(), UnorderedElementsAre(bitcast)); - HloInstruction* old_root = module.entry_computation()->root_instruction(); - InsertCopies(&module); - HloInstruction* new_root = module.entry_computation()->root_instruction(); + HloInstruction* old_root = module->entry_computation()->root_instruction(); + InsertCopies(module.get()); - // Check path from 'new_root' to 'old_root'. 
- OperandTree op_tree; - op_tree.opcode = HloOpcode::kCopy; - op_tree.op(0).opcode = HloOpcode::kBitcast; - op_tree.op(0).instruction = old_root; - - op_tree.Check(new_root); + EXPECT_THAT(module->entry_computation()->root_instruction(), + op::Copy(old_root)); } TEST_F(CopyInsertionTest, BitcastConstant) { @@ -286,22 +202,16 @@ TEST_F(CopyInsertionTest, BitcastConstant) { HloInstruction* bitcast = builder.AddInstruction(HloInstruction::CreateUnary( ShapeUtil::MakeShape(F32, {2, 2}), HloOpcode::kBitcast, constant)); - HloModule module(TestName()); - module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); - EXPECT_INST(constant->users(), bitcast); + EXPECT_THAT(constant->users(), UnorderedElementsAre(bitcast)); - HloInstruction* old_root = module.entry_computation()->root_instruction(); - InsertCopies(&module); - HloInstruction* new_root = module.entry_computation()->root_instruction(); + HloInstruction* old_root = module->entry_computation()->root_instruction(); + InsertCopies(module.get()); - // Check path from 'new_root' to 'old_root'. - OperandTree op_tree; - op_tree.opcode = HloOpcode::kCopy; - op_tree.op(0).opcode = HloOpcode::kBitcast; - op_tree.op(0).instruction = old_root; - - op_tree.Check(new_root); + EXPECT_THAT(module->entry_computation()->root_instruction(), + op::Copy(old_root)); } TEST_F(CopyInsertionTest, BitcastTupleElementParameter) { @@ -313,25 +223,16 @@ TEST_F(CopyInsertionTest, BitcastTupleElementParameter) { ShapeUtil::MakeShape(F32, {2, 2}), HloOpcode::kBitcast, x)); builder.AddInstruction(HloInstruction::CreateTuple({bitcast})); - HloModule module(TestName()); - module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); - EXPECT_EQ(1, x->user_count()); - EXPECT_EQ(*x->users().begin(), bitcast); + EXPECT_THAT(x->users(), UnorderedElementsAre(bitcast)); - HloInstruction* old_root = module.entry_computation()->root_instruction(); - InsertCopies(&module); - HloInstruction* new_root = module.entry_computation()->root_instruction(); + HloInstruction* old_root = module->entry_computation()->root_instruction(); + InsertCopies(module.get()); - // Check path from 'new_root' to 'old_root'. 
- OperandTree op_tree; - op_tree.opcode = HloOpcode::kTuple; - op_tree.op(0).opcode = HloOpcode::kCopy; - op_tree.op(0).op(0).opcode = HloOpcode::kGetTupleElement; - op_tree.op(0).op(0).op(0).opcode = HloOpcode::kTuple; - op_tree.op(0).op(0).op(0).instruction = old_root; - - op_tree.Check(new_root); + EXPECT_THAT(module->entry_computation()->root_instruction(), + op::Tuple(op::Copy(old_root->operand(0)))); } TEST_F(CopyInsertionTest, NestedTupleParameter) { @@ -342,47 +243,31 @@ TEST_F(CopyInsertionTest, NestedTupleParameter) { // Param shape is: ((F32[], S32[1,2,3]), F32[42]) builder.AddInstruction(HloInstruction::CreateParameter( - 0, ShapeUtil::MakeTupleShape( - {ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {}), - ShapeUtil::MakeShape(S32, {1, 2, 3})}), - ShapeUtil::MakeShape(F32, {42})}), + 0, + ShapeUtil::MakeTupleShape( + {ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {}), + ShapeUtil::MakeShape(S32, {1, 2, 3})}), + ShapeUtil::MakeShape(F32, {42})}), "param0")); - HloModule module(TestName()); - module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); EXPECT_EQ(HloOpcode::kParameter, - module.entry_computation()->root_instruction()->opcode()); + module->entry_computation()->root_instruction()->opcode()); - HloInstruction* old_root = module.entry_computation()->root_instruction(); - InsertCopies(&module); - HloInstruction* new_root = module.entry_computation()->root_instruction(); + HloInstruction* old_root = module->entry_computation()->root_instruction(); + InsertCopies(module.get()); + HloInstruction* new_root = module->entry_computation()->root_instruction(); EXPECT_NE(old_root, new_root); - // Check all paths from 'new_root' to 'old_root'. - OperandTree op_tree; - op_tree.opcode = HloOpcode::kTuple; - - op_tree.op(0).opcode = HloOpcode::kTuple; - op_tree.op(0).op(0).opcode = HloOpcode::kCopy; - op_tree.op(0).op(0).op(0).opcode = HloOpcode::kGetTupleElement; - op_tree.op(0).op(0).op(0).op(0).opcode = HloOpcode::kGetTupleElement; - op_tree.op(0).op(0).op(0).op(0).op(0).opcode = HloOpcode::kParameter; - op_tree.op(0).op(0).op(0).op(0).op(0).instruction = old_root; - - op_tree.op(0).opcode = HloOpcode::kTuple; - op_tree.op(0).op(1).opcode = HloOpcode::kCopy; - op_tree.op(0).op(1).op(0).opcode = HloOpcode::kGetTupleElement; - op_tree.op(0).op(1).op(0).op(0).opcode = HloOpcode::kGetTupleElement; - op_tree.op(0).op(1).op(0).op(0).op(0).opcode = HloOpcode::kParameter; - op_tree.op(0).op(1).op(0).op(0).op(0).instruction = old_root; - - op_tree.op(1).opcode = HloOpcode::kCopy; - op_tree.op(1).op(0).opcode = HloOpcode::kGetTupleElement; - op_tree.op(1).op(0).op(0).opcode = HloOpcode::kParameter; - op_tree.op(1).op(0).op(0).instruction = old_root; - - op_tree.Check(new_root); + EXPECT_THAT( + new_root, + op::Tuple( + op::Tuple( + op::Copy(op::GetTupleElement(op::GetTupleElement(old_root))), + op::Copy(op::GetTupleElement(op::GetTupleElement(old_root)))), + op::Copy(op::GetTupleElement(old_root)))); } TEST_F(CopyInsertionTest, ElementOfNestedTupleParameter) { @@ -392,10 +277,11 @@ TEST_F(CopyInsertionTest, ElementOfNestedTupleParameter) { // Param shape is: ((F32[], S32[1,2,3]), F32[42]) auto param = builder.AddInstruction(HloInstruction::CreateParameter( - 0, ShapeUtil::MakeTupleShape( - {ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {}), - ShapeUtil::MakeShape(S32, {1, 2, 3})}), - ShapeUtil::MakeShape(F32, {42})}), + 0, + ShapeUtil::MakeTupleShape( + 
{ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {}),
+                                 ShapeUtil::MakeShape(S32, {1, 2, 3})}),
+           ShapeUtil::MakeShape(F32, {42})}),
       "param0"));
 
   // The return value of the computation is the zero-th element of the nested
@@ -403,30 +289,17 @@ TEST_F(CopyInsertionTest, ElementOfNestedTupleParameter) {
   auto gte = builder.AddInstruction(HloInstruction::CreateGetTupleElement(
       ShapeUtil::GetSubshape(param->shape(), {0}), param, 0));
 
-  HloModule module(TestName());
-  module.AddEntryComputation(builder.Build());
+  auto module = CreateNewModule();
+  module->AddEntryComputation(builder.Build());
 
-  EXPECT_EQ(gte, module.entry_computation()->root_instruction());
+  EXPECT_EQ(gte, module->entry_computation()->root_instruction());
 
-  HloInstruction* old_root = module.entry_computation()->root_instruction();
-  InsertCopies(&module);
-  HloInstruction* new_root = module.entry_computation()->root_instruction();
+  HloInstruction* old_root = module->entry_computation()->root_instruction();
+  InsertCopies(module.get());
 
-  // Check all paths from 'new_root' to 'old_root'.
-  OperandTree op_tree;
-  op_tree.opcode = HloOpcode::kTuple;
-
-  op_tree.op(0).opcode = HloOpcode::kCopy;
-  op_tree.op(0).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(0).op(0).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(0).op(0).op(0).instruction = old_root;
-
-  op_tree.op(1).opcode = HloOpcode::kCopy;
-  op_tree.op(1).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(1).op(0).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(1).op(0).op(0).instruction = old_root;
-
-  op_tree.Check(new_root);
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              op::Tuple(op::Copy(op::GetTupleElement(old_root)),
+                        op::Copy(op::GetTupleElement(old_root))));
 }
 
 TEST_F(CopyInsertionTest, AmbiguousTopLevelRoot) {
@@ -452,27 +325,21 @@ TEST_F(CopyInsertionTest, AmbiguousTopLevelRoot) {
   builder.AddInstruction(HloInstruction::CreateGetTupleElement(
       ShapeUtil::GetSubshape(select->shape(), {0}), select, 0));
 
-  HloModule module(TestName());
-  module.AddEntryComputation(builder.Build());
+  auto module = CreateNewModule();
+  module->AddEntryComputation(builder.Build());
 
-  EXPECT_EQ(gte, module.entry_computation()->root_instruction());
+  EXPECT_EQ(gte, module->entry_computation()->root_instruction());
 
-  HloInstruction* old_root = module.entry_computation()->root_instruction();
-  InsertCopies(&module);
-  HloInstruction* new_root = module.entry_computation()->root_instruction();
+  HloInstruction* old_root = module->entry_computation()->root_instruction();
+  InsertCopies(module.get());
 
-  // Check path from 'new_root' to 'old_root'.
-  OperandTree op_tree;
-  op_tree.opcode = HloOpcode::kCopy;
-  op_tree.op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(0).instruction = old_root;
-
-  op_tree.Check(new_root);
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              op::Copy(old_root));
 }
 
 class WhileCopyInsertionTest : public CopyInsertionTest {
  protected:
-  WhileCopyInsertionTest() : module_(TestName()) {}
+  WhileCopyInsertionTest() : module_(CreateNewModule()) {}
 
   // Builds a While condition computation which reads the induction variable
   // from the tuple parameter, and returns a predicate indicating whether this
@@ -530,8 +397,48 @@ class WhileCopyInsertionTest : public CopyInsertionTest {
     return builder.Build();
   }
 
-  // Builds a While body computation with read-only tuple element 0.
+  // Builds a While body computation with output tuple elements dependent on
+  // both input tuple elements.
+ // + // EX: Body({in0, in1, in2}) + // out0 = Add(in0, 1) + // out1 = in1 + // out2 = in2 + // Tuple(out0, out1, out2) + std::unique_ptr BuildDependentBodyComputation2() { + auto builder = HloComputation::Builder(TestName() + ".Body"); + + const Shape& loop_state_shape = ShapeUtil::MakeTupleShape( + {induction_variable_shape_, data_shape_, data_shape_}); + + auto loop_state = builder.AddInstruction( + HloInstruction::CreateParameter(0, loop_state_shape, "loop_state")); + + // Update the induction variable GTE(0). + auto induction_variable = + builder.AddInstruction(HloInstruction::CreateGetTupleElement( + induction_variable_shape_, loop_state, 0)); + auto inc = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1))); + + // add0 = Add(in0, 1) + auto add0 = builder.AddInstruction(HloInstruction::CreateBinary( + induction_variable->shape(), HloOpcode::kAdd, induction_variable, inc)); + // data1 = GTE(1). + HloInstruction* data1 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(data_shape_, loop_state, 1)); + + // data2 = GTE(2). + HloInstruction* data2 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(data_shape_, loop_state, 2)); + + // Create output Tuple. + builder.AddInstruction(HloInstruction::CreateTuple({add0, data1, data2})); + + return builder.Build(); + } + + // Builds a While body computation with read-only tuple element 0. // EX: // Body({in0, in1}) // out0 = in0 @@ -549,6 +456,7 @@ class WhileCopyInsertionTest : public CopyInsertionTest { // Update data GTE(1). auto data = builder.AddInstruction( HloInstruction::CreateGetTupleElement(data_shape_, loop_state, 1)); + // Use 'induction_variable' in computation with no path to output tuple. auto update = builder.AddInstruction( HloInstruction::CreateBroadcast(data_shape_, induction_variable, {8})); @@ -566,11 +474,15 @@ class WhileCopyInsertionTest : public CopyInsertionTest { // out0 = Add(in0, 1) // out1 = Add(in1, {1, 1, 1, 1, 1, 1, 1, 1}) // Tuple(out0, out1) - std::unique_ptr BuildIndependentBodyComputation() { + std::unique_ptr BuildIndependentBodyComputation( + bool nested = false) { auto builder = HloComputation::Builder(TestName() + ".Body"); // Create param instruction to access loop state. + const Shape& loop_state_shape = + nested ? nested_loop_state_shape_ : loop_state_shape_; + auto loop_state = builder.AddInstruction( - HloInstruction::CreateParameter(0, loop_state_shape_, "loop_state")); + HloInstruction::CreateParameter(0, loop_state_shape, "loop_state")); // Update the induction variable GTE(0). auto induction_variable = builder.AddInstruction(HloInstruction::CreateGetTupleElement( @@ -581,16 +493,30 @@ class WhileCopyInsertionTest : public CopyInsertionTest { auto add0 = builder.AddInstruction(HloInstruction::CreateBinary( induction_variable->shape(), HloOpcode::kAdd, induction_variable, inc)); // Update data GTE(1). 
- auto data = builder.AddInstruction( - HloInstruction::CreateGetTupleElement(data_shape_, loop_state, 1)); + HloInstruction* data = nullptr; + if (nested) { + data = builder.AddInstruction(HloInstruction::CreateGetTupleElement( + nested_tuple_shape_, loop_state, 1)); + data = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(data_shape_, data, 0)); + } else { + data = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(data_shape_, loop_state, 1)); + } auto update = builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR1( {1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f}))); - // add0 = Add(in1, {1, 1, 1, 1, 1, 1, 1, 1}) + // add1 = Add(in1, {1, 1, 1, 1, 1, 1, 1, 1}) auto add1 = builder.AddInstruction(HloInstruction::CreateBinary( data_shape_, HloOpcode::kAdd, data, update)); // Create output Tuple. - builder.AddInstruction(HloInstruction::CreateTuple({add0, add1})); + if (nested) { + auto nested_tuple = + builder.AddInstruction(HloInstruction::CreateTuple({add1, add1})); + builder.AddInstruction(HloInstruction::CreateTuple({add0, nested_tuple})); + } else { + builder.AddInstruction(HloInstruction::CreateTuple({add0, add1})); + } return builder.Build(); } @@ -643,8 +569,9 @@ class WhileCopyInsertionTest : public CopyInsertionTest { // Builds a While instruction using 'condition' and 'body' sub-computations. // Init operand is initialized to zeros of appropriate shape. - void BuildWhileInstruction(HloComputation* condition, HloComputation* body, - bool nested = false) { + HloInstruction* BuildWhileInstruction(HloComputation* condition, + HloComputation* body, + bool nested = false) { auto builder = HloComputation::Builder(TestName() + ".While"); auto induction_var_init = builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR0(0))); @@ -658,17 +585,18 @@ class WhileCopyInsertionTest : public CopyInsertionTest { HloInstruction::CreateTuple({data_init, data_init})); auto loop_state_init = builder.AddInstruction( HloInstruction::CreateTuple({induction_var_init, inner_init})); - builder.AddInstruction(HloInstruction::CreateWhile( + auto while_hlo = builder.AddInstruction(HloInstruction::CreateWhile( loop_state_shape_, condition, body, loop_state_init)); - module_.AddEntryComputation(builder.Build()); - return; + module_->AddEntryComputation(builder.Build()); + return while_hlo; } auto loop_state_init = builder.AddInstruction( HloInstruction::CreateTuple({induction_var_init, data_init})); - builder.AddInstruction(HloInstruction::CreateWhile( + auto while_hlo = builder.AddInstruction(HloInstruction::CreateWhile( loop_state_shape_, condition, body, loop_state_init)); - module_.AddEntryComputation(builder.Build()); + module_->AddEntryComputation(builder.Build()); + return while_hlo; } HloInstruction* BuildWhileInstruction_InitPointsToConstant() { @@ -746,21 +674,23 @@ class WhileCopyInsertionTest : public CopyInsertionTest { HloInstruction* BuildWhileInstructionWithCustomInit( const Shape& loop_state_shape, HloInstruction* data_init, HloComputation::Builder* builder) { + const bool nested = + ShapeUtil::Equal(loop_state_shape, nested_loop_state_shape_); auto induction_var_init = builder->AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR0(0))); auto condition = - module_.AddEmbeddedComputation(BuildConditionComputation()); - auto body = - module_.AddEmbeddedComputation(BuildIndependentBodyComputation()); + module_->AddEmbeddedComputation(BuildConditionComputation(nested)); + auto body = module_->AddEmbeddedComputation( + 
BuildIndependentBodyComputation(nested)); auto loop_state_init = builder->AddInstruction( HloInstruction::CreateTuple({induction_var_init, data_init})); auto while_hlo = builder->AddInstruction(HloInstruction::CreateWhile( loop_state_shape, condition, body, loop_state_init)); - module_.AddEntryComputation(builder->Build()); + module_->AddEntryComputation(builder->Build()); return while_hlo; } - HloModule module_; + std::unique_ptr module_; Shape induction_variable_shape_ = ShapeUtil::MakeShape(S32, {}); Shape data_shape_ = ShapeUtil::MakeShape(F32, {8}); Shape loop_state_shape_ = @@ -782,16 +712,23 @@ class WhileCopyInsertionTest : public CopyInsertionTest { // CopyInsertion pass should not generate any copies. // TEST_F(WhileCopyInsertionTest, IndependentTupleElements) { - auto condition = module_.AddEmbeddedComputation(BuildConditionComputation()); - auto body = module_.AddEmbeddedComputation(BuildIndependentBodyComputation()); - BuildWhileInstruction(condition, body); + auto condition = module_->AddEmbeddedComputation(BuildConditionComputation()); + auto body = + module_->AddEmbeddedComputation(BuildIndependentBodyComputation()); + auto while_hlo = BuildWhileInstruction(condition, body); + const HloInstruction* old_init = while_hlo->operand(0); HloInstruction* old_root = body->root_instruction(); - InsertCopies(&module_); + InsertCopies(module_.get()); HloInstruction* new_root = body->root_instruction(); + const HloInstruction* new_init = while_hlo->operand(0); // No copies should be inserted so root should not be updated. - CHECK_EQ(old_root, new_root); + EXPECT_EQ(old_root, new_root); + + // Both init indices need copies. + EXPECT_THAT(new_init, op::Tuple(op::Copy(old_init->operand(0)), + op::Copy(old_init->operand(1)))); } // Tests while body computation with dependent tuple elements: @@ -801,39 +738,25 @@ TEST_F(WhileCopyInsertionTest, IndependentTupleElements) { // out1 = Add(BCast(in0), in1) // Tuple(out0, out1) // -// CopyInsertion pass should generate: +// CopyInsertion pass should convert the root instruction to: // -// Tuple // old root -// / \ -// GTE(0) GTE(1) -// | | -// Copy | -// \ / -// Tuple // new root +// Tuple(Copy(out0), out1) // TEST_F(WhileCopyInsertionTest, DependentTupleElements) { - auto condition = module_.AddEmbeddedComputation(BuildConditionComputation()); - auto body = module_.AddEmbeddedComputation(BuildDependentBodyComputation()); - BuildWhileInstruction(condition, body); + auto condition = module_->AddEmbeddedComputation(BuildConditionComputation()); + auto body = module_->AddEmbeddedComputation(BuildDependentBodyComputation()); + auto while_hlo = BuildWhileInstruction(condition, body); + const HloInstruction* old_init = while_hlo->operand(0); HloInstruction* old_root = body->root_instruction(); - InsertCopies(&module_); + InsertCopies(module_.get()); HloInstruction* new_root = body->root_instruction(); + const HloInstruction* new_init = while_hlo->operand(0); - // Check all paths from 'new_root' to 'old_root'. 
- OperandTree op_tree; - op_tree.opcode = HloOpcode::kTuple; - - op_tree.op(0).opcode = HloOpcode::kCopy; - op_tree.op(0).op(0).opcode = HloOpcode::kGetTupleElement; - op_tree.op(0).op(0).op(0).opcode = HloOpcode::kTuple; - op_tree.op(0).op(0).op(0).instruction = old_root; - - op_tree.op(1).opcode = HloOpcode::kGetTupleElement; - op_tree.op(1).op(0).opcode = HloOpcode::kTuple; - op_tree.op(1).op(0).instruction = old_root; - - op_tree.Check(new_root); + EXPECT_THAT(new_root, + op::Tuple(op::Copy(old_root->operand(0)), old_root->operand(1))); + EXPECT_THAT(new_init, op::Tuple(op::Copy(old_init->operand(0)), + op::Copy(old_init->operand(1)))); } // Tests while body computation with read-only tuple element 0: @@ -849,20 +772,113 @@ TEST_F(WhileCopyInsertionTest, DependentTupleElements) { // \ / // TUPLE (root) // -// CopyInsertion pass should not generate any copies. -// +// CopyInsertion pass should not generate any copies for the while body. TEST_F(WhileCopyInsertionTest, DependentTupleElements_OneReadOnly) { - auto condition = module_.AddEmbeddedComputation(BuildConditionComputation()); - auto body = module_.AddEmbeddedComputation( + auto condition = module_->AddEmbeddedComputation(BuildConditionComputation()); + auto body = module_->AddEmbeddedComputation( BuildDependentBodyOneReadOnlyComputation()); - BuildWhileInstruction(condition, body); + auto while_hlo = BuildWhileInstruction(condition, body); + const HloInstruction* old_init = while_hlo->operand(0); HloInstruction* old_root = body->root_instruction(); - InsertCopies(&module_); + InsertCopies(module_.get()); HloInstruction* new_root = body->root_instruction(); + const HloInstruction* new_init = while_hlo->operand(0); - // No copies should be inserted so root should not be updated. - CHECK_EQ(old_root, new_root); + // No copies should be inserted in the body, so root should not be updated. + EXPECT_EQ(old_root, new_root); + + // Both indices need copies, even though Index 0 is read-only, since both are + // constants, which must be copied. + EXPECT_THAT(new_init, op::Tuple(op::Copy(old_init->operand(0)), + op::Copy(old_init->operand(1)))); +} + +// Same as above, but with two while loops, sharing entry parameters. +TEST_F(WhileCopyInsertionTest, + DependentTupleElements_OneReadOnly_TwoLoops_EntryParams) { + auto condition1 = + module_->AddEmbeddedComputation(BuildConditionComputation()); + auto condition2 = + module_->AddEmbeddedComputation(BuildConditionComputation()); + auto body1 = module_->AddEmbeddedComputation( + BuildDependentBodyOneReadOnlyComputation()); + auto body2 = module_->AddEmbeddedComputation( + BuildDependentBodyOneReadOnlyComputation()); + + auto builder = HloComputation::Builder(TestName() + ".While"); + auto iter_param = builder.AddInstruction( + HloInstruction::CreateParameter(0, induction_variable_shape_, "iter")); + auto data_param = builder.AddInstruction( + HloInstruction::CreateParameter(1, data_shape_, "data")); + auto loop_init = builder.AddInstruction( + HloInstruction::CreateTuple({iter_param, data_param})); + + auto while_hlo1 = builder.AddInstruction(HloInstruction::CreateWhile( + loop_state_shape_, condition1, body1, loop_init)); + auto while_hlo2 = builder.AddInstruction(HloInstruction::CreateWhile( + loop_state_shape_, condition2, body2, loop_init)); + module_->AddEntryComputation(builder.Build()); + + InsertCopies(module_.get()); + + // Both while loops alias iter_param, since index 0 is read-only in the body. 
+ EXPECT_EQ(while_hlo1->operand(0)->operand(0), + while_hlo2->operand(0)->operand(0)); + EXPECT_EQ(while_hlo1->operand(0)->operand(0), iter_param); + + // Each while loop gets its own copy of data_param, since index 1 is not + // read-only in the body. + EXPECT_NE(while_hlo1->operand(0)->operand(1), + while_hlo2->operand(0)->operand(1)); + EXPECT_THAT(while_hlo1->operand(0)->operand(1), op::Copy(data_param)); + EXPECT_THAT(while_hlo2->operand(0)->operand(1), op::Copy(data_param)); +} + +// Same as above, but with two while loops, sharing non-parameters. +TEST_F(WhileCopyInsertionTest, + DependentTupleElements_OneReadOnly_TwoLoops_NonParams) { + auto condition1 = + module_->AddEmbeddedComputation(BuildConditionComputation()); + auto condition2 = + module_->AddEmbeddedComputation(BuildConditionComputation()); + auto body1 = module_->AddEmbeddedComputation( + BuildDependentBodyOneReadOnlyComputation()); + auto body2 = module_->AddEmbeddedComputation( + BuildDependentBodyOneReadOnlyComputation()); + + auto builder = HloComputation::Builder(TestName() + ".While"); + auto iter_param = builder.AddInstruction( + HloInstruction::CreateParameter(0, induction_variable_shape_, "iter")); + auto data_param = builder.AddInstruction( + HloInstruction::CreateParameter(1, data_shape_, "data")); + // Add dummy ops to ensure loop_init elements aren't entry parameters. + auto iter_value = builder.AddInstruction(HloInstruction::CreateUnary( + iter_param->shape(), HloOpcode::kExp, iter_param)); + auto data_value = builder.AddInstruction(HloInstruction::CreateUnary( + data_param->shape(), HloOpcode::kExp, data_param)); + auto loop_init = builder.AddInstruction( + HloInstruction::CreateTuple({iter_value, data_value})); + + auto while_hlo1 = builder.AddInstruction(HloInstruction::CreateWhile( + loop_state_shape_, condition1, body1, loop_init)); + auto while_hlo2 = builder.AddInstruction(HloInstruction::CreateWhile( + loop_state_shape_, condition2, body2, loop_init)); + module_->AddEntryComputation(builder.Build()); + + InsertCopies(module_.get()); + + // No copies of iter_value are necessary, since index 0 is read-only in both + // while bodies. + EXPECT_EQ(while_hlo1->operand(0)->operand(0), iter_value); + EXPECT_EQ(while_hlo2->operand(0)->operand(0), iter_value); + + // Each while loop gets its own copy of data_value, since index 1 is not + // read-only in the body. 
+ EXPECT_NE(while_hlo1->operand(0)->operand(1), + while_hlo2->operand(0)->operand(1)); + EXPECT_THAT(while_hlo1->operand(0)->operand(1), op::Copy(data_value)); + EXPECT_THAT(while_hlo2->operand(0)->operand(1), op::Copy(data_value)); } // Tests while body computation with nested tuple elements: @@ -875,7 +891,8 @@ TEST_F(WhileCopyInsertionTest, DependentTupleElements_OneReadOnly) { // Add Reverse // | | // -// CopyInsertion pass should generate: +// CopyInsertion pass will conceptually generate the following, but with the +// actual GTE and Tuple instructions optimized away: // // Tuple // old root // / \ @@ -895,110 +912,47 @@ TEST_F(WhileCopyInsertionTest, DependentTupleElements_OneReadOnly) { // TEST_F(WhileCopyInsertionTest, NestedTupleElements) { auto condition = - module_.AddEmbeddedComputation(BuildConditionComputation(true)); - auto body = module_.AddEmbeddedComputation(BuildNestedBodyComputation()); + module_->AddEmbeddedComputation(BuildConditionComputation(true)); + auto body = module_->AddEmbeddedComputation(BuildNestedBodyComputation()); BuildWhileInstruction(condition, body, true); HloInstruction* old_root = body->root_instruction(); - InsertCopies(&module_); - HloInstruction* new_root = body->root_instruction(); + InsertCopies(module_.get()); - // Check all paths from 'new_root' to 'old_root'. - OperandTree op_tree; - op_tree.opcode = HloOpcode::kTuple; - - op_tree.op(0).opcode = HloOpcode::kGetTupleElement; - op_tree.op(0).op(0).opcode = HloOpcode::kTuple; - op_tree.op(0).op(0).instruction = old_root; - - op_tree.op(1).opcode = HloOpcode::kTuple; - - op_tree.op(1).op(0).opcode = HloOpcode::kGetTupleElement; - op_tree.op(1).op(0).op(0).opcode = HloOpcode::kGetTupleElement; - op_tree.op(1).op(0).op(0).op(0).opcode = HloOpcode::kTuple; - op_tree.op(1).op(0).op(0).op(0).instruction = old_root; - - op_tree.op(1).op(1).opcode = HloOpcode::kCopy; - op_tree.op(1).op(1).op(0).opcode = HloOpcode::kGetTupleElement; - op_tree.op(1).op(1).op(0).op(0).opcode = HloOpcode::kGetTupleElement; - op_tree.op(1).op(1).op(0).op(0).op(0).opcode = HloOpcode::kTuple; - op_tree.op(1).op(1).op(0).op(0).op(0).instruction = old_root; - - op_tree.Check(new_root); + EXPECT_THAT(body->root_instruction(), + op::Tuple(old_root->operand(0), + op::Tuple(old_root->operand(1)->operand(0), + op::Copy(old_root->operand(1)->operand(1))))); } // Tests while init instruction which points-to a constant. // // init = Tuple(Constant(S32, {}), Constant(F32, {8})) // -// CopyInsertion pass should generate: -// -// Tuple // old init -// / \ -// GTE(0) GTE(1) -// | | -// Copy Copy -// \ / -// Tuple // new init +// CopyInsertion pass should add copies for both constants. // TEST_F(WhileCopyInsertionTest, InitPointsToConstant) { auto while_hlo = BuildWhileInstruction_InitPointsToConstant(); auto old_init = while_hlo->operand(0); - InsertCopies(&module_); - auto new_init = while_hlo->operand(0); + InsertCopies(module_.get()); - // Check all paths from 'new_init' to 'old_init'. 
- OperandTree op_tree; - op_tree.opcode = HloOpcode::kTuple; - - op_tree.op(0).opcode = HloOpcode::kCopy; - op_tree.op(0).op(0).opcode = HloOpcode::kGetTupleElement; - op_tree.op(0).op(0).op(0).opcode = HloOpcode::kTuple; - op_tree.op(0).op(0).op(0).instruction = old_init; - - op_tree.op(1).opcode = HloOpcode::kCopy; - op_tree.op(1).op(0).opcode = HloOpcode::kGetTupleElement; - op_tree.op(1).op(0).op(0).opcode = HloOpcode::kTuple; - op_tree.op(1).op(0).op(0).instruction = old_init; - - op_tree.Check(new_init); + EXPECT_THAT(while_hlo->operand(0), op::Tuple(op::Copy(old_init->operand(0)), + op::Copy(old_init->operand(1)))); } // Tests while init instruction which points-to a parameter. // // init = Tuple(Constant(S32, {}), Parameter(F32, {8})) // -// CopyInsertion pass should generate: -// -// Tuple // old init -// / \ -// GTE(0) GTE(1) -// | | -// Copy Copy -// \ / -// Tuple // new init +// CopyInsertion pass should add copies for both the constant and parameter. // TEST_F(WhileCopyInsertionTest, InitPointsToParameter) { auto while_hlo = BuildWhileInstruction_InitPointsToParameter(); auto old_init = while_hlo->operand(0); - InsertCopies(&module_); - auto new_init = while_hlo->operand(0); + InsertCopies(module_.get()); - // Check all paths from 'new_init' to 'old_init'. - OperandTree op_tree; - op_tree.opcode = HloOpcode::kTuple; - - op_tree.op(0).opcode = HloOpcode::kCopy; - op_tree.op(0).op(0).opcode = HloOpcode::kGetTupleElement; - op_tree.op(0).op(0).op(0).opcode = HloOpcode::kTuple; - op_tree.op(0).op(0).op(0).instruction = old_init; - - op_tree.op(1).opcode = HloOpcode::kCopy; - op_tree.op(1).op(0).opcode = HloOpcode::kGetTupleElement; - op_tree.op(1).op(0).op(0).opcode = HloOpcode::kTuple; - op_tree.op(1).op(0).op(0).instruction = old_init; - - op_tree.Check(new_init); + EXPECT_THAT(while_hlo->operand(0), op::Tuple(op::Copy(old_init->operand(0)), + op::Copy(old_init->operand(1)))); } // Tests while init instruction which has an ambiguous points-to set. @@ -1006,7 +960,8 @@ TEST_F(WhileCopyInsertionTest, InitPointsToParameter) { // select = Select(pred, tuple1, tuple2) // init = Tuple(Constant(S32, {}), Parameter(F32, {8})) // -// CopyInsertion pass should generate: +// CopyInsertion pass will conceptually generate the following, but with some of +// the actual GTE and Tuple instructions optimized away: // // Tuple // old init // / \ @@ -1027,40 +982,22 @@ TEST_F(WhileCopyInsertionTest, InitPointsToParameter) { TEST_F(WhileCopyInsertionTest, InitPointsToAmbiguous) { auto while_hlo = BuildWhileInstruction_InitPointsToAmbiguous(); auto old_init = while_hlo->operand(0); - InsertCopies(&module_); - auto new_init = while_hlo->operand(0); + InsertCopies(module_.get()); - // Check all paths from 'new_init' to 'old_init'. 
- OperandTree op_tree; - op_tree.opcode = HloOpcode::kTuple; - - op_tree.op(0).opcode = HloOpcode::kCopy; - op_tree.op(0).op(0).opcode = HloOpcode::kGetTupleElement; - op_tree.op(0).op(0).op(0).opcode = HloOpcode::kTuple; - op_tree.op(0).op(0).op(0).instruction = old_init; - - op_tree.op(1).opcode = HloOpcode::kTuple; - - op_tree.op(1).op(0).opcode = HloOpcode::kCopy; - op_tree.op(1).op(0).op(0).opcode = HloOpcode::kGetTupleElement; - op_tree.op(1).op(0).op(0).op(0).opcode = HloOpcode::kGetTupleElement; - op_tree.op(1).op(0).op(0).op(0).op(0).opcode = HloOpcode::kTuple; - op_tree.op(1).op(0).op(0).op(0).op(0).instruction = old_init; - - op_tree.op(1).op(1).opcode = HloOpcode::kCopy; - op_tree.op(1).op(1).op(0).opcode = HloOpcode::kGetTupleElement; - op_tree.op(1).op(1).op(0).op(0).opcode = HloOpcode::kGetTupleElement; - op_tree.op(1).op(1).op(0).op(0).op(0).opcode = HloOpcode::kTuple; - op_tree.op(1).op(1).op(0).op(0).op(0).instruction = old_init; - - op_tree.Check(new_init); + EXPECT_THAT( + while_hlo->operand(0), + op::Tuple( + op::Copy(old_init->operand(0)), + op::Tuple(op::Copy(op::GetTupleElement(old_init->operand(1))), + op::Copy(op::GetTupleElement(old_init->operand(1)))))); } // Tests while init instruction which has a non-distinct points-to set. // // init = Tuple(Constant(S32, {}), Tuple({vec_one, vec_one})) // -// CopyInsertion pass should generate: +// CopyInsertion pass will conceptually generate the following, but with some of +// the actual GTE and Tuple instructions optimized away: // // Tuple // old init // / \ @@ -1081,73 +1018,116 @@ TEST_F(WhileCopyInsertionTest, InitPointsToAmbiguous) { TEST_F(WhileCopyInsertionTest, InitPointsToNonDistinct) { auto while_hlo = BuildWhileInstruction_InitPointsToNonDistinct(); auto old_init = while_hlo->operand(0); - InsertCopies(&module_); - auto new_init = while_hlo->operand(0); + InsertCopies(module_.get()); - // Check all paths from 'new_init' to 'old_init'. - OperandTree op_tree; - op_tree.opcode = HloOpcode::kTuple; - - op_tree.op(0).opcode = HloOpcode::kCopy; - op_tree.op(0).op(0).opcode = HloOpcode::kGetTupleElement; - op_tree.op(0).op(0).op(0).opcode = HloOpcode::kTuple; - op_tree.op(0).op(0).op(0).instruction = old_init; - - op_tree.op(1).opcode = HloOpcode::kTuple; - - op_tree.op(1).op(0).opcode = HloOpcode::kCopy; - op_tree.op(1).op(0).op(0).opcode = HloOpcode::kGetTupleElement; - op_tree.op(1).op(0).op(0).op(0).opcode = HloOpcode::kGetTupleElement; - op_tree.op(1).op(0).op(0).op(0).op(0).opcode = HloOpcode::kTuple; - op_tree.op(1).op(0).op(0).op(0).op(0).instruction = old_init; - - op_tree.op(1).op(1).opcode = HloOpcode::kCopy; - op_tree.op(1).op(1).op(0).opcode = HloOpcode::kGetTupleElement; - op_tree.op(1).op(1).op(0).op(0).opcode = HloOpcode::kGetTupleElement; - op_tree.op(1).op(1).op(0).op(0).op(0).opcode = HloOpcode::kTuple; - op_tree.op(1).op(1).op(0).op(0).op(0).instruction = old_init; - - op_tree.Check(new_init); + EXPECT_THAT(while_hlo->operand(0), + op::Tuple(op::Copy(old_init->operand(0)), + op::Tuple(op::Copy(old_init->operand(1)->operand(0)), + op::Copy(old_init->operand(1)->operand(0))))); } -// Tests while init instruction buffer which interfers with while result buffer. +// Tests while init instruction buffer which interferes with while result +// buffer. // // init_data = Broadcast(...) 
 // add_unrelated = Add(init_data) // takes a reference to cause interference
 // init = Tuple(Constant(S32, {}), init_data)
 //
-// CopyInsertion pass should generate:
-//
-//                    Tuple  // old init
-//                   /     \
-//              GTE(0)    GTE(1)
-//                |          |
-//              Copy       Copy
-//                 \        /
-//                   Tuple  // new init
+// CopyInsertion pass should copy both operands.
 //
 TEST_F(WhileCopyInsertionTest, InitPointsToInterfering) {
   auto while_hlo = BuildWhileInstruction_InitPointsToInterfering();
   auto old_init = while_hlo->operand(0);
-  InsertCopies(&module_);
-  auto new_init = while_hlo->operand(0);
+  InsertCopies(module_.get());
 
-  // Check all paths from 'new_init' to 'old_init'.
-  OperandTree op_tree;
-  op_tree.opcode = HloOpcode::kTuple;
+  EXPECT_THAT(while_hlo->operand(0), op::Tuple(op::Copy(old_init->operand(0)),
+                                               op::Copy(old_init->operand(1))));
+}
 
-  op_tree.op(0).opcode = HloOpcode::kCopy;
-  op_tree.op(0).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(0).op(0).op(0).opcode = HloOpcode::kTuple;
-  op_tree.op(0).op(0).op(0).instruction = old_init;
+// Tests while init instruction buffer which has a non-distinct points-to set:
+//
+//     init = Tuple(Parameter(S32, {}), Parameter(F32, {8}),
+//                  Parameter(F32, {8}))
+//
+// where the second and third parameters are identical *and* the tuple is
+// shared by another while instruction.
+//
+// Verifies that the points-to set of the resulting Tuple is distinct (i.e.
+// the copies are non-identical Copy instructions). In other words, verifies
+// that copy sharing does not insert identical copies into the resulting tuple.
+TEST_F(WhileCopyInsertionTest, InitPointsToNonDistinctUsedByTwoWhileLoops) {
+  auto condition1 =
+      module_->AddEmbeddedComputation(BuildConditionComputation());
+  auto condition2 =
+      module_->AddEmbeddedComputation(BuildConditionComputation());
+  // Loop bodies whose output tuple elements depend on the init tuple.
+  auto body1 =
+      module_->AddEmbeddedComputation(BuildDependentBodyComputation2());
+  auto body2 =
+      module_->AddEmbeddedComputation(BuildDependentBodyComputation2());
 
-  op_tree.op(1).opcode = HloOpcode::kCopy;
-  op_tree.op(1).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(1).op(0).op(0).opcode = HloOpcode::kTuple;
-  op_tree.op(1).op(0).op(0).instruction = old_init;
+  auto builder = HloComputation::Builder(TestName() + ".While");
 
-  op_tree.Check(new_init);
+  auto iter_param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, induction_variable_shape_, "iter"));
+  auto data_param = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, data_shape_, "data"));
+
+  // Loop init tuple contains two identical parameter buffers.
+  auto loop_init = builder.AddInstruction(
+      HloInstruction::CreateTuple({iter_param, data_param, data_param}));
+
+  const Shape& loop_state_shape = ShapeUtil::MakeTupleShape(
+      {induction_variable_shape_, data_shape_, data_shape_});
+
+  // Two while loops share the same loop init tuple.
+  auto while_hlo1 = builder.AddInstruction(HloInstruction::CreateWhile(
+      loop_state_shape, condition1, body1, loop_init));
+  auto while_hlo2 = builder.AddInstruction(HloInstruction::CreateWhile(
+      loop_state_shape, condition2, body2, loop_init));
+
+  module_->AddEntryComputation(builder.Build());
+
+  auto points_to_analysis =
+      TuplePointsToAnalysis::Run(module_.get()).ConsumeValueOrDie();
+
+  // Assert that the init tuples are non-distinct before copy insertion.
+ ASSERT_FALSE( + points_to_analysis->GetPointsToSet(while_hlo1->operand(0)).IsDistinct()); + ASSERT_FALSE( + points_to_analysis->GetPointsToSet(while_hlo2->operand(0)).IsDistinct()); + + auto old_init1 = while_hlo1->operand(0); + auto old_init2 = while_hlo2->operand(0); + + InsertCopies(module_.get()); + + EXPECT_THAT(while_hlo1->operand(0), + op::Tuple(op::Copy(old_init1->operand(0)), + op::Copy(old_init1->operand(1)), + op::Copy(old_init1->operand(2)))); + + EXPECT_THAT(while_hlo2->operand(0), + op::Tuple(op::Copy(old_init2->operand(0)), + op::Copy(old_init2->operand(1)), + op::Copy(old_init2->operand(2)))); + + // Verifies the init tuples after copy insertion is distinct. + points_to_analysis = + TuplePointsToAnalysis::Run(module_.get()).ConsumeValueOrDie(); + const auto& points_to1 = + points_to_analysis->GetPointsToSet(while_hlo1->operand(0)); + EXPECT_TRUE(points_to1.IsDistinct()); + + const auto& points_to2 = + points_to_analysis->GetPointsToSet(while_hlo2->operand(0)); + EXPECT_TRUE(points_to2.IsDistinct()); } } // namespace } // namespace xla + +int main(int argc, char** argv) { + return xla::ParseDebugOptionsFlagsAndRunTests(argc, argv); +} diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD index 3d2df5a459b..51ecbccd494 100644 --- a/tensorflow/compiler/xla/service/cpu/BUILD +++ b/tensorflow/compiler/xla/service/cpu/BUILD @@ -53,21 +53,24 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags", - "//tensorflow/compiler/xla/port:initialize", "//tensorflow/compiler/xla/service:algebraic_simplifier", "//tensorflow/compiler/xla/service:buffer_assignment", "//tensorflow/compiler/xla/service:buffer_liveness", "//tensorflow/compiler/xla/service:compiler", "//tensorflow/compiler/xla/service:copy_insertion", "//tensorflow/compiler/xla/service:executable", + "//tensorflow/compiler/xla/service:flatten_call_graph", "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_constant_folding", "//tensorflow/compiler/xla/service:hlo_cse", "//tensorflow/compiler/xla/service:hlo_dce", - "//tensorflow/compiler/xla/service:hlo_module_config", "//tensorflow/compiler/xla/service:hlo_ordering", "//tensorflow/compiler/xla/service:hlo_pass", "//tensorflow/compiler/xla/service:hlo_pass_pipeline", + "//tensorflow/compiler/xla/service:hlo_proto", + "//tensorflow/compiler/xla/service:hlo_proto_util", "//tensorflow/compiler/xla/service:hlo_subcomputation_unification", + "//tensorflow/compiler/xla/service:hlo_verifier", "//tensorflow/compiler/xla/service:inliner", "//tensorflow/compiler/xla/service:reshape_mover", "//tensorflow/compiler/xla/service:transpose_folding", @@ -95,6 +98,7 @@ cc_library( name = "simple_orc_jit", srcs = ["simple_orc_jit.cc"], hdrs = ["simple_orc_jit.h"], + linkopts = ["-ldl"], deps = [ ":compiler_functor", ":cpu_runtime", @@ -135,7 +139,6 @@ cc_library( "//tensorflow/compiler/xla/service:executable", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_execution_profile", - "//tensorflow/compiler/xla/service:hlo_module_config", "//tensorflow/compiler/xla/service:logical_buffer", "//tensorflow/compiler/xla/service:shaped_buffer", "//tensorflow/compiler/xla/service:tuple_points_to_analysis", @@ -163,7 +166,6 @@ cc_library( "//tensorflow/compiler/xla/service:executable", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_execution_profile", - 
"//tensorflow/compiler/xla/service:hlo_module_config", "//tensorflow/compiler/xla/service:logical_buffer", "//tensorflow/compiler/xla/service:shaped_buffer", "//tensorflow/core:lib", @@ -412,6 +414,7 @@ cc_test( cc_test( name = "infeed_manager_test", + size = "small", srcs = ["infeed_manager_test.cc"], deps = [ ":cpu_runtime", @@ -504,6 +507,7 @@ cc_library( cc_test( name = "conv_canonicalization_test", + size = "small", srcs = ["conv_canonicalization_test.cc"], deps = [ ":conv_canonicalization", @@ -511,7 +515,6 @@ cc_test( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/core:test_main", ], ) diff --git a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc index 89b3302bca0..8ebf9ab110d 100644 --- a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc +++ b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc @@ -81,12 +81,17 @@ operator()(llvm::Module& module) const { // Run optimization passes on module. function_passes.doInitialization(); + + CHECK(!llvm::verifyModule(module, &llvm::dbgs())); + for (auto func = module.begin(); func != module.end(); ++func) { function_passes.run(*func); } function_passes.doFinalization(); module_passes.run(module); + CHECK(!llvm::verifyModule(module, &llvm::dbgs())); + // Buffer for holding machine code prior to constructing the ObjectFile. llvm::SmallVector stream_buffer; llvm::raw_svector_ostream ostream(stream_buffer); @@ -192,8 +197,6 @@ void CompilerFunctor::AddOptimizationPasses( module_passes->add(createTargetTransformInfoWrapperPass( target_machine_->getTargetIRAnalysis())); - module_passes->add(llvm::createVerifierPass()); - llvm::PassManagerBuilder builder; builder.OptLevel = opt_level_; builder.SizeLevel = 0; @@ -212,8 +215,6 @@ void CompilerFunctor::AddOptimizationPasses( builder.populateFunctionPassManager(*function_passes); builder.populateModulePassManager(*module_passes); - - module_passes->add(llvm::createVerifierPass()); } } // namespace cpu diff --git a/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc b/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc index d18141af83e..f5ad431277d 100644 --- a/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc +++ b/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc @@ -20,6 +20,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/util.h" @@ -28,6 +29,8 @@ limitations under the License. namespace xla { namespace cpu { +using ::testing::ElementsAre; + class ConvCanonicalizationTest : public HloTestBase { public: ConvCanonicalizationTest() { @@ -78,7 +81,7 @@ TEST_F(ConvCanonicalizationTest, NonCanonicalToCanonical) { F32, {kOutputFeatureCount, kBatchSize, output_size, output_size}), input, kernel, conv_window_, dnums)); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); @@ -96,17 +99,14 @@ TEST_F(ConvCanonicalizationTest, NonCanonicalToCanonical) { // The input is in CNHW order. input_reshape should produce // NHWC for the convolution to hit the Eigen fast path. 
- EXPECT_TRUE(ContainersEqual(input_reshape->dimensions(), - std::vector({1, 2, 3, 0}))); + EXPECT_THAT(input_reshape->dimensions(), ElementsAre(1, 2, 3, 0)); // The kernel is in OIHW order. kernel_reshape should produce // HWIO for the convolution to hit the Eigen fast path. - EXPECT_TRUE(ContainersEqual(kernel_reshape->dimensions(), - std::vector({2, 3, 1, 0}))); + EXPECT_THAT(kernel_reshape->dimensions(), ElementsAre(2, 3, 1, 0)); // The output of the canonical convolution is in NHWC order (the same as // input_reshape's order). output_reshape should restore that order to the // order of the computation root (CNHW). - EXPECT_TRUE(ContainersEqual(output_reshape->dimensions(), - std::vector({3, 0, 1, 2}))); + EXPECT_THAT(output_reshape->dimensions(), ElementsAre(3, 0, 1, 2)); } TEST_F(ConvCanonicalizationTest, CanonicalStaysTheSame) { @@ -135,7 +135,7 @@ TEST_F(ConvCanonicalizationTest, CanonicalStaysTheSame) { F32, {kBatchSize, output_size, output_size, kOutputFeatureCount}), input, kernel, conv_window_, dnums)); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); ConvCanonicalization conv_canonicalization; @@ -144,3 +144,7 @@ TEST_F(ConvCanonicalizationTest, CanonicalStaysTheSame) { } // namespace cpu } // namespace xla + +int main(int argc, char** argv) { + return xla::ParseDebugOptionsFlagsAndRunTests(argc, argv); +} diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc index 712c180f95f..34b99f2440b 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include #include +#include // NOLINT(build/c++11): only using std::call_once, not mutex. #include #include #include @@ -39,7 +40,6 @@ limitations under the License. #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/map_util.h" -#include "tensorflow/compiler/xla/port/initialize.h" #include "tensorflow/compiler/xla/protobuf_util.h" #include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/algebraic_simplifier.h" @@ -58,7 +58,10 @@ limitations under the License. #include "tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h" #include "tensorflow/compiler/xla/service/cpu/simple_orc_jit.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" +#include "tensorflow/compiler/xla/service/flatten_call_graph.h" +#include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_constant_folding.h" #include "tensorflow/compiler/xla/service/hlo_cse.h" #include "tensorflow/compiler/xla/service/hlo_dce.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -66,7 +69,9 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_ordering.h" #include "tensorflow/compiler/xla/service/hlo_pass_fix.h" #include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h" +#include "tensorflow/compiler/xla/service/hlo_proto_util.h" #include "tensorflow/compiler/xla/service/hlo_subcomputation_unification.h" +#include "tensorflow/compiler/xla/service/hlo_verifier.h" #include "tensorflow/compiler/xla/service/inliner.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" #include "tensorflow/compiler/xla/service/reshape_mover.h" @@ -140,22 +145,54 @@ CpuCompiler::CpuCompiler() { LLVMInitializePowerPCTargetMC(); LLVMInitializePowerPCAsmPrinter(); LLVMInitializePowerPCDisassembler(); +} - // LLVM command-line flags are global, so set them during initialization. - legacy_flags::CpuCompilerFlags* flags = legacy_flags::GetCpuCompilerFlags(); - if (!flags->xla_cpu_llvm_cl_opts.empty()) { - std::vector opts = - tensorflow::str_util::Split(flags->xla_cpu_llvm_cl_opts, ','); +namespace { + +const char* kXlaParallelCpuOption = "xla_cpu_parallel"; + +// LLVM makes certain options configurable only through its command-line +// options; it provide the ParseCommandLineOptions function that lets us set +// flags at runtime. However, since these flags are global we want to avoid +// multiple invocations of the LLVM compilation pipeline with a different set of +// flags. Therefore, we only pass command-line flags to LLVM once, before the +// first module is compiled. +std::once_flag llvm_command_line_options_initialized; + +void InitializeLLVMCommandLineOptions(const HloModuleConfig& config) { + auto options = config.debug_options().xla_backend_extra_options(); + if (!options.empty()) { + std::vector fake_argv_storage; + fake_argv_storage.push_back(""); + for (const auto& it : options) { + // Skip options the XLA backend itself consumes. + if (it.first != kXlaParallelCpuOption) { + if (it.second.empty()) { + fake_argv_storage.push_back(it.first); + } else { + fake_argv_storage.push_back(it.first + "=" + it.second); + } + } + } + + VLOG(2) << "Passing argv to LLVM:"; std::vector fake_argv; - fake_argv.push_back(""); - for (const string& opt : opts) { - fake_argv.push_back(opt.c_str()); + for (const auto& s : fake_argv_storage) { + fake_argv.push_back(s.c_str()); + VLOG(2) << s; } llvm::cl::ParseCommandLineOptions(fake_argv.size(), &fake_argv[0]); } } -namespace { +// Helps determine whether the parallel CPU backend was requested in the options +// of this module configuration. +bool CpuParallelBackendRequested(const HloModuleConfig& config) { + const auto& extra_options_map = + config.debug_options().xla_backend_extra_options(); + return extra_options_map.count(kXlaParallelCpuOption) > 0; +} + // This visitor records which HLO instructions should have profiling information // recorded. class CollectProfileCandidates : public DfsHloVisitorWithDefault { @@ -190,16 +227,16 @@ class CollectProfileCandidates : public DfsHloVisitorWithDefault { } // It is important to recurse for "while" or else we risk overly coarse // profiling information. 
- Status HandleWhile(HloInstruction* xla_while, HloInstruction* /*init*/, - HloComputation* condition, HloComputation* body) override { + Status HandleWhile(HloInstruction* xla_while) override { TF_RETURN_IF_ERROR(DefaultAction(xla_while)); CollectProfileCandidates candidates_for_condition(hlo_to_profile_idx_); - TF_RETURN_IF_ERROR( - condition->root_instruction()->Accept(&candidates_for_condition)); + TF_RETURN_IF_ERROR(xla_while->while_condition()->root_instruction()->Accept( + &candidates_for_condition)); CollectProfileCandidates candidates_for_body(hlo_to_profile_idx_); - TF_RETURN_IF_ERROR(body->root_instruction()->Accept(&candidates_for_body)); + TF_RETURN_IF_ERROR(xla_while->while_body()->root_instruction()->Accept( + &candidates_for_body)); return Status::OK(); } @@ -208,64 +245,86 @@ class CollectProfileCandidates : public DfsHloVisitorWithDefault { }; } // namespace -Status CpuCompiler::RunHloPasses(HloModule* hlo_module, - HloModuleConfig* module_config, - HloDumper dump_hlo) { +Status CpuCompiler::RunHloPasses(HloModule* module, HloDumper dump_hlo) { // Optimization pipeline. HloPassPipeline pipeline("CPU", dump_hlo); - pipeline.AddPass(); + pipeline.AddInvariantChecker(); + + // TODO(b/35786417): Re-enable inliner pass after fixing the bug and deciding + // where we will take this pass in future. + // pipeline.AddPass(); + pipeline.AddPass(); { auto& pass = pipeline.AddPass>("simplification", dump_hlo); pass.AddPass( /*is_layout_sensitive=*/false, - [](const Shape&, const Shape&) { return false; }); + [](const Shape&, const Shape&) { return false; }, + /*enable_dot_simplification=*/false); pass.AddPass(); + pass.AddPass(); } - pipeline.AddPass(PotentiallyImplementedAsEigenDot); - pipeline.AddPass(); + pipeline.AddPass( + [](const HloInstruction& dot, + const TransposeFolding::OperandIndices& candidate_operands) { + return PotentiallyImplementedAsEigenDot(dot) + ? candidate_operands + : TransposeFolding::OperandIndices{}; + }, + TransposeFolding::NeverFoldTranspose); pipeline.AddPass(/*is_layout_sensitive=*/false); pipeline.AddPass(); pipeline.AddPass( - module_config->mutable_entry_computation_layout()); + module->mutable_entry_computation_layout()); // The LayoutAssignment pass may leave behind kCopy instructions which are // duplicate or NOPs, so remove them with algebraic simplification and CSE. pipeline.AddPass>( /*is_layout_sensitive=*/true, - [](const Shape&, const Shape&) { return true; }); + [](const Shape&, const Shape&) { return true; }, + /*enable_dot_simplification=*/false); pipeline.AddPass(/*is_layout_sensitive=*/true); // Outline ops in the entry computation into calls to subcomputations. - legacy_flags::CpuCompilerFlags* flags = legacy_flags::GetCpuCompilerFlags(); - if (flags->xla_cpu_parallel) { + if (CpuParallelBackendRequested(module->config())) { pipeline.AddPass(); } - // Copy insertion should be performed immediately before IR emission to - // avoid inserting unnecessary copies (later pass adds an instruction which - // materializes the value) or missing a necessary copy (later pass removes - // an instruction which materializes a value). + // Copy insertion should be performed immediately before IR emission to avoid + // inserting unnecessary copies (later pass adds an instruction which + // materializes the value) or missing a necessary copy (later pass removes an + // instruction which materializes a value). DCE must be run immediately before + // (and sometime after) copy insertion, to avoid dead code from interfering + // with the rewrites. 
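The `TransposeFolding` pass above now takes a callback: the pass proposes the operand indices it could fold and the backend returns the subset it actually wants folded, with an empty result meaning "fold nothing". A standalone sketch of that contract; `Instruction`, `OperandIndices`, and the Eigen test are stand-ins for the XLA types:

```c++
#include <cstdint>
#include <functional>
#include <vector>

struct Instruction { bool lowers_to_eigen_dot = false; };
using OperandIndices = std::vector<int64_t>;
using FoldFilter =
    std::function<OperandIndices(const Instruction&, const OperandIndices&)>;

// Mirrors the lambda above: keep every candidate operand only when the dot
// will hit the Eigen fast path, where folding the transpose pays off.
OperandIndices KeepIfEigenDot(const Instruction& dot,
                              const OperandIndices& candidates) {
  return dot.lowers_to_eigen_dot ? candidates : OperandIndices{};
}
```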
+ pipeline.AddPass(); pipeline.AddPass(); - if (flags->xla_cpu_parallel) { + if (CpuParallelBackendRequested(module->config())) { // Re-run the outlining, in case any copies were inserted into the entry // computation. pipeline.AddPass(); } pipeline.AddPass(); - return pipeline.Run(hlo_module).status(); + pipeline.AddPass(); + return pipeline.Run(module).status(); } namespace { +// Align buffers to 16-byte boundaries. +constexpr int64 kMemoryAlignment = 16; + llvm::TargetOptions CompilerTargetOptions( - const HloModuleConfig& execution_options) { + const HloModuleConfig& module_config) { llvm::TargetOptions target_options; - llvm_ir::SetTargetOptions(execution_options, &target_options); + llvm_ir::SetTargetOptions( + /*fast_math_enabled=*/module_config.debug_options() + .xla_enable_fast_math(), + &target_options); return target_options; } -llvm::CodeGenOpt::Level CodeGenOptLevel() { - legacy_flags::CpuCompilerFlags* flags = legacy_flags::GetCpuCompilerFlags(); - switch (flags->xla_cpu_llvm_opt_level) { +llvm::CodeGenOpt::Level CodeGenOptLevel(const HloModuleConfig& module_config) { + VLOG(2) << "backend_optimization_level: " + << module_config.debug_options().xla_backend_optimization_level(); + switch (module_config.debug_options().xla_backend_optimization_level()) { case 1: return llvm::CodeGenOpt::Less; case 2: @@ -282,28 +341,26 @@ llvm::CodeGenOpt::Level CodeGenOptLevel() { } // namespace StatusOr> CpuCompiler::Compile( - std::unique_ptr hlo_module, - std::unique_ptr module_config, HloDumper dump_hlo, + std::unique_ptr module, HloDumper dump_hlo, se::StreamExecutor* stream_exec) { TF_RET_CHECK(stream_exec != nullptr); + std::call_once(llvm_command_line_options_initialized, + &InitializeLLVMCommandLineOptions, module->config()); // Compile must be thread-safe so create a new LLVM context for the module. auto llvm_context = MakeUnique(); auto llvm_module = MakeUnique("__compute_module", *llvm_context); - auto jit = MakeUnique(CompilerTargetOptions(*module_config), - CodeGenOptLevel()); + auto jit = MakeUnique(CompilerTargetOptions(module->config()), + CodeGenOptLevel(module->config())); llvm_module->setDataLayout(jit->data_layout()); llvm_module->setTargetTriple(jit->target_triple().getTriple()); - const llvm::DataLayout& data_layout = llvm_module->getDataLayout(); - int64 pointer_size = data_layout.getPointerSize(); - TF_RETURN_IF_ERROR( - RunHloPasses(hlo_module.get(), module_config.get(), dump_hlo)); + TF_RETURN_IF_ERROR(RunHloPasses(module.get(), dump_hlo)); - HloComputation* computation = hlo_module->entry_computation(); + HloComputation* computation = module->entry_computation(); std::unordered_map hlo_to_profile_idx; - if (module_config->hlo_profiling_enabled()) { + if (module->config().hlo_profiling_enabled()) { TF_ASSIGN_OR_RETURN( hlo_to_profile_idx, CollectProfileCandidates::GetCandidatesForComputation(computation)); @@ -311,7 +368,7 @@ StatusOr> CpuCompiler::Compile( std::unique_ptr cpu_executable; legacy_flags::CpuCompilerFlags* flags = legacy_flags::GetCpuCompilerFlags(); - if (flags->xla_cpu_parallel) { + if (CpuParallelBackendRequested(module->config())) { // Run buffer analysis on the HLO graph. This analysis figures out which // temporary buffers are required to run the computation. // DependencyHloOrdering is used for the parallel emitter because the order @@ -320,9 +377,15 @@ StatusOr> CpuCompiler::Compile( // uses data dependencies for determining order. 
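`CodeGenOptLevel` above now reads the backend optimization level from the module's debug options instead of a global legacy flag. A standalone sketch of the mapping; only cases 1 and 2 are visible in this hunk, so the `Aggressive` case and the `None` fallback are assumptions patterned on common LLVM usage, and `CodeGenOpt` stands in for `llvm::CodeGenOpt::Level`:

```c++
#include <cstdint>

enum class CodeGenOpt { None, Less, Default, Aggressive };

CodeGenOpt CodeGenOptLevelFor(int64_t backend_optimization_level) {
  switch (backend_optimization_level) {
    case 1:
      return CodeGenOpt::Less;
    case 2:
      return CodeGenOpt::Default;
    case 3:
      return CodeGenOpt::Aggressive;  // Assumed; not shown in the hunk.
    default:
      return CodeGenOpt::None;        // Assumed safe fallback.
  }
}
```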
TF_ASSIGN_OR_RETURN( std::unique_ptr assignment, - BufferAssigner::Run(hlo_module.get(), - MakeUnique(hlo_module.get()), - pointer_size)); + BufferAssigner::Run(module.get(), + MakeUnique(module.get()), + BufferSizeBytesFunction(), kMemoryAlignment)); + + if (!flags->xla_cpu_dump_debug_json_to.empty()) { + HloProto proto = MakeHloProto(*module, *assignment); + TF_RETURN_IF_ERROR(protobuf_util::DumpJsonToDirectory( + proto, flags->xla_cpu_dump_debug_json_to, module->name())); + } // If we are using the parallel CPU backend, we need to create map from // HloInstruction to the corresponding generated function name. @@ -338,7 +401,7 @@ StatusOr> CpuCompiler::Compile( // Copy the constant out of the ProtocolBuffer so that we can give it a // higher alignment. const void* data = LiteralUtil::InternalData(instruction->literal()); - int64 size = llvm_ir::ByteSizeOf(instruction->shape(), data_layout); + int64 size = CpuExecutable::ShapeSizeBytes(instruction->shape()); auto iter = aligned_constants.emplace( instruction, MakeUnique(size)); CHECK_EQ(iter.second, true); @@ -348,13 +411,14 @@ StatusOr> CpuCompiler::Compile( } // The parallel preparation should have ensured that the top-level // computation consists solely of Call instructions. - TF_RET_CHECK(instruction->opcode() == HloOpcode::kCall); + TF_RET_CHECK(instruction->opcode() == HloOpcode::kCall) + << module->ToString(); HloComputation* to_apply = instruction->to_apply(); parallel_computations.emplace(to_apply, instruction); } - IrEmitter ir_emitter(*hlo_module, *module_config, *assignment, - llvm_module.get(), &hlo_to_profile_idx); + IrEmitter ir_emitter(*module, *assignment, llvm_module.get(), + &hlo_to_profile_idx); std::unique_ptr> function_names( new std::map()); for (auto embedded_computation : @@ -369,7 +433,8 @@ StatusOr> CpuCompiler::Compile( llvm::Function * ir_function, ir_emitter.EmitComputation( embedded_computation, embedded_computation->name(), - /*is_entry_computation=*/computation_is_parallel)); + /*is_entry_computation=*/computation_is_parallel, + /*instruction_order=*/nullptr)); // If this computation is parallel, remember it in the function name map. // This way we know what function to execute when we try to run code for // the Call instruction. @@ -388,9 +453,9 @@ StatusOr> CpuCompiler::Compile( // JIT compile the LLVM IR module to in-memory machine code. jit->AddModule(std::move(llvm_module)); cpu_executable.reset(new ParallelCpuExecutable( - std::move(jit), std::move(assignment), std::move(hlo_module), - std::move(module_config), std::move(function_names), - std::move(hlo_to_profile_idx), std::move(aligned_constants))); + std::move(jit), std::move(assignment), std::move(module), + std::move(function_names), std::move(hlo_to_profile_idx), + std::move(aligned_constants))); if (flags->xla_cpu_embed_ir) { static_cast(*cpu_executable) @@ -402,26 +467,29 @@ StatusOr> CpuCompiler::Compile( // and reduced memory usage (as compared to using DependencyHloOrdering). TF_ASSIGN_OR_RETURN( SequentialHloOrdering::HloModuleSequence module_sequence, - CreateMemoryMinimizingSequence( - *hlo_module, [&](const LogicalBuffer& buffer) { - return llvm_ir::ByteSizeOf(buffer.shape(), data_layout); - })); + CreateMemoryMinimizingSequence(*module, BufferSizeBytesFunction())); // Run buffer analysis on the HLO graph. This analysis figures out which // temporary buffers are required to run the computation. 
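Earlier in this hunk, each constant is copied out of its ProtocolBuffer so it can be given a stronger alignment; `kMemoryAlignment` is 16 bytes in this patch. A self-contained sketch of that copy, using `std::aligned_alloc` purely as an illustrative mechanism, not the allocator XLA actually uses:

```c++
#include <cstddef>
#include <cstdlib>
#include <cstring>
#include <memory>

constexpr std::size_t kMemoryAlignment = 16;

struct FreeDeleter {
  void operator()(void* p) const { std::free(p); }
};

// Data inside a protobuf has no alignment guarantee, so copy it into a
// buffer aligned to the boundary the emitted code assumes.
std::unique_ptr<unsigned char, FreeDeleter> CopyAligned(const void* data,
                                                        std::size_t size) {
  // std::aligned_alloc requires the size to be a multiple of the alignment.
  std::size_t padded = (size + kMemoryAlignment - 1) & ~(kMemoryAlignment - 1);
  auto* buffer = static_cast<unsigned char*>(
      std::aligned_alloc(kMemoryAlignment, padded));
  std::memcpy(buffer, data, size);
  return std::unique_ptr<unsigned char, FreeDeleter>(buffer);
}
```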
TF_ASSIGN_OR_RETURN( std::unique_ptr assignment, - BufferAssigner::Run(hlo_module.get(), - MakeUnique(hlo_module.get(), - module_sequence), - pointer_size)); + BufferAssigner::Run( + module.get(), + MakeUnique(module.get(), module_sequence), + BufferSizeBytesFunction(), kMemoryAlignment)); + + if (!flags->xla_cpu_dump_debug_json_to.empty()) { + HloProto proto = MakeHloProto(*module, *assignment); + TF_RETURN_IF_ERROR(protobuf_util::DumpJsonToDirectory( + proto, flags->xla_cpu_dump_debug_json_to, module->name())); + } // Each computation is a single function. Emit all embedded computations // before the entry computation. The order of computations returned from // GetEmbeddedComputations guarantees that a called computation occurs // before a caller computation. - IrEmitter ir_emitter(*hlo_module, *module_config, *assignment, - llvm_module.get(), &hlo_to_profile_idx); + IrEmitter ir_emitter(*module, *assignment, llvm_module.get(), + &hlo_to_profile_idx); for (auto embedded_computation : computation->MakeEmbeddedComputationsList()) { TF_RETURN_IF_ERROR( @@ -448,10 +516,9 @@ StatusOr> CpuCompiler::Compile( // JIT compile the LLVM IR module to in-memory machine code. jit->AddModule(std::move(llvm_module)); - cpu_executable.reset( - new CpuExecutable(std::move(jit), std::move(assignment), - std::move(hlo_module), std::move(module_config), - function_name, std::move(hlo_to_profile_idx))); + cpu_executable.reset(new CpuExecutable( + std::move(jit), std::move(assignment), std::move(module), function_name, + std::move(hlo_to_profile_idx))); if (flags->xla_cpu_embed_ir) { static_cast(*cpu_executable) @@ -463,30 +530,31 @@ StatusOr> CpuCompiler::Compile( } StatusOr>> CpuCompiler::Compile( - std::vector> hlo_modules, - std::vector> module_configs, - HloDumper dump_hlos, std::vector stream_execs) { + std::vector> modules, HloDumper dump_hlos, + std::vector stream_execs) { return Unimplemented( "Compilation of multiple HLO modules is not yet supported on CPU."); } StatusOr>> -CpuCompiler::CompileAheadOfTime( - std::vector> hlo_modules, - std::vector> module_configs, - HloDumper dump_hlo, const AotCompilationOptions& aot_options) { - TF_RET_CHECK(hlo_modules.size() == module_configs.size()); - TF_RET_CHECK(!hlo_modules.empty()); +CpuCompiler::CompileAheadOfTime(std::vector> modules, + HloDumper dump_hlo, + const AotCompilationOptions& aot_options) { + TF_RET_CHECK(!modules.empty()); + std::call_once(llvm_command_line_options_initialized, + &InitializeLLVMCommandLineOptions, modules[0]->config()); // We can pass just one llvm::TargetOptions when we compile the LLVM module, // so we bail if the configs have conflicting flags. At the moment, the only // flag that needs to be consistent is fast-math. 
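The check that follows enforces the invariant spelled out in the comment: one `llvm::TargetOptions` serves every module in an ahead-of-time batch, so any option feeding it, here fast-math, must agree across all of them. A minimal sketch of the same validation, with `ModuleConfig` as a stand-in:

```c++
#include <vector>

struct ModuleConfig { bool fast_math_enabled = false; };

// True iff every module in the batch agrees with the first one; the real
// code returns InvalidArgument on the first mismatch instead.
bool AllFastMathFlagsAgree(const std::vector<ModuleConfig>& configs) {
  if (configs.empty()) return true;
  const bool expected = configs.front().fast_math_enabled;
  for (const ModuleConfig& config : configs) {
    if (config.fast_math_enabled != expected) return false;
  }
  return true;
}
```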
- bool fast_math_disabled = module_configs[0]->fast_math_disabled(); - for (const auto& module_config : module_configs) { - if (module_config->fast_math_disabled() != fast_math_disabled) { + const bool fast_math_enabled = + modules[0]->config().debug_options().xla_enable_fast_math(); + for (const auto& module : modules) { + if (module->config().debug_options().xla_enable_fast_math() != + fast_math_enabled) { return InvalidArgument( "All HLO module configs must have the same value for " - "fast_math_disabled."); + "xla_enable_fast_math."); } } @@ -505,9 +573,9 @@ CpuCompiler::CompileAheadOfTime( error.c_str()); } - llvm::Reloc::Model reloc_model; - llvm::PICLevel::Level pic_level; - llvm::PIELevel::Level pie_level; + llvm::Reloc::Model reloc_model = llvm::Reloc::Static; + llvm::PICLevel::Level pic_level = llvm::PICLevel::NotPIC; + llvm::PIELevel::Level pie_level = llvm::PIELevel::Default; switch (options.relocation_model()) { case CpuAotCompilationOptions::RelocationModel::Static: reloc_model = llvm::Reloc::Static; @@ -537,11 +605,11 @@ CpuCompiler::CompileAheadOfTime( } llvm::StringRef cpu_name = llvm_ir::AsStringRef(options.cpu_name()); llvm::StringRef features = llvm_ir::AsStringRef(options.features()); - llvm::CodeGenOpt::Level opt_level = CodeGenOptLevel(); + llvm::CodeGenOpt::Level opt_level = CodeGenOptLevel(modules[0]->config()); std::unique_ptr target_machine = WrapUnique(target->createTargetMachine( triple.getTriple(), cpu_name, features, - CompilerTargetOptions(*module_configs[0]), reloc_model, + CompilerTargetOptions(modules[0]->config()), reloc_model, llvm::CodeModel::Default, opt_level)); // Compile must be thread-safe so create a new LLVM context for the module. @@ -555,34 +623,35 @@ CpuCompiler::CompileAheadOfTime( if (pie_level != llvm::PIELevel::Default) { llvm_module.setPIELevel(pie_level); } - const llvm::DataLayout& data_layout = llvm_module.getDataLayout(); - int64 pointer_size = data_layout.getPointerSize(); std::vector> results; - for (int i = 0; i < hlo_modules.size(); ++i) { - HloModule* hlo_module = hlo_modules[i].get(); - HloModuleConfig* module_config = module_configs[i].get(); + for (size_t i = 0; i < modules.size(); ++i) { + HloModule* module = modules[i].get(); - TF_RETURN_IF_ERROR(RunHloPasses(hlo_module, module_config, dump_hlo)); + TF_RETURN_IF_ERROR(RunHloPasses(module, dump_hlo)); TF_ASSIGN_OR_RETURN( SequentialHloOrdering::HloModuleSequence module_sequence, - CreateMemoryMinimizingSequence( - *hlo_module, [&](const LogicalBuffer& buffer) { - return llvm_ir::ByteSizeOf(buffer.shape(), data_layout); - })); + CreateMemoryMinimizingSequence(*module, BufferSizeBytesFunction())); // Run buffer analysis on the HLO graph. This analysis figures out which // temporary buffers are required to run the computation. 
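Earlier in this hunk, `reloc_model`, `pic_level`, and `pie_level` gain explicit initializers so that no path through the following `switch` can read an uninitialized value. A standalone sketch of the pattern; the enums are stand-ins for the LLVM ones:

```c++
enum class RelocModel { Static, PIC };
enum class OptionsModel { Static, SmallPic, BigPic };

RelocModel PickRelocModel(OptionsModel option) {
  // Default first, as in the patch: even if a new enumerator is added to
  // OptionsModel and this switch is not updated, the result is defined.
  RelocModel reloc = RelocModel::Static;
  switch (option) {
    case OptionsModel::Static:
      reloc = RelocModel::Static;
      break;
    case OptionsModel::SmallPic:
    case OptionsModel::BigPic:
      reloc = RelocModel::PIC;
      break;
  }
  return reloc;
}
```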
TF_ASSIGN_OR_RETURN( std::unique_ptr assignment, - BufferAssigner::Run(hlo_module, MakeUnique( - hlo_module, module_sequence), - pointer_size)); + BufferAssigner::Run( + module, MakeUnique(module, module_sequence), + BufferSizeBytesFunction(), kMemoryAlignment)); - IrEmitter ir_emitter(*hlo_module, *module_config, *assignment, &llvm_module, + legacy_flags::CpuCompilerFlags* flags = legacy_flags::GetCpuCompilerFlags(); + if (!flags->xla_cpu_dump_debug_json_to.empty()) { + HloProto proto = MakeHloProto(*module, *assignment); + TF_RETURN_IF_ERROR(protobuf_util::DumpJsonToDirectory( + proto, flags->xla_cpu_dump_debug_json_to, module->name())); + } + + IrEmitter ir_emitter(*module, *assignment, &llvm_module, /*hlo_to_profile_idx=*/nullptr); - HloComputation* computation = hlo_module->entry_computation(); + HloComputation* computation = module->entry_computation(); for (auto embedded_computation : computation->MakeEmbeddedComputationsList()) { TF_RETURN_IF_ERROR( @@ -597,7 +666,8 @@ CpuCompiler::CompileAheadOfTime( TF_ASSIGN_OR_RETURN( llvm::Function * entry_function, ir_emitter.EmitComputation(computation, entry_point_name, - /*is_entry_computation=*/true)); + /*is_entry_computation=*/true, + &module_sequence.at(computation))); entry_function->setName(llvm_ir::AsStringRef(entry_point_name)); @@ -627,12 +697,12 @@ CpuCompiler::CompileAheadOfTime( buffer_sizes.push_back(allocation.size()); } - TF_ASSIGN_OR_RETURN(const BufferAllocation* result_allocation, - assignment->GetUniqueTopLevelOutputAllocation()); + TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice, + assignment->GetUniqueTopLevelOutputSlice()); results.emplace_back(MakeUnique( std::move(object_file_data), std::move(buffer_sizes), - result_allocation->index())); + result_slice.index())); } return std::move(results); } @@ -641,11 +711,17 @@ se::Platform::Id CpuCompiler::PlatformId() const { return se::host::kHostPlatformId; } +HloCostAnalysis::ShapeSizeFunction CpuCompiler::ShapeSizeBytesFunction() const { + return CpuExecutable::ShapeSizeBytes; +} + } // namespace cpu } // namespace xla -REGISTER_MODULE_INITIALIZER(cpu_compiler, { +static bool InitModule() { xla::Compiler::RegisterCompilerFactory(se::host::kHostPlatformId, []() { return xla::MakeUnique(); }); -}); + return true; +} +static bool module_initialized = InitModule(); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h index d7d77ce58a6..29fa4eac61b 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h @@ -21,7 +21,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/compiler.h" #include "tensorflow/compiler/xla/service/executable.h" #include "tensorflow/compiler/xla/service/hlo_module.h" -#include "tensorflow/compiler/xla/service/hlo_module_config.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/macros.h" @@ -33,8 +32,6 @@ namespace cpu { // This class wraps the configurability options that LLVM exposes including: the // target triple, the target cpu and the target features. It also includes the // desired linkage name for the computation entry point. -// Note that the optimization level can be controlled by the -// --xla_cpu_llvm_opt_level flag. class CpuAotCompilationOptions : public AotCompilationOptions { public: // Relocation models available for compilation. 
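At the end of cpu_compiler.cc above, `REGISTER_MODULE_INITIALIZER` is replaced by a plain function invoked from a namespace-scope static initializer. A self-contained sketch of that idiom, where `RegisterCompilerFactory` is a trivial stand-in for the real registry call:

```c++
#include <functional>
#include <iostream>

// A real registry would store the factory keyed by platform id; this
// stand-in just runs it so the sketch is observable.
void RegisterCompilerFactory(std::function<void()> factory) { factory(); }

static bool InitModule() {
  RegisterCompilerFactory([] { std::cout << "cpu compiler registered\n"; });
  return true;
}

// The dynamic initializer of this namespace-scope static runs before main(),
// which is what makes the registration happen without an explicit call.
static bool module_initialized = InitModule();

int main() {}  // By the time main runs, the factory is registered.
```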
@@ -113,32 +110,29 @@ class CpuCompiler : public Compiler { ~CpuCompiler() override {} StatusOr> Compile( - std::unique_ptr hlo_module, - std::unique_ptr module_config, HloDumper dump_hlo, + std::unique_ptr module, HloDumper dump_hlo, perftools::gputools::StreamExecutor* stream_exec) override; StatusOr>> Compile( - std::vector> hlo_module, - std::vector> module_config, - HloDumper dump_hlo, + std::vector> modules, HloDumper dump_hlo, std::vector stream_exec) override; StatusOr>> - CompileAheadOfTime( - std::vector> module, - std::vector> module_config, - HloDumper dump_hlo, const AotCompilationOptions& options) override; + CompileAheadOfTime(std::vector> modules, + HloDumper dump_hlo, + const AotCompilationOptions& options) override; perftools::gputools::Platform::Id PlatformId() const override; + HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override; + private: // Initialize the LLVM target. static void InitializeLLVMTarget(); // Runs the HLO passes which are necessary for both optimizations and // correctness. - Status RunHloPasses(HloModule* hlo_module, HloModuleConfig* module_config, - HloDumper dump_hlo); + Status RunHloPasses(HloModule* hlo_module, HloDumper dump_hlo); TF_DISALLOW_COPY_AND_ASSIGN(CpuCompiler); }; diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc index 727257d4f11..671d6957a39 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc @@ -27,7 +27,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/computation_layout.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_module.h" -#include "tensorflow/compiler/xla/service/hlo_module_config.h" #include "tensorflow/compiler/xla/service/logical_buffer.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" #include "tensorflow/compiler/xla/shape_tree.h" @@ -53,11 +52,9 @@ namespace cpu { CpuExecutable::CpuExecutable( std::unique_ptr jit, std::unique_ptr assignment, - std::unique_ptr hlo_module, - std::unique_ptr module_config, - const string& entry_function_name, + std::unique_ptr hlo_module, const string& entry_function_name, std::unordered_map hlo_to_profile_idx) - : Executable(std::move(hlo_module), std::move(module_config)), + : Executable(std::move(hlo_module), CpuExecutable::ShapeSizeBytes), jit_(std::move(jit)), assignment_(std::move(assignment)), hlo_to_profile_idx_(std::move(hlo_to_profile_idx)) { @@ -135,10 +132,9 @@ Status CpuExecutable::AllocateBuffers( TF_ANNOTATE_MEMORY_IS_INITIALIZED((*buffers)[i].opaque(), buffer_size); } - TF_ASSIGN_OR_RETURN(const BufferAllocation* result_allocation, - assignment_->GetUniqueTopLevelOutputAllocation()); - - VLOG(3) << "result index: " << result_allocation->index(); + TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice, + assignment_->GetUniqueTopLevelOutputSlice()); + VLOG(3) << "result index: " << result_slice.index(); return Status::OK(); } @@ -193,9 +189,9 @@ Status CpuExecutable::ExecuteComputeFunction( for (auto& buffer : buffers) { buffer_pointers.push_back(const_cast(buffer.opaque())); } - TF_ASSIGN_OR_RETURN(const BufferAllocation* result_allocation, - assignment_->GetUniqueTopLevelOutputAllocation()); - void* result_buffer = buffer_pointers[result_allocation->index()]; + TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice, + assignment_->GetUniqueTopLevelOutputSlice()); + void* result_buffer = 
buffer_pointers[result_slice.index()]; if (VLOG_IS_ON(3)) { VLOG(3) << "Executing compute function:"; VLOG(3) << tensorflow::strings::Printf( @@ -231,7 +227,8 @@ Status CpuExecutable::ExecuteComputeFunction( } if (hlo_execution_profile != nullptr) { - hlo_execution_profile->set_total_cycles_executed(profile_counters.back()); + hlo_execution_profile->set_total_cycles_executed( + *module().entry_computation(), profile_counters.back()); for (auto hlo_prof_idx : hlo_to_profile_idx_) { const HloInstruction* hlo = hlo_prof_idx.first; @@ -243,24 +240,24 @@ Status CpuExecutable::ExecuteComputeFunction( } StatusOr CpuExecutable::ExecuteOnStream( - const ExecutableRunOptions* run_options, + const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments, HloExecutionProfile* hlo_execution_profile) { se::Stream* stream = run_options->stream(); DeviceMemoryAllocator* memory_allocator = run_options->allocator(); std::vector buffers(assignment_->Allocations().size()); + TF_RETURN_IF_ERROR(AllocateBuffers( memory_allocator, stream->parent()->device_ordinal(), &buffers)); - - TF_RETURN_IF_ERROR(ExecuteComputeFunction(run_options, arguments, buffers, - hlo_execution_profile)); + TF_RETURN_IF_ERROR(ExecuteComputeFunction( + &run_options->run_options(), arguments, buffers, hlo_execution_profile)); // Mark the buffers that are actually live (used in the output) when the // computation finishes executing. std::unordered_set marked_addresses; - TF_ASSIGN_OR_RETURN(const BufferAllocation* result_allocation, - assignment_->GetUniqueTopLevelOutputAllocation()); - se::DeviceMemoryBase top_level_output = buffers[result_allocation->index()]; + TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice, + assignment_->GetUniqueTopLevelOutputSlice()); + se::DeviceMemoryBase top_level_output = buffers[result_slice.index()]; MarkLiveAddressesInOutput(top_level_output.opaque(), result_shape(), &marked_addresses); @@ -275,10 +272,9 @@ StatusOr CpuExecutable::ExecuteOnStream( // Computation is done - deallocate temp buffers. Keep those marked live // because they are referenced by the output of the computation and are needed // by the service. They will be deallocated by the service. 
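The loop that follows implements the policy described in the comment above: every temporary whose address is not referenced by the output is deallocated, while live buffers are left for the service to free later. A standalone sketch of that mark-and-free pass; `Buffer` and the deallocate callback are stand-ins for `se::DeviceMemoryBase` and the device allocator:

```c++
#include <cstddef>
#include <unordered_set>
#include <vector>

struct Buffer {
  void* opaque = nullptr;
  bool is_null() const { return opaque == nullptr; }
};

void FreeDeadBuffers(std::vector<Buffer>& buffers,
                     const std::unordered_set<const void*>& live_addresses,
                     void (*deallocate)(Buffer*)) {
  for (std::size_t i = 0; i < buffers.size(); ++i) {
    Buffer& buffer = buffers[i];
    // Buffers named in the output stay alive; the service frees them later.
    if (!buffer.is_null() && live_addresses.count(buffer.opaque) == 0) {
      deallocate(&buffer);
    }
  }
}
```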
- for (auto i = 0; i < buffers.size(); ++i) { - auto alloc = buffers[i]; - if (marked_addresses.count(alloc.opaque()) == 0 && - alloc.opaque() != nullptr) { + for (size_t i = 0; i < buffers.size(); ++i) { + se::DeviceMemoryBase alloc = buffers[i]; + if (marked_addresses.count(alloc.opaque()) == 0 && !alloc.is_null()) { VLOG(3) << "CpuExecutable deallocating buffer #" << i << " [" << alloc.opaque() << "]"; TF_RETURN_IF_ERROR(memory_allocator->Deallocate( @@ -290,37 +286,35 @@ StatusOr CpuExecutable::ExecuteOnStream( } StatusOr> CpuExecutable::ExecuteOnStream( - const ExecutableRunOptions* run_options, + const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments, HloExecutionProfile* hlo_execution_profile) { - se::Stream* stream = run_options->stream(); - DeviceMemoryAllocator* memory_allocator = run_options->allocator(); if (GetRootPointsToSet().IsAmbiguous()) { return Unimplemented("Points-to set of root instruction is ambiguous"); } + + se::Stream* stream = run_options->stream(); + DeviceMemoryAllocator* memory_allocator = run_options->allocator(); std::vector buffers(assignment_->Allocations().size()); - TF_ASSIGN_OR_RETURN( - std::unique_ptr result_buffer, - ShapedBuffer::MakeShapedBuffer( - module_config().entry_computation_layout().result_shape(), - stream->parent()->platform(), stream->parent()->device_ordinal())); - + TF_ASSIGN_OR_RETURN(std::unique_ptr result_buffer, + ShapedBuffer::MakeShapedBuffer( + result_shape(), stream->parent()->platform(), + stream->parent()->device_ordinal())); TF_RETURN_IF_ERROR(AllocateBuffers( memory_allocator, stream->parent()->device_ordinal(), &buffers)); - - TF_RETURN_IF_ERROR(ExecuteComputeFunction(run_options, arguments, buffers, - hlo_execution_profile)); + TF_RETURN_IF_ERROR(ExecuteComputeFunction( + &run_options->run_options(), arguments, buffers, hlo_execution_profile)); // Copy DeviceMemoryBase values which contain the array(s) of the result into // the respective location in ShapedBuffer which is returned to the caller. std::vector buffers_in_result(assignment_->Allocations().size(), false); TF_RETURN_IF_ERROR( result_buffer->mutable_shape_index_to_buffer_entry() - ->ForEachMutableElement( + ->ForEachMutableElementWithStatus( [&buffers, &buffers_in_result, &result_buffer, this]( - const ShapeIndex& index, bool is_leaf, size_t* buffer_entry) { - if (is_leaf) { + const ShapeIndex& index, size_t* buffer_entry) { + if (ShapeUtil::IsLeafIndex(result_buffer->shape(), index)) { const std::vector& sources = this->GetRootPointsToSet().element(index); // The points to set is unambiguous so the set should be a @@ -334,24 +328,24 @@ StatusOr> CpuExecutable::ExecuteOnStream( // The source instruction should have a non-parameter buffer // assigned. 
- TF_ASSIGN_OR_RETURN(const BufferAllocation* allocation, - this->assignment_->GetUniqueAllocation( + TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice slice, + this->assignment_->GetUniqueSlice( src, buffer_source->index())); - CHECK(!allocation->is_entry_computation_parameter()); + CHECK(!slice.allocation()->is_entry_computation_parameter()); - CHECK(!buffers[allocation->index()].is_null() || - buffers[allocation->index()].size() == 0); - result_buffer->mutable_buffers()->push_back( - buffers[allocation->index()]); - *buffer_entry = result_buffer->mutable_buffers()->size() - 1; - buffers_in_result[allocation->index()] = true; + const BufferAllocation::Index buffer_index = slice.index(); + const se::DeviceMemoryBase& buffer = buffers[buffer_index]; + CHECK(!buffer.is_null() || buffer.size() == 0); + *buffer_entry = result_buffer->mutable_buffers()->size(); + result_buffer->mutable_buffers()->push_back(buffer); + buffers_in_result[buffer_index] = true; } return Status::OK(); })); // Free all buffers not in the result. - for (auto i = 0; i < buffers.size(); ++i) { - auto alloc = buffers[i]; + for (size_t i = 0; i < buffers.size(); ++i) { + se::DeviceMemoryBase alloc = buffers[i]; if (!buffers_in_result[i] && !alloc.is_null()) { VLOG(3) << "CpuExecutable deallocating buffer #" << i << " [" << alloc.opaque() << "]"; @@ -363,111 +357,23 @@ StatusOr> CpuExecutable::ExecuteOnStream( return std::move(result_buffer); } -Status CpuExecutable::ExecuteOnStream( - const ExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice arguments, - ShapedBuffer* result_buffer, HloExecutionProfile* hlo_execution_profile) { - se::Stream* stream = run_options->stream(); - DeviceMemoryAllocator* memory_allocator = run_options->allocator(); - // Every array element in the result of the computation must be unambiguously - // produced by a single instruction. - // This ensures that the buffers inside result_buffer can be assigned without - // conflict to the respective instructions because there is a one-to-one - // correspondence between hlo instructions and array buffers in the result. - if (GetRootPointsToSet().IsAmbiguous()) { - return Unimplemented( - "Points-to set of root instruction is ambiguous or not distinct"); - } - std::vector buffers(assignment_->Allocations().size()); - DCHECK(ShapeUtil::Compatible(result_buffer->shape(), result_shape())); - - // If two tuple elements point to the same buffer, one of the results in the - // result buffer is considered the canonical location while the other result - // points to it (instead of, say, making a copy of the result). - // buffer_index_to_shape_index maps a buffer index to its canonical location - // in the result buffer. - std::unordered_map - buffer_index_to_shape_index; - - // Copy values from result_buffer to the index in "buffers". These buffers - // will not be allocated in the call to AllocateBuffers. - std::vector buffers_in_result(assignment_->Allocations().size(), false); - TF_RETURN_IF_ERROR( - result_buffer->mutable_shape_index_to_buffer_entry() - ->ForEachMutableElement( - [&buffers, &buffers_in_result, &buffer_index_to_shape_index, - result_buffer, this](const ShapeIndex& index, bool is_leaf, - size_t* buffer_entry) { - if (is_leaf) { - const std::vector& sources = - this->GetRootPointsToSet().element(index); - // The points to set is unambiguous so the set should be a - // singleton. 
- CHECK_EQ(1, sources.size()); - const LogicalBuffer* buffer_source = sources[0]; - HloInstruction* src = buffer_source->instruction(); - - // The source for this result buffer can be a nested buffer - // such as a tuple element. - - // The source instruction should have a non-parameter buffer - // assigned. - TF_ASSIGN_OR_RETURN(const BufferAllocation* allocation, - this->assignment_->GetUniqueAllocation( - src, buffer_source->index())); - CHECK(!allocation->is_entry_computation_parameter()); - - auto insert_result = buffer_index_to_shape_index.emplace( - allocation->index(), *buffer_entry); - if (insert_result.second) { - // The points-to set is distinct so this buffer should not - // have - // been assigned in a previous invocation of this lambda. - perftools::gputools::DeviceMemoryBase memory_base = - result_buffer->buffer(index); - CHECK(buffers[allocation->index()].is_null()); - CHECK(!memory_base.is_null()); - buffers[allocation->index()] = memory_base; - buffers_in_result[allocation->index()] = true; - } else { - // Record the fact that this tuple element is identical to - // some - // prior result. - *buffer_entry = insert_result.first->second; - } - } - return Status::OK(); - })); - - TF_RETURN_IF_ERROR(AllocateBuffers( - memory_allocator, stream->parent()->device_ordinal(), &buffers)); - - TF_RETURN_IF_ERROR(ExecuteComputeFunction(run_options, arguments, buffers, - hlo_execution_profile)); - - // Free all buffers not in the result. - for (auto i = 0; i < buffers.size(); ++i) { - auto alloc = buffers[i]; - if (!buffers_in_result[i] && !alloc.is_null()) { - VLOG(3) << "CpuExecutable deallocating buffer #" << i << " [" - << alloc.opaque() << "]"; - TF_RETURN_IF_ERROR(memory_allocator->Deallocate( - stream->parent()->device_ordinal(), &alloc)); - } - } - - return Status::OK(); -} - StatusOr CpuExecutable::ExecuteAsyncOnStream( - const ExecutableRunOptions* run_options, + const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments) { // TODO(b/30671675): Implement asynchronous execution mode. return Unimplemented( "Asynchronous execution on stream is not yet supported on CPU."); } +/*static*/ int64 CpuExecutable::ShapeSizeBytes(const Shape& shape) { + // On the cpu, opaques are pointers. + if (ShapeUtil::IsOpaque(shape)) { + return sizeof(void*); + } + return ShapeUtil::ByteSizeOf(shape, sizeof(void*)); +} + const PointsToSet& CpuExecutable::GetRootPointsToSet() const { return assignment_->points_to_analysis().GetPointsToSet( module().entry_computation()->root_instruction()); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h index 8f3247e6834..b5746769ba7 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h @@ -29,7 +29,6 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_execution_profile.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" -#include "tensorflow/compiler/xla/service/hlo_module_config.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h" #include "tensorflow/compiler/xla/statusor.h" @@ -52,30 +51,23 @@ class CpuExecutable : public Executable { std::unique_ptr jit, std::unique_ptr assignment, std::unique_ptr hlo_module, - std::unique_ptr module_config, const string& entry_function_name, std::unordered_map hlo_to_profile_idx); ~CpuExecutable() override {} StatusOr ExecuteOnStream( - const ExecutableRunOptions* run_options, + const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments, HloExecutionProfile* hlo_execution_profile) override; StatusOr> ExecuteOnStream( - const ExecutableRunOptions* run_options, + const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments, HloExecutionProfile* hlo_execution_profile) override; - Status ExecuteOnStream( - const ExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice arguments, - ShapedBuffer* result_buffer, - HloExecutionProfile* hlo_execution_profile) override; - StatusOr ExecuteAsyncOnStream( - const ExecutableRunOptions* run_options, + const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments) override; @@ -86,6 +78,8 @@ class CpuExecutable : public Executable { ir_module_string_ = ir_module_string; } + static int64 ShapeSizeBytes(const Shape& shape); + private: // Allocate buffers required for execution and assign them to the elements of // "buffers". "buffers" should be sized to the number of buffers in buffer diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc index 240da35ef19..dc002846e9e 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc @@ -24,6 +24,11 @@ bool CpuInstructionFusion::ShouldFuse(HloInstruction* consumer, int64 operand_index) { HloInstruction* producer = consumer->mutable_operand(operand_index); + // Output fusion is not currently supported on CPUs. 
+ if (producer->opcode() == HloOpcode::kFusion) { + return false; + } + // Condition for consumer: must be elementwise or a fusion op // (which necessarily only contains elementwise operations) if (!(consumer->opcode() == HloOpcode::kFusion || diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.h b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.h index b7c646ad47d..0eca4c3473e 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.h @@ -24,8 +24,9 @@ namespace cpu { class CpuInstructionFusion : public InstructionFusion { public: - CpuInstructionFusion() {} - ~CpuInstructionFusion() override {} + CpuInstructionFusion() + : InstructionFusion(CpuInstructionFusion::IsExpensive) {} + ~CpuInstructionFusion() override = default; protected: bool ShouldFuse(HloInstruction* consumer, int64 operand_index) override; diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc index 8e06f0520ed..253de20f251 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc @@ -15,7 +15,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h" -#include #include #include "tensorflow/core/platform/logging.h" diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc index 981f24ca6f5..7ad497ff1a2 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc @@ -63,8 +63,8 @@ using llvm_ir::SetToFirstInsertPoint; namespace cpu { IrEmitter::IrEmitter( - const HloModule& hlo_module, const HloModuleConfig& hlo_module_config, - const BufferAssignment& assignment, llvm::Module* llvm_module, + const HloModule& hlo_module, const BufferAssignment& assignment, + llvm::Module* llvm_module, const std::unordered_map* hlo_to_profile_idx) : assignment_(assignment), module_(llvm_module), @@ -72,8 +72,10 @@ IrEmitter::IrEmitter( ir_builder_(llvm_module->getContext()), hlo_to_profile_idx_(hlo_to_profile_idx), alias_analysis_(hlo_module, assignment, &llvm_module->getContext()), - hlo_module_config_(hlo_module_config) { - ir_builder_.setFastMathFlags(llvm_ir::GetFastMathFlags(hlo_module_config)); + hlo_module_config_(hlo_module.config()) { + ir_builder_.setFastMathFlags(llvm_ir::GetFastMathFlags( + /*fast_math_enabled=*/hlo_module_config_.debug_options() + .xla_enable_fast_math())); } StatusOr IrEmitter::EmitComputation( @@ -201,7 +203,8 @@ void IrEmitter::InitializeIrFunction(const string& function_name, if (&argument == retval) { continue; } - compute_function_->setDoesNotAlias(argument.getArgNo() + 1); + compute_function_->addAttribute(argument.getArgNo() + 1, + llvm::Attribute::NoAlias); } ir_builder_.SetInsertPoint(llvm::BasicBlock::Create( @@ -506,7 +509,7 @@ Status IrEmitter::HandleReduceWindow(HloInstruction* reduce_window, llvm_ir::IrArray::Index input_index(index.size()); llvm::Value* in_bounds_condition = nullptr; - for (int64 i = 0; i < index.size(); ++i) { + for (size_t i = 0; i < index.size(); ++i) { llvm::Value* strided_index = ir_builder_.CreateNSWMul( index[i], ir_builder_.getInt64(window.dimensions(i).stride())); input_index[i] = ir_builder_.CreateNSWSub( @@ -1111,7 +1114,7 @@ Status IrEmitter::HandleReduce(HloInstruction* reduce, HloInstruction* arg, llvm_ir::IrArray::Index input_index = reduced_dims_index; 
llvm_ir::IrArray::Index::const_iterator it = index.begin(); - for (int64 i = 0; i < input_index.size(); ++i) { + for (size_t i = 0; i < input_index.size(); ++i) { if (input_index[i] == nullptr) { input_index[i] = *it++; } @@ -1136,6 +1139,41 @@ Status IrEmitter::HandleSend(HloInstruction* send) { return Unimplemented("Send is not implemented on CPU. See b/33942983."); } +Status IrEmitter::HandleSlice(HloInstruction* slice, HloInstruction* operand) { + if (ShapeUtil::IsScalar(slice->shape())) { + TF_ASSIGN_OR_RETURN(llvm::Value * target_address, + EmitTargetAddressForOp(slice)); + emitted_value_[slice] = target_address; + return EmitMemcpy(*operand, *slice); + } + return DefaultAction(slice); +} + +Status IrEmitter::HandleDynamicSlice(HloInstruction* dynamic_slice, + HloInstruction* operand, + HloInstruction* /*start_indices*/) { + if (ShapeUtil::IsScalar(dynamic_slice->shape())) { + TF_ASSIGN_OR_RETURN(llvm::Value * target_address, + EmitTargetAddressForOp(dynamic_slice)); + emitted_value_[dynamic_slice] = target_address; + return EmitMemcpy(*operand, *dynamic_slice); + } + return DefaultAction(dynamic_slice); +} + +Status IrEmitter::HandleDynamicUpdateSlice(HloInstruction* dynamic_update_slice, + HloInstruction* /*operand*/, + HloInstruction* update, + HloInstruction* /*start_indices*/) { + if (ShapeUtil::IsScalar(dynamic_update_slice->shape())) { + TF_ASSIGN_OR_RETURN(llvm::Value * target_address, + EmitTargetAddressForOp(dynamic_update_slice)); + emitted_value_[dynamic_update_slice] = target_address; + return EmitMemcpy(*update, *dynamic_update_slice); + } + return DefaultAction(dynamic_update_slice); +} + Status IrEmitter::HandleRecv(HloInstruction* recv) { // TODO(b/33942983): Support Send/Recv on CPU. return Unimplemented("Recv is not implemented on CPU. 
See b/33942983."); @@ -1180,7 +1218,7 @@ Status IrEmitter::HandlePad(HloInstruction* pad) { // output_index := edge_padding_low + operand_index * (interior_padding + 1) const PaddingConfig& padding_config = pad->padding_config(); llvm_ir::IrArray::Index output_index; - for (int64 i = 0; i < operand_index.size(); ++i) { + for (size_t i = 0; i < operand_index.size(); ++i) { llvm::Value* offset = ir_builder_.CreateMul( operand_index[i], ir_builder_.getInt64(padding_config.dimensions(i).interior_padding() + @@ -1265,13 +1303,12 @@ Status IrEmitter::HandleFusion(HloInstruction* fusion) { } } -Status IrEmitter::HandleCall( - HloInstruction* call, tensorflow::gtl::ArraySlice operands, - HloComputation* computation) { +Status IrEmitter::HandleCall(HloInstruction* call) { + HloComputation* computation = call->to_apply(); llvm::Function* call_ir_function = FindOrDie(emitted_functions_, computation); std::vector parameter_addresses; - for (HloInstruction* operand : operands) { + for (const HloInstruction* operand : call->operands()) { parameter_addresses.push_back(GetEmittedValueFor(operand)); } @@ -1294,12 +1331,12 @@ Status IrEmitter::HandleCustomCall( llvm_ir::EmitAllocaAtFunctionEntryWithCount( i8_ptr_type, ir_builder_.getInt32(operands.size()), "cc_operands_alloca", &ir_builder_); - for (int i = 0; i < operands.size(); ++i) { + for (size_t i = 0; i < operands.size(); ++i) { const HloInstruction* operand = operands[i]; llvm::Value* operand_as_i8ptr = ir_builder_.CreatePointerCast(GetEmittedValueFor(operand), i8_ptr_type); llvm::Value* slot_in_operands_alloca = ir_builder_.CreateInBoundsGEP( - operands_alloca, {ir_builder_.getInt32(i)}); + operands_alloca, {ir_builder_.getInt64(i)}); ir_builder_.CreateStore(operand_as_i8ptr, slot_in_operands_alloca); } auto* custom_call_ir_function = @@ -1322,32 +1359,29 @@ Status IrEmitter::HandleCustomCall( return Status::OK(); } -Status IrEmitter::HandleWhile(HloInstruction* xla_while, HloInstruction* init, - HloComputation* condition, HloComputation* body) { +Status IrEmitter::HandleWhile(HloInstruction* xla_while) { // Precondition: Condition computation must return a scalar bool. + HloComputation* condition = xla_while->while_condition(); TF_RET_CHECK(ShapeUtil::IsScalar(condition->root_instruction()->shape()) && condition->root_instruction()->shape().element_type() == PRED) << "While condition computation must return bool"; - // Check that all while-related buffers share an allocation. - TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshape( + // Check that all while-related buffers share an allocation slice. 
+ TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus( xla_while->shape(), [this, &xla_while](const Shape& /*subshape*/, const ShapeIndex& index) -> Status { auto check = [this](const HloInstruction* a, const HloInstruction* b, const ShapeIndex& index) { - BufferAllocation::Index index_a = - assignment_.GetUniqueAllocation(a, index) - .ConsumeValueOrDie() - ->index(); - BufferAllocation::Index index_b = - assignment_.GetUniqueAllocation(b, index) - .ConsumeValueOrDie() - ->index(); - if (index_a != index_b) { + const BufferAllocation::Slice slice_a = + assignment_.GetUniqueSlice(a, index).ConsumeValueOrDie(); + const BufferAllocation::Slice slice_b = + assignment_.GetUniqueSlice(b, index).ConsumeValueOrDie(); + if (slice_a != slice_b) { return InternalError( - "instruction %s does not share allocation with " - "instruction %s ", - a->ToString().c_str(), b->ToString().c_str()); + "instruction %s %s does not share slice with " + "instruction %s %s", + a->ToString().c_str(), slice_a.ToString().c_str(), + b->ToString().c_str(), slice_b.ToString().c_str()); } return Status::OK(); }; @@ -1364,12 +1398,14 @@ Status IrEmitter::HandleWhile(HloInstruction* xla_while, HloInstruction* init, })); // Set emitted value to that of 'init' with which it shares an allocation. + const HloInstruction* init = xla_while->operand(0); emitted_value_[xla_while] = GetEmittedValueFor(init); // The called computation should have been emitted previously. llvm::Function* condition_ir_function = FindOrDie(emitted_functions_, condition); - llvm::Function* body_ir_function = FindOrDie(emitted_functions_, body); + llvm::Function* body_ir_function = + FindOrDie(emitted_functions_, xla_while->while_body()); // Generating: // while (Condition(while_result)) { @@ -1582,44 +1618,49 @@ llvm::Value* IrEmitter::GetExecutableRunOptionsArgument() { } llvm::Value* IrEmitter::EmitTempBufferPointer( - BufferAllocation::Index temp_buf_index, const Shape& target_shape) { + const BufferAllocation::Slice& slice, const Shape& target_shape) { llvm::Type* element_type = IrShapeType(target_shape); // The alignment and number of bytes within the temporary buffer is determined // by the maximal shape as determined by buffer assignment. - const BufferAllocation& allocation = - assignment_.GetAllocation(temp_buf_index); + const BufferAllocation& allocation = assignment_.GetAllocation(slice.index()); if (allocation.is_thread_local()) { // Thread-local allocations should only be assigned a single buffer. 
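The check above compares `BufferAllocation::Slice` values rather than bare allocation indices: a slice names an allocation plus a byte offset and size within it, so two logical buffers packed into one allocation remain distinguishable. A standalone sketch of the idea, a toy stand-in rather than the real class:

```c++
#include <cstdint>
#include <tuple>

struct Slice {
  int64_t allocation_index = 0;
  int64_t offset = 0;  // Byte offset within the allocation.
  int64_t size = 0;    // Byte size of this sub-buffer.

  friend bool operator==(const Slice& a, const Slice& b) {
    return std::tie(a.allocation_index, a.offset, a.size) ==
           std::tie(b.allocation_index, b.offset, b.size);
  }
  friend bool operator!=(const Slice& a, const Slice& b) { return !(a == b); }
};

// The while-buffer check above becomes a slice comparison: two sub-buffers
// share storage only if allocation, offset, and size all match.
bool SharesStorage(const Slice& a, const Slice& b) { return a == b; }
```

This is also why `EmitTempBufferPointer` below gains the offset-adjusting GEP: a slice's address is its allocation's base address plus the slice offset.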
- CHECK_EQ(1, allocation.assigned_buffers().size()); - const Shape& shape = allocation.assigned_buffers()[0]->shape(); + const auto& assigned_buffers = allocation.assigned_buffers(); + CHECK_EQ(1, assigned_buffers.size()); + const Shape& shape = assigned_buffers.begin()->first->shape(); llvm::AllocaInst*& tempbuf_address = thread_local_buffers_[{ - ir_builder_.GetInsertBlock()->getParent(), temp_buf_index}]; + ir_builder_.GetInsertBlock()->getParent(), slice}]; if (tempbuf_address == nullptr) { tempbuf_address = llvm_ir::EmitAllocaAtFunctionEntry( IrShapeType(shape), - tensorflow::strings::StrCat("thread_local", temp_buf_index), + tensorflow::strings::StrCat("thread_local", slice.ToString()), &ir_builder_, MinimumAlignmentForShape(target_shape)); } return ir_builder_.CreateBitCast(tempbuf_address, element_type->getPointerTo()); } - llvm::Value* tempbuf_address_offset = llvm_ir::EmitBufferIndexingGEP( - GetTempBuffersArgument(), temp_buf_index, &ir_builder_); - llvm::LoadInst* tempbuf_address_untyped = - ir_builder_.CreateLoad(tempbuf_address_offset); + llvm::Value* tempbuf_address_ptr = llvm_ir::EmitBufferIndexingGEP( + GetTempBuffersArgument(), slice.index(), &ir_builder_); + llvm::LoadInst* tempbuf_address_base = + ir_builder_.CreateLoad(tempbuf_address_ptr); // Loading the address of a buffer is invariant of the point at which the // load is executed in the program because we never reassign buffers. - tempbuf_address_untyped->setMetadata( + tempbuf_address_base->setMetadata( llvm::LLVMContext::MD_invariant_load, - llvm::MDNode::get(tempbuf_address_untyped->getContext(), /*MDs=*/{})); - llvm_ir::SetTbaaForInstruction(tempbuf_address_untyped, target_shape, + llvm::MDNode::get(tempbuf_address_base->getContext(), /*MDs=*/{})); + llvm_ir::SetTbaaForInstruction(tempbuf_address_base, target_shape, /*is_pointer_to=*/true); + AttachAlignmentMetadataForLoad(tempbuf_address_base, allocation.size()); + AttachDereferenceableMetadataForLoad(tempbuf_address_base, allocation.size()); - AttachAlignmentMetadataForLoad(tempbuf_address_untyped, allocation.size()); - AttachDereferenceableMetadataForLoad(tempbuf_address_untyped, - allocation.size()); + llvm::Value* tempbuf_address_untyped = tempbuf_address_base; + if (slice.offset() > 0) { + // Adjust the address to account for the slice offset. 
+ tempbuf_address_untyped = ir_builder_.CreateInBoundsGEP( + tempbuf_address_base, ir_builder_.getInt64(slice.offset())); + } return ir_builder_.CreateBitCast(tempbuf_address_untyped, element_type->getPointerTo()); } @@ -1657,13 +1698,13 @@ void IrEmitter::EmitArrayFunctionCallInto( ir_builder_.getInt32(parameter_addresses.size()), tensorflow::strings::StrCat(name, "_parameter_addresses"), &ir_builder_); - for (int i = 0; i < parameter_addresses.size(); ++i) { + for (size_t i = 0; i < parameter_addresses.size(); ++i) { llvm::Value* parameter_as_i8ptr = ir_builder_.CreateBitCast( parameter_addresses[i], ir_builder_.getInt8PtrTy(), llvm_ir::AsStringRef(tensorflow::strings::StrCat(name, "_parameter_", i, "_address_as_i8ptr"))); llvm::Value* slot_in_param_adresses = ir_builder_.CreateInBoundsGEP( - parameter_addresses_buffer, {ir_builder_.getInt32(i)}); + parameter_addresses_buffer, {ir_builder_.getInt64(i)}); ir_builder_.CreateStore(parameter_as_i8ptr, slot_in_param_adresses); } @@ -1708,8 +1749,7 @@ StatusOr IrEmitter::EmitTargetAddressForOp( llvm::AttrBuilder attr_builder; attr_builder.addAlignmentAttr(MinimumAlignmentForShape(target_shape)); attr_builder.addDereferenceableAttr(ByteSizeOf(target_shape)); - retval->addAttr(llvm::AttributeSet::get( - retval->getContext(), retval->getArgNo() + 1, attr_builder)); + retval->addAttrs(attr_builder); } return ir_builder_.CreateBitCast(retval, IrShapeType(target_shape)->getPointerTo()); @@ -1717,9 +1757,9 @@ StatusOr IrEmitter::EmitTargetAddressForOp( // For other nodes, we need the temporary buffer allocated for this node to // write the result into. - TF_ASSIGN_OR_RETURN(const BufferAllocation* allocation, - assignment_.GetUniqueTopLevelAllocation(op)); - return EmitTempBufferPointer(allocation->index(), target_shape); + TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice slice, + assignment_.GetUniqueTopLevelSlice(op)); + return EmitTempBufferPointer(slice, target_shape); } Status IrEmitter::EmitTargetElementLoop( diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h index 9df5b8b3d25..ebb7296a075 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h @@ -60,24 +60,30 @@ class IrEmitter : public DfsHloVisitorWithDefault { // llvm_module: the LLVM module to emit IR into. // hlo_to_profile_idx: the mapping from HLO to its index in the profiling // array. - IrEmitter(const HloModule& hlo_module, const HloModuleConfig& module_config, - const BufferAssignment& assignment, llvm::Module* llvm_module, + IrEmitter(const HloModule& hlo_module, const BufferAssignment& assignment, + llvm::Module* llvm_module, const std::unordered_map* hlo_to_profile_idx); ~IrEmitter() override; // Emit and return the given HLO computation as an LLVM IR - // function. function_name_prefix is the desired name of the function. If the - // name is not unique among already emitted functions then a suffix is - // appended to make the name unique. is_entry_computation indicates that this - // is the entry computation of the HLO module. If 'instruction_order' is given - // then the HLO instructions are emitted in the given order. In this case, - // 'instruction_order' must be a topological sort of the set of nodes - // accessible from the root of the computation. + // function. + // + // function_name_prefix is the desired name of the function. If the name is + // not unique among already emitted functions then a suffix is appended to + // make the name unique. 
+ // + // is_entry_computation indicates that this is the entry computation of the + // HLO module. + // + // If 'instruction_order' is not NULL, then the HLO instructions are emitted + // in the given order. In this case, 'instruction_order' must be a + // topological sort of the set of nodes accessible from the root of the + // computation. StatusOr EmitComputation( HloComputation* computation, const string& function_name_prefix, bool is_entry_computation, - std::vector* instruction_order = nullptr); + std::vector* instruction_order); protected: // @@ -114,6 +120,15 @@ class IrEmitter : public DfsHloVisitorWithDefault { HloComputation* function) override; Status HandleSelectAndScatter(HloInstruction* instruction) override; Status HandleSend(HloInstruction* send) override; + Status HandleSlice(HloInstruction* slice, + HloInstruction* /*operand*/) override; + Status HandleDynamicSlice(HloInstruction* dynamic_slice, + HloInstruction* /*operand*/, + HloInstruction* /*start_indices*/) override; + Status HandleDynamicUpdateSlice(HloInstruction* dynamic_update_slice, + HloInstruction* /*operand*/, + HloInstruction* /*update*/, + HloInstruction* /*start_indices*/) override; Status HandleRecv(HloInstruction* recv) override; Status HandlePad(HloInstruction* pad) override; Status HandleTuple( @@ -125,14 +140,11 @@ class IrEmitter : public DfsHloVisitorWithDefault { HloComputation* function, tensorflow::gtl::ArraySlice static_operands) override; Status HandleFusion(HloInstruction* fusion) override; - Status HandleCall(HloInstruction* call, - tensorflow::gtl::ArraySlice operands, - HloComputation* computation) override; + Status HandleCall(HloInstruction* call) override; Status HandleCustomCall(HloInstruction* custom_call, tensorflow::gtl::ArraySlice operands, tensorflow::StringPiece custom_call_target) override; - Status HandleWhile(HloInstruction* xla_while, HloInstruction* init, - HloComputation* condition, HloComputation* body) override; + Status HandleWhile(HloInstruction* xla_while) override; Status FinishVisit(HloInstruction* root) override; Status Preprocess(HloInstruction* hlo) override; @@ -183,7 +195,7 @@ class IrEmitter : public DfsHloVisitorWithDefault { // Emits code that computes the address of the given temporary buffer to the // function. target_shape is the shape of this temporary buffer. // The returned Value's type is a pointer to element_type. - llvm::Value* EmitTempBufferPointer(BufferAllocation::Index temp_buf_index, + llvm::Value* EmitTempBufferPointer(const BufferAllocation::Slice& slice, const Shape& target_shape); // Emits a function into the current module. This can be used for @@ -290,7 +302,7 @@ class IrEmitter : public DfsHloVisitorWithDefault { std::map emitted_functions_; // Map containing all previously emitted thread-local temporary buffers. - std::map, + std::map, llvm::AllocaInst*> thread_local_buffers_; diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc index ade7fa58a2b..bdddca99c2f 100644 --- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc +++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc @@ -30,7 +30,6 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_module.h" -#include "tensorflow/compiler/xla/service/hlo_module_config.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/logical_buffer.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" @@ -58,12 +57,11 @@ ParallelCpuExecutable::ParallelCpuExecutable( std::unique_ptr jit, std::unique_ptr assignment, std::unique_ptr hlo_module, - std::unique_ptr module_config, std::unique_ptr> function_names, std::unordered_map hlo_to_profile_idx, std::unordered_map> aligned_constants) - : Executable(std::move(hlo_module), std::move(module_config)), + : Executable(std::move(hlo_module), ParallelCpuExecutable::ShapeSizeBytes), jit_(std::move(jit)), assignment_(std::move(assignment)), functions_names_(std::move(function_names)), @@ -97,75 +95,81 @@ static void MarkLiveAddressesInOutput( } } -StatusOr -ParallelCpuExecutable::ExecuteOnStream( - const ExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice arguments, - HloExecutionProfile* hlo_execution_profile) { - se::Stream* stream = run_options->stream(); - DeviceMemoryAllocator* memory_allocator = run_options->allocator(); - VLOG(3) << "ExecuteOnStream arg size: " << arguments.size(); - if (!arguments.empty()) { - VLOG(3) << "ExecuteOnStream arg[0]: " << arguments.at(0).opaque(); - } - - // Allocate the temporary buffers required for the computation. - se::StreamExecutor* stream_executor = stream->parent(); - int device_ordinal = stream_executor->device_ordinal(); - int64 buffer_count = assignment_->Allocations().size(); - VLOG(3) << "temp buffer count: " << buffer_count; - - std::vector device_allocations; - for (BufferAllocation::Index i = 0; i < buffer_count; ++i) { +Status ParallelCpuExecutable::AllocateBuffers( + DeviceMemoryAllocator* memory_allocator, int device_ordinal, + std::vector* buffers) { + CHECK_EQ(buffers->size(), assignment_->Allocations().size()); + VLOG(3) << "Allocating " << assignment_->Allocations().size() + << " allocations for module " << module().name(); + for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size(); + ++i) { auto& allocation = assignment_->GetAllocation(i); + + VLOG(3) << allocation.ToString(); + if (allocation.is_entry_computation_parameter()) { - // Buffers do not need to be allocated for parameters. - device_allocations.push_back(se::DeviceMemoryBase(nullptr)); + VLOG(3) << "allocation #" << i << " is a parameter"; continue; } if (allocation.is_thread_local()) { - // Buffers do not need to be allocated for thread-local temporaries. 
- device_allocations.push_back(se::DeviceMemoryBase(nullptr)); + VLOG(3) << "buffer #" << i << " is thread-local"; continue; } - TF_ASSIGN_OR_RETURN( - se::DeviceMemoryBase device_allocation, - memory_allocator->Allocate(device_ordinal, allocation.size())); + int64 buffer_size = allocation.size(); + if (!(*buffers)[i].is_null()) { + VLOG(3) << "buffer #" << i + << " is in the preallocated result ShapedBuffer"; + } else { + TF_ASSIGN_OR_RETURN((*buffers)[i], memory_allocator->Allocate( + device_ordinal, buffer_size)); - if (VLOG_IS_ON(3)) { - VLOG(3) << "ParallelCpuExecutable allocating " << allocation.size() - << " bytes for allocation #" << i << " [" - << device_allocation.opaque() << "]"; - std::vector parts; - for (const LogicalBuffer* buffer : allocation.assigned_buffers()) { - parts.push_back(buffer->ToString()); - } - VLOG(3) << " " << tensorflow::str_util::Join(parts, ", "); + VLOG(3) << "buffer #" << i << " allocated " << buffer_size << " bytes [" + << (*buffers)[i].opaque() << "]"; } - device_allocations.push_back(device_allocation); // Since the output buffer and all the temporary buffers were written into // by the JITed code, msan has no way of knowing their memory was // initialized. Mark them initialized so that msan doesn't flag loads from // these buffers. - TF_ANNOTATE_MEMORY_IS_INITIALIZED(device_allocation.opaque(), - allocation.size()); + TF_ANNOTATE_MEMORY_IS_INITIALIZED((*buffers)[i].opaque(), buffer_size); } - TF_ASSIGN_OR_RETURN(const BufferAllocation* result_allocation, - assignment_->GetUniqueTopLevelOutputAllocation()); - BufferAllocation::Index result_index = result_allocation->index(); - VLOG(3) << "result index: " << result_index; + TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice, + assignment_->GetUniqueTopLevelOutputSlice()); + VLOG(3) << "result index: " << result_slice.index(); + return Status::OK(); +} + +Status ParallelCpuExecutable::ExecuteComputeFunctions( + const ServiceExecutableRunOptions* run_options, + tensorflow::gtl::ArraySlice arguments, + tensorflow::gtl::ArraySlice buffers, + HloExecutionProfile* hlo_execution_profile) { + std::vector argument_buffers(arguments.size()); + for (int i = 0; i < arguments.size(); ++i) { + TF_RET_CHECK(!ShapeUtil::IsTuple(arguments[i]->shape())); + argument_buffers[i] = arguments[i]->buffer(/*index=*/{}); + } + return ExecuteComputeFunctions(run_options, argument_buffers, buffers, + hlo_execution_profile); +} + +Status ParallelCpuExecutable::ExecuteComputeFunctions( + const ServiceExecutableRunOptions* run_options, + tensorflow::gtl::ArraySlice arguments, + tensorflow::gtl::ArraySlice buffers, + HloExecutionProfile* hlo_execution_profile) { // Allocate profiling counters for each hlo instruction that we would like to // profile. Allocate an additional profile counter for the entire // computation. std::vector profile_counters(hlo_to_profile_idx_.size() + 1); std::vector buffer_pointers; - for (auto& device_allocation : device_allocations) { + buffer_pointers.reserve(buffers.size()); + for (auto device_allocation : buffers) { buffer_pointers.push_back(device_allocation.opaque()); } @@ -188,8 +192,8 @@ ParallelCpuExecutable::ExecuteOnStream( std::list pending; // Call the function for each HLO instruction in topological order. 
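  // A post order lists every instruction after all of its operands, so by
  // the time an instruction is considered here, each of its operands has
  // either produced a result or already been scheduled.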
- for (auto* instruction : - module().entry_computation()->MakeInstructionPostOrder()) { + const HloComputation& entry_computation = *module().entry_computation(); + for (auto* instruction : entry_computation.MakeInstructionPostOrder()) { // Parameters and constants have no functions associated with them. Instead // just copy the existing buffer into the map containing instruction // results.. @@ -206,9 +210,9 @@ ParallelCpuExecutable::ExecuteOnStream( } } - auto* temps_array = buffer_pointers.data(); - auto* profile_counters_array = profile_counters.data(); - auto* thread_pool = CHECK_NOTNULL(run_options->inter_op_thread_pool()); + void** temps_array = buffer_pointers.data(); + uint64* profile_counters_array = profile_counters.data(); + auto* thread_pool = CHECK_NOTNULL(run_options->xla_intra_op_thread_pool()); tensorflow::mutex completion_queue_lock; tensorflow::condition_variable completion_queue_cv; std::deque completion_queue; @@ -227,11 +231,11 @@ ParallelCpuExecutable::ExecuteOnStream( continue; } - TF_ASSIGN_OR_RETURN( - const BufferAllocation* result_allocation, - assignment_->GetUniqueTopLevelAllocation(instruction)); - - void* result_buffer = buffer_pointers[result_allocation->index()]; + TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice, + assignment_->GetUniqueTopLevelSlice(instruction)); + void* result_buffer = + static_cast(temps_array[result_slice.index()]) + + result_slice.offset(); // We cannot use a move-only RAII type like std::unique_ptr because the // list of operands is allocated on the main thread and transferred to the // worker via the lambda passed to enqueue_function. In order for the @@ -245,11 +249,12 @@ ParallelCpuExecutable::ExecuteOnStream( }); auto function = FindOrDie(functions, instruction); // The thread pool entry takes ownership of |operand_buffers|. 
+ const auto* exec_run_options = &run_options->run_options(); thread_pool->Schedule([instruction, &completion_queue, &completion_queue_lock, &completion_queue_cv, - result_buffer, run_options, operand_buffers, + result_buffer, exec_run_options, operand_buffers, temps_array, profile_counters_array, function] { - function(result_buffer, run_options, operand_buffers, temps_array, + function(result_buffer, exec_run_options, operand_buffers, temps_array, profile_counters_array); delete[] operand_buffers; // Push the completed HLO instruction on the queue, the main thread @@ -279,9 +284,11 @@ ParallelCpuExecutable::ExecuteOnStream( break; } } while (1); - TF_ASSIGN_OR_RETURN(const BufferAllocation* result_allocation, - assignment_->GetUniqueTopLevelAllocation(instruction)); - void* result_buffer = buffer_pointers[result_allocation->index()]; + TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice, + assignment_->GetUniqueTopLevelSlice(instruction)); + void* result_buffer = + static_cast(temps_array[result_slice.index()]) + + result_slice.offset(); InsertOrDie(&results, instruction, result_buffer); --instructions_in_flight; } @@ -295,7 +302,8 @@ ParallelCpuExecutable::ExecuteOnStream( execution_profile_.set_compute_cycle_count(profile_counters.back()); } if (hlo_execution_profile != nullptr) { - hlo_execution_profile->set_total_cycles_executed(profile_counters.back()); + hlo_execution_profile->set_total_cycles_executed(entry_computation, + profile_counters.back()); for (auto hlo_prof_idx : hlo_to_profile_idx_) { const HloInstruction* hlo = hlo_prof_idx.first; @@ -304,6 +312,41 @@ ParallelCpuExecutable::ExecuteOnStream( } } + return Status::OK(); +} + +StatusOr +ParallelCpuExecutable::ExecuteOnStream( + const ServiceExecutableRunOptions* run_options, + tensorflow::gtl::ArraySlice arguments, + HloExecutionProfile* hlo_execution_profile) { + se::Stream* stream = run_options->stream(); + DeviceMemoryAllocator* memory_allocator = run_options->allocator(); + VLOG(3) << "ExecuteOnStream arg size: " << arguments.size(); + if (!arguments.empty()) { + VLOG(3) << "ExecuteOnStream arg[0]: " << arguments.at(0).opaque(); + } + + // Allocate the temporary buffers required for the computation. + se::StreamExecutor* stream_executor = stream->parent(); + int device_ordinal = stream_executor->device_ordinal(); + int64 buffer_count = assignment_->Allocations().size(); + VLOG(3) << "temp buffer count: " << buffer_count; + + std::vector device_allocations( + assignment_->Allocations().size()); + TF_RETURN_IF_ERROR(AllocateBuffers(memory_allocator, + stream->parent()->device_ordinal(), + &device_allocations)); + + TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice, + assignment_->GetUniqueTopLevelOutputSlice()); + const BufferAllocation::Index result_index = result_slice.index(); + VLOG(3) << "result index: " << result_index; + + TF_RETURN_IF_ERROR(ExecuteComputeFunctions( + run_options, arguments, device_allocations, hlo_execution_profile)); + // Mark the buffers that are actually live (used in the output) when the // computation finishes executing. std::unordered_set marked_addresses; @@ -322,7 +365,7 @@ ParallelCpuExecutable::ExecuteOnStream( // live because they are referenced by the output of the computation // and are needed by the service. They will be deallocated by the // service. 
- for (auto i = 0; i < device_allocations.size(); ++i) { + for (size_t i = 0; i < device_allocations.size(); ++i) { auto alloc = device_allocations[i]; if (marked_addresses.count(alloc.opaque()) == 0 && alloc.opaque() != nullptr) { @@ -336,29 +379,92 @@ ParallelCpuExecutable::ExecuteOnStream( } StatusOr> ParallelCpuExecutable::ExecuteOnStream( - const ExecutableRunOptions* run_options, + const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments, HloExecutionProfile* hlo_execution_profile) { - return Unimplemented( - "ParallelCpuExecutable not supported yet with LocalService execution"); -} + if (GetRootPointsToSet().IsAmbiguous()) { + return Unimplemented("Points-to set of root instruction is ambiguous"); + } -Status ParallelCpuExecutable::ExecuteOnStream( - const ExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice arguments, - ShapedBuffer* result_buffer, HloExecutionProfile* hlo_execution_profile) { - return Unimplemented( - "preallocated result buffer not supported with ParallelCpuExecutable"); + se::Stream* stream = run_options->stream(); + DeviceMemoryAllocator* memory_allocator = run_options->allocator(); + std::vector buffers(assignment_->Allocations().size()); + + TF_ASSIGN_OR_RETURN(std::unique_ptr result_buffer, + ShapedBuffer::MakeShapedBuffer( + result_shape(), stream->parent()->platform(), + stream->parent()->device_ordinal())); + + TF_RETURN_IF_ERROR(AllocateBuffers( + memory_allocator, stream->parent()->device_ordinal(), &buffers)); + + TF_RETURN_IF_ERROR(ExecuteComputeFunctions(run_options, arguments, buffers, + hlo_execution_profile)); + + // Copy DeviceMemoryBase values which contain the array(s) of the result into + // the respective location in ShapedBuffer which is returned to the caller. + std::vector buffers_in_result(assignment_->Allocations().size(), false); + TF_RETURN_IF_ERROR( + result_buffer->mutable_shape_index_to_buffer_entry() + ->ForEachMutableElementWithStatus( + [&buffers, &buffers_in_result, &result_buffer, this]( + const ShapeIndex& index, size_t* buffer_entry) { + if (ShapeUtil::IsLeafIndex(result_buffer->shape(), index)) { + const std::vector& sources = + this->GetRootPointsToSet().element(index); + // The points to set is unambiguous so the set should be a + // singleton. + CHECK_EQ(1, sources.size()); + const LogicalBuffer* buffer_source = sources[0]; + HloInstruction* src = buffer_source->instruction(); + + // The source for this result buffer can be a nested buffer + // such as a tuple element. + + // The source instruction should have a non-parameter buffer + // assigned. + TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice slice, + this->assignment_->GetUniqueSlice( + src, buffer_source->index())); + CHECK(!slice.allocation()->is_entry_computation_parameter()); + + const BufferAllocation::Index buffer_index = slice.index(); + const se::DeviceMemoryBase& buffer = buffers[buffer_index]; + CHECK(!buffer.is_null() || buffer.size() == 0); + *buffer_entry = result_buffer->mutable_buffers()->size(); + result_buffer->mutable_buffers()->push_back(buffer); + buffers_in_result[buffer_index] = true; + } + return Status::OK(); + })); + + // Free all buffers not in the result. 
+ for (size_t i = 0; i < buffers.size(); ++i) { + se::DeviceMemoryBase alloc = buffers[i]; + if (!buffers_in_result[i] && !alloc.is_null()) { + VLOG(3) << "CpuExecutable deallocating buffer #" << i << " [" + << alloc.opaque() << "]"; + TF_RETURN_IF_ERROR(memory_allocator->Deallocate( + stream->parent()->device_ordinal(), &alloc)); + } + } + + return std::move(result_buffer); } StatusOr ParallelCpuExecutable::ExecuteAsyncOnStream( - const ExecutableRunOptions* run_options, + const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments) { // TODO(b/30671675): Implement asynchronous execution mode. return Unimplemented( "Asynchronous execution on stream is not yet supported on CPU."); } +const PointsToSet& ParallelCpuExecutable::GetRootPointsToSet() const { + return assignment_->points_to_analysis().GetPointsToSet( + module().entry_computation()->root_instruction()); +} + } // namespace cpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h index 51ec9e5a741..6d5f790c394 100644 --- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h +++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h @@ -29,7 +29,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_execution_profile.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" -#include "tensorflow/compiler/xla/service/hlo_module_config.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" @@ -52,7 +51,6 @@ class ParallelCpuExecutable : public Executable { std::unique_ptr jit, std::unique_ptr assignment, std::unique_ptr hlo_module, - std::unique_ptr module_config, std::unique_ptr> instruction_functions, std::unordered_map hlo_to_profile_idx, std::unordered_map ExecuteOnStream( - const ExecutableRunOptions* run_options, + const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments, HloExecutionProfile* hlo_execution_profile) override; StatusOr> ExecuteOnStream( - const ExecutableRunOptions* run_options, + const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments, HloExecutionProfile* hlo_execution_profile) override; - Status ExecuteOnStream( - const ExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice arguments, - ShapedBuffer* result_buffer, - HloExecutionProfile* hlo_execution_profile) override; - StatusOr ExecuteAsyncOnStream( - const ExecutableRunOptions* run_options, + const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments) override; @@ -89,7 +81,44 @@ class ParallelCpuExecutable : public Executable { ir_module_string_ = ir_module_string; } + static int64 ShapeSizeBytes(const Shape& shape) { + // On the cpu, opaques are pointers. + if (ShapeUtil::IsOpaque(shape)) { + return sizeof(void*); + } + return ShapeUtil::ByteSizeOf(shape, sizeof(void*)); + } + private: + // Allocate buffers required for execution and assign them to the elements of + // "buffers". "buffers" should be sized to the number of buffers in buffer + // assignment. Each vector element corresponds to a particular Index. If + // a vector element already contains a non-null DeviceMemoryBase, then no + // buffer is assigned for this element. 
+ Status AllocateBuffers( + DeviceMemoryAllocator* memory_allocator, int device_ordinal, + std::vector* buffers); + + // Calls the generated functions in 'function_names_', performing the + // computation with the given arguments using the supplied buffers. + Status ExecuteComputeFunctions( + const ServiceExecutableRunOptions* run_options, + tensorflow::gtl::ArraySlice + arguments, + tensorflow::gtl::ArraySlice + buffers, + HloExecutionProfile* hlo_execution_profile); + Status ExecuteComputeFunctions( + const ServiceExecutableRunOptions* run_options, + tensorflow::gtl::ArraySlice arguments, + tensorflow::gtl::ArraySlice + buffers, + HloExecutionProfile* hlo_execution_profile); + + // Returns the points-to set of the root instruction of the entry + // computation. Uses points-to analysis from buffer assignment. + const PointsToSet& GetRootPointsToSet() const; + // The JIT containing compiled modules. tensorflow::mutex jit_mutex_; std::unique_ptr jit_ GUARDED_BY(jit_mutex_); diff --git a/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc b/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc index 677080a8623..ee772f5c396 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc +++ b/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc @@ -54,7 +54,7 @@ void MatMul(const void* run_options_ptr, T* out, T* lhs, T* rhs, int64 m, int lhs_contract_dim = transpose_lhs ? 0 : 1; int rhs_contract_dim = transpose_rhs ? 1 : 0; const Eigen::array dims( - DimPair(lhs_contract_dim, rhs_contract_dim)); + {DimPair(lhs_contract_dim, rhs_contract_dim)}); // Matrix multiply is a special case of the "contract" operation where // the contraction is performed along dimension 1 of the lhs and dimension diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc index 384a978873d..6f1c97a2334 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc +++ b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc @@ -48,7 +48,7 @@ void MatMul(const void* run_options_ptr, T* out, T* lhs, T* rhs, int64 m, int lhs_contract_dim = transpose_lhs ? 0 : 1; int rhs_contract_dim = transpose_rhs ? 1 : 0; const Eigen::array dims( - DimPair(lhs_contract_dim, rhs_contract_dim)); + {DimPair(lhs_contract_dim, rhs_contract_dim)}); // Matrix multiply is a special case of the "contract" operation where // the contraction is performed along dimension 1 of the lhs and dimension diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc index 8beb565ab3e..7c74912a7ab 100644 --- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc +++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc @@ -112,13 +112,25 @@ llvm::SmallVector DetectMachineAttributes() { if (llvm::sys::getHostCPUFeatures(host_features)) { for (auto &feature : host_features) { if (feature.second) { - result.push_back(feature.first()); + llvm::StringRef feature_name = feature.first(); + // Skip avx512 for now, it isn't quite ready in LLVM. + if (feature_name.startswith("avx512")) { + continue; + } + result.push_back(feature_name); } } } return result; } +llvm::StringRef GetHostCpuName() { + auto cpu_name = llvm::sys::getHostCPUName(); + // Skip avx512 for now, it isn't quite ready in LLVM. 
+ cpu_name.consume_back("-avx512"); + return cpu_name; +} + CompilerFunctor::VectorIntrinsics GetAvailableIntrinsics() { CompilerFunctor::VectorIntrinsics intrinsics; intrinsics.sse_intrinsics = (&runtime::ExpV4F32 != nullptr); @@ -136,13 +148,16 @@ SimpleOrcJIT::SimpleOrcJIT(const llvm::TargetOptions &target_options, .setOptLevel(opt_level) .selectTarget( /*TargetTriple=*/llvm::Triple(), /*MArch=*/"", - /*MCPU=*/llvm::sys::getHostCPUName(), + /*MCPU=*/GetHostCpuName(), /*MAttrs=*/DetectMachineAttributes()))), disassembler_(*target_machine_), data_layout_(target_machine_->createDataLayout()), compile_layer_(object_layer_, CompilerFunctor(target_machine_.get(), &disassembler_, - opt_level, GetAvailableIntrinsics())) {} + opt_level, GetAvailableIntrinsics())) { + VLOG(1) << "CPU target: " << target_machine_->getTargetCPU().str() + << " features: " << target_machine_->getTargetFeatureString().str(); +} SimpleOrcJIT::ModuleHandleT SimpleOrcJIT::AddModule( std::unique_ptr module) { diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h index 9d1c842e0fb..4d8653484a0 100644 --- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h +++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h @@ -22,7 +22,7 @@ limitations under the License. #include "external/llvm/include/llvm/ADT/Triple.h" #include "external/llvm/include/llvm/ExecutionEngine/Orc/IRCompileLayer.h" -#include "external/llvm/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h" +#include "external/llvm/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h" #include "external/llvm/include/llvm/IR/Module.h" #include "external/llvm/include/llvm/Target/TargetMachine.h" #include "tensorflow/compiler/xla/service/cpu/disassembler.h" @@ -41,7 +41,7 @@ namespace cpu { // it's added to the JIT. 
class SimpleOrcJIT { public: - using ObjLayerT = llvm::orc::ObjectLinkingLayer<>; + using ObjLayerT = llvm::orc::RTDyldObjectLinkingLayer<>; using CompileLayerT = llvm::orc::IRCompileLayer; using ModuleHandleT = CompileLayerT::ModuleSetHandleT; diff --git a/tensorflow/compiler/xla/service/cpu_transfer_manager.cc b/tensorflow/compiler/xla/service/cpu_transfer_manager.cc index 423ec29fdc9..2d9d9c7de62 100644 --- a/tensorflow/compiler/xla/service/cpu_transfer_manager.cc +++ b/tensorflow/compiler/xla/service/cpu_transfer_manager.cc @@ -96,8 +96,8 @@ Status CpuTransferManager::TransferLiteralToInfeed(se::StreamExecutor* executor, } // namespace xla -static xla::TransferManager* CreateCpuTransferManager() { - return new xla::CpuTransferManager(); +static std::unique_ptr CreateCpuTransferManager() { + return xla::MakeUnique(); } static bool InitModule() { diff --git a/tensorflow/compiler/xla/service/device_memory_allocator.cc b/tensorflow/compiler/xla/service/device_memory_allocator.cc index 1bef4e2b8c7..c13c86741cc 100644 --- a/tensorflow/compiler/xla/service/device_memory_allocator.cc +++ b/tensorflow/compiler/xla/service/device_memory_allocator.cc @@ -33,9 +33,6 @@ StreamExecutorMemoryAllocator::StreamExecutorMemoryAllocator( StatusOr StreamExecutorMemoryAllocator::Allocate(int device_ordinal, uint64 size, bool retry_on_failure) { - if (size == 0) { - return perftools::gputools::DeviceMemoryBase(nullptr, 0); - } TF_ASSIGN_OR_RETURN(perftools::gputools::StreamExecutor * stream_executor, GetStreamExecutor(device_ordinal)); return stream_executor->AllocateArray(size); @@ -74,4 +71,8 @@ StreamExecutorMemoryAllocator::GetStreamExecutor(int device_ordinal) { return stream_executors_[device_ordinal]; } +bool StreamExecutorMemoryAllocator::AllowsAsynchronousDeallocation() const { + return false; +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/device_memory_allocator.h b/tensorflow/compiler/xla/service/device_memory_allocator.h index 461cc818bff..391585a306d 100644 --- a/tensorflow/compiler/xla/service/device_memory_allocator.h +++ b/tensorflow/compiler/xla/service/device_memory_allocator.h @@ -51,6 +51,10 @@ class DeviceMemoryAllocator { // Return the platform that the allocator allocates memory on. const perftools::gputools::Platform* platform() const { return platform_; } + // Can we call Deallocate() as soon as a computation has been scheduled on + // a stream, or do we have to wait for the computation to complete first? + virtual bool AllowsAsynchronousDeallocation() const = 0; + protected: const perftools::gputools::Platform* platform_; }; @@ -69,6 +73,8 @@ class StreamExecutorMemoryAllocator : public DeviceMemoryAllocator { tensorflow::Status Deallocate( int device_ordinal, perftools::gputools::DeviceMemoryBase* mem) override; + bool AllowsAsynchronousDeallocation() const override; + private: StatusOr GetStreamExecutor( int device_ordinal); diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h index f9c9bbe2cdc..78a398f8efa 100644 --- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h +++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h @@ -18,6 +18,7 @@ limitations under the License. 
#include +#include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/status.h" #include "tensorflow/compiler/xla/types.h" @@ -150,6 +151,10 @@ class DfsHloVisitor { virtual Status HandleTanh(HloInstruction* tanh, HloInstruction* operand) { return HandleElementwiseUnary(tanh, HloOpcode::kTanh, operand); } + virtual Status HandleIsFinite(HloInstruction* is_finite, + HloInstruction* operand) { + return HandleElementwiseUnary(is_finite, HloOpcode::kIsFinite, operand); + } virtual Status HandleLogicalAnd(HloInstruction* logical_and, HloInstruction* lhs, HloInstruction* rhs) { return HandleElementwiseBinary(logical_and, HloOpcode::kLogicalAnd, lhs, @@ -185,19 +190,16 @@ class DfsHloVisitor { virtual Status HandleTranspose(HloInstruction* transpose) = 0; virtual Status HandleParameter(HloInstruction* parameter) = 0; virtual Status HandleFusion(HloInstruction* fusion) = 0; - virtual Status HandleCall( - HloInstruction* call, - tensorflow::gtl::ArraySlice operands, - HloComputation* computation) = 0; + virtual Status HandleCall(HloInstruction* call) = 0; virtual Status HandleCustomCall( HloInstruction* custom_call, tensorflow::gtl::ArraySlice operands, tensorflow::StringPiece custom_call_target) = 0; virtual Status HandleSlice(HloInstruction* slice, HloInstruction* operand) = 0; - virtual Status HandleDynamicSlice( - HloInstruction* slice, - tensorflow::gtl::ArraySlice operands) = 0; + virtual Status HandleDynamicSlice(HloInstruction* dynamic_slice, + HloInstruction* operand, + HloInstruction* start_indices) = 0; virtual Status HandleDynamicUpdateSlice(HloInstruction* dynamic_update_slice, HloInstruction* operand, HloInstruction* update, @@ -215,9 +217,7 @@ class DfsHloVisitor { const Window& window, HloComputation* function) = 0; virtual Status HandleSelectAndScatter(HloInstruction* instruction) = 0; - virtual Status HandleWhile(HloInstruction* xla_while, HloInstruction* init, - HloComputation* condition, - HloComputation* body) = 0; + virtual Status HandleWhile(HloInstruction* xla_while) = 0; virtual Status HandlePad(HloInstruction* pad) = 0; diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h index 18cfaf83e1c..6557c3aa8e6 100644 --- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h +++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h @@ -16,6 +16,7 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_DFS_HLO_VISITOR_WITH_DEFAULT_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_DFS_HLO_VISITOR_WITH_DEFAULT_H_ +#include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/types.h" @@ -121,9 +122,7 @@ class DfsHloVisitorWithDefault : public DfsHloVisitor { Status HandleFusion(HloInstruction* fusion) override { return DefaultAction(fusion); } - Status HandleCall(HloInstruction* call, - tensorflow::gtl::ArraySlice /*operands*/, - HloComputation* /*computation*/) override { + Status HandleCall(HloInstruction* call) override { return DefaultAction(call); } Status HandleCustomCall( @@ -136,10 +135,10 @@ class DfsHloVisitorWithDefault : public DfsHloVisitor { HloInstruction* /*operand*/) override { return DefaultAction(slice); } - Status HandleDynamicSlice( - HloInstruction* slice, - tensorflow::gtl::ArraySlice /*operands*/) override { - return DefaultAction(slice); + Status HandleDynamicSlice(HloInstruction* dynamic_slice, + HloInstruction* /*operand*/, + HloInstruction* /*start_indices*/) override { + return DefaultAction(dynamic_slice); } Status HandleDynamicUpdateSlice(HloInstruction* dynamic_update_slice, HloInstruction* /*operand*/, @@ -188,9 +187,7 @@ class DfsHloVisitorWithDefault : public DfsHloVisitor { Status HandleTranspose(HloInstruction* transpose) override { return DefaultAction(transpose); } - Status HandleWhile(HloInstruction* xla_while, HloInstruction* /*init*/, - HloComputation* /*condition*/, - HloComputation* /*body*/) override { + Status HandleWhile(HloInstruction* xla_while) override { return DefaultAction(xla_while); } Status HandleSend(HloInstruction* send) override { diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc index 9dd276952cc..be4aadb6522 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc @@ -195,6 +195,19 @@ StatusOr ElementalIrEmitter::EmitFloatUnaryOp( ir_builder_->CreateSelect(olt, llvm::ConstantFP::get(type, -1.0), llvm::ConstantFP::get(type, 1.0))); } + case HloOpcode::kIsFinite: { + // (x == x) && abs(x) != inf + auto type = operand_value->getType(); + auto equal_self = + ir_builder_->CreateFCmpOEQ(operand_value, operand_value); + auto abs_value = llvm_ir::EmitCallToIntrinsic( + llvm::Intrinsic::fabs, {operand_value}, {type}, ir_builder_); + auto infinity = llvm::ConstantFP::getInfinity(type); + auto not_infinite = ir_builder_->CreateFCmpONE(abs_value, infinity); + auto result_i1 = ir_builder_->CreateAnd(equal_self, not_infinite); + return ir_builder_->CreateZExt( + result_i1, llvm_ir::PrimitiveTypeToIrType(PRED, ir_builder_)); + } case HloOpcode::kNegate: return ir_builder_->CreateFNeg(operand_value); default: @@ -227,14 +240,18 @@ StatusOr ElementalIrEmitter::EmitFloatBinaryOp( return ir_builder_->CreateFDiv(lhs_value, rhs_value); case HloOpcode::kRemainder: return ir_builder_->CreateFRem(lhs_value, rhs_value); - - // The 'O' prefix on the LLVM ops means "ordered" compare where comparisons - // with NAN always return false. + // LLVM comparisons can be "unordered" (U) or "ordered" (O) -- ordered + // comparisons always return false when one of the operands is NaN, whereas + // unordered comparisons return true. + // + // We use ordered comparisons for everything except kNe, where we use an + // unordered comparison. 
This makes x != y equivalent to !(x == y), and + // matches C++'s semantics. case HloOpcode::kEq: return llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OEQ, lhs_value, rhs_value, ir_builder_); case HloOpcode::kNe: - return llvm_ir::EmitComparison(llvm::CmpInst::FCMP_ONE, lhs_value, + return llvm_ir::EmitComparison(llvm::CmpInst::FCMP_UNE, lhs_value, rhs_value, ir_builder_); case HloOpcode::kLt: return llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OLT, lhs_value, @@ -428,8 +445,8 @@ StatusOr ElementalIrEmitter::EmitIntegerBinaryOp( llvm_ir::IrArray::Index ElementalIrEmitter::ElementwiseSourceIndex( const llvm_ir::IrArray::Index& target_index, const HloInstruction& hlo, int64 operand_no) const { - CHECK(hlo.IsElementwise()) << "HLO " << hlo.ToString() - << " is not elementwise."; + CHECK(hlo.IsElementwise()) + << "HLO " << hlo.ToString() << " is not elementwise."; const Shape& operand_shape = hlo.operand(operand_no)->shape(); // If the operand is scalar, the source index is always {}. @@ -474,8 +491,9 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeRngElementGenerator( llvm::APInt(128, {0x14057B7EF767814F, 0x5851F42D4C957F2D})); auto random_value = [hlo]() { - CHECK(hlo->parent() != nullptr && hlo->parent()->parent() != nullptr); - const HloModule* module = hlo->parent()->parent(); + const HloModule* module = + hlo->IsFused() ? hlo->fusion_instruction()->parent()->parent() + : hlo->parent()->parent(); return module->RandomNew64(); }; @@ -631,6 +649,7 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator( case HloOpcode::kCopy: case HloOpcode::kExp: case HloOpcode::kFloor: + case HloOpcode::kIsFinite: case HloOpcode::kLog: case HloOpcode::kNegate: case HloOpcode::kSign: @@ -724,11 +743,11 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator( const HloInstruction* operand = hlo->operand(operand_idx); auto true_block = llvm_ir::CreateBasicBlock( exit_block, tensorflow::strings::StrCat( - "concat_index_from_operand", operand_idx), + "concat_index_from_operand", operand_idx), ir_builder_); auto false_block = llvm_ir::CreateBasicBlock( exit_block, tensorflow::strings::StrCat( - "concat_index_not_from_operand", operand_idx), + "concat_index_not_from_operand", operand_idx), ir_builder_); auto concat_dim_size = llvm::ConstantInt::get(source_index[concat_dim]->getType(), @@ -788,9 +807,20 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator( const IrArray::Index& index) -> StatusOr { IrArray::Index sliced_index(index.size()); for (int i = 0; i < index.size(); ++i) { - sliced_index[i] = ir_builder_->CreateAdd( - index[i], llvm::ConstantInt::get(index[i]->getType(), - hlo->slice_starts(i))); + int64 stride = hlo->slice_stride(i); + if (stride != 1) { + sliced_index[i] = ir_builder_->CreateAdd( + ir_builder_->CreateMul( + index[i], llvm::ConstantInt::get(index[i]->getType(), + stride)), + llvm::ConstantInt::get(index[i]->getType(), + hlo->slice_starts(i))); + } else { + sliced_index[i] = ir_builder_->CreateAdd( + index[i], + llvm::ConstantInt::get(index[i]->getType(), + hlo->slice_starts(i))); + } } return operand_to_generator.at(hlo->operand(0))(sliced_index); }; @@ -922,6 +952,68 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator( }; case HloOpcode::kRng: return MakeRngElementGenerator(hlo, operand_to_generator); + case HloOpcode::kPad: + return [=, &operand_to_generator]( + const IrArray::Index& padded_index) -> StatusOr { + auto index = padded_index; + llvm::Value* in_bounds = ir_builder_->getTrue(); + for (size_t i = 0; i < 
index.size(); ++i) { + auto index_typed_const = [=](int64 n) { + return llvm::ConstantInt::get(index[i]->getType(), n); + }; + const auto& pad_dim = hlo->padding_config().dimensions(i); + index[i] = ir_builder_->CreateSub( + index[i], index_typed_const(pad_dim.edge_padding_low())); + in_bounds = ir_builder_->CreateAnd( + in_bounds, + ir_builder_->CreateICmpSGE(index[i], index_typed_const(0)), + "in_bounds"); + in_bounds = ir_builder_->CreateAnd( + in_bounds, + ir_builder_->CreateICmpEQ( + index_typed_const(0), + ir_builder_->CreateURem( + index[i], + index_typed_const(pad_dim.interior_padding() + 1))), + "in_bounds"); + index[i] = ir_builder_->CreateSDiv( + index[i], index_typed_const(pad_dim.interior_padding() + 1)); + in_bounds = ir_builder_->CreateAnd( + in_bounds, + ir_builder_->CreateICmpSLT( + index[i], + index_typed_const(hlo->operand(0)->shape().dimensions(i))), + "in_bounds"); + } + + // if (in_bounds) { + // ret_value = operand0[index]; // source + // } else { + // ret_value = *operand1; // padding + // } + llvm::Value* ret_value_addr = llvm_ir::EmitAllocaAtFunctionEntry( + llvm_ir::PrimitiveTypeToIrType(hlo->shape().element_type(), + ir_builder_), + "pad_result_addr", ir_builder_); + llvm_ir::LlvmIfData if_data = + llvm_ir::EmitIfThenElse(in_bounds, "in_bounds", ir_builder_); + SetToFirstInsertPoint(if_data.true_block, ir_builder_); + TF_ASSIGN_OR_RETURN(llvm::Value * operand_value, + operand_to_generator.at(hlo->operand(0))(index)); + ir_builder_->CreateStore(operand_value, ret_value_addr); + + SetToFirstInsertPoint(if_data.false_block, ir_builder_); + TF_ASSIGN_OR_RETURN(llvm::Value * padding_value, + operand_to_generator.at(hlo->operand(1))({})); + ir_builder_->CreateStore(padding_value, ret_value_addr); + + SetToFirstInsertPoint(if_data.after_block, ir_builder_); + // Don't create phi(operand_value, padding_value) here, because invoking + // operand_to_generator may create new basic blocks, making the parent + // of operand_value or padding_value no longer a predecessor of + // if_data.after_block. + return ir_builder_->CreateLoad(ret_value_addr); + }; default: return [this, hlo, &operand_to_generator](const IrArray::Index& index) { return Unimplemented("%s", HloOpcodeString(hlo->opcode()).c_str()); diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc index 5b1a5a16d1f..3a9f8dc79ee 100644 --- a/tensorflow/compiler/xla/service/executable.cc +++ b/tensorflow/compiler/xla/service/executable.cc @@ -16,16 +16,47 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/executable.h" #include "tensorflow/compiler/xla/legacy_flags/service_flags.h" +#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/regexp.h" namespace xla { +/* static */ void Executable::DumpExecutedHlo( + const HloModule& module, const string& label, + const HloExecutionProfile* profile) { + VLOG(2) << "module name = " << module.name(); + legacy_flags::ServiceFlags* flags = legacy_flags::GetServiceFlags(); + string generate_hlo_graph_regex; + if (!flags->xla_generate_hlo_graph.empty()) { + generate_hlo_graph_regex = flags->xla_generate_hlo_graph; + } else { + generate_hlo_graph_regex = + module.config().debug_options().xla_generate_hlo_graph(); + } + if (!generate_hlo_graph_regex.empty() && + RE2::PartialMatch(module.name(), generate_hlo_graph_regex)) { + hlo_graph_dumper::DumpGraph(*module.entry_computation(), label, + flags->xla_hlo_graph_addresses, + flags->xla_hlo_graph_layout, profile); + } + if (!flags->xla_log_hlo_text.empty() && + RE2::PartialMatch(module.name(), flags->xla_log_hlo_text)) { + LOG(INFO) << "HLO for module " << module.name(); + LOG(INFO) << "Label: " << label; + XLA_LOG_LINES(2, module.ToString()); + } + if (!flags->xla_dump_hlo_text_to.empty()) { + hlo_graph_dumper::DumpText(module, label, flags->xla_dump_hlo_text_to); + } +} + StatusOr> Executable::ExecuteOnStreams( - tensorflow::gtl::ArraySlice run_options, + tensorflow::gtl::ArraySlice run_options, tensorflow::gtl::ArraySlice< tensorflow::gtl::ArraySlice> arguments) { @@ -40,7 +71,7 @@ Executable::ExecuteOnStreams( std::vector return_values( run_options.size()); - for (int64 i = 0; i < run_options.size(); ++i) { + for (size_t i = 0; i < run_options.size(); ++i) { // We cannot BlockHostUntilDone() on the already-launched executions in case // of error, since if the executions communicate, the initially launched // executions may never complete if not all executions are running. @@ -68,13 +99,23 @@ Status Executable::DumpSessionModule() { *session_module_); } +// Removes illegal characters from filenames. +static void SanitizeFilename(string* name) { + for (char& c : *name) { + if (c == '/' || c == '\\' || c == '[' || c == ']') { + c = '_'; + } + } +} + /* static */ Status Executable::DumpToDirectory( - const string& directory_path, const string& filename, + const string& directory_path, string filename, const SessionModule& session_module) { tensorflow::Env* env = tensorflow::Env::Default(); if (!env->IsDirectory(directory_path).ok()) { TF_RETURN_IF_ERROR(env->CreateDir(directory_path)); } + SanitizeFilename(&filename); string file_path = tensorflow::io::JoinPath(directory_path, filename); return tensorflow::WriteBinaryProto(env, file_path, session_module); } diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h index ac478afabc2..291916cd9f7 100644 --- a/tensorflow/compiler/xla/service/executable.h +++ b/tensorflow/compiler/xla/service/executable.h @@ -19,16 +19,18 @@ limitations under the License. 
#include #include -#include "tensorflow/compiler/xla/executable_run_options.h" +#include "tensorflow/compiler/xla/legacy_flags/service_flags.h" #include "tensorflow/compiler/xla/service/computation_layout.h" #include "tensorflow/compiler/xla/service/device_memory_allocator.h" +#include "tensorflow/compiler/xla/service/hlo_cost_analysis.h" #include "tensorflow/compiler/xla/service/hlo_execution_profile.h" #include "tensorflow/compiler/xla/service/hlo_module.h" -#include "tensorflow/compiler/xla/service/hlo_module_config.h" +#include "tensorflow/compiler/xla/service/service_executable_run_options.h" #include "tensorflow/compiler/xla/service/session.pb.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" #include "tensorflow/compiler/xla/service/versioned_computation_handle.h" #include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/mutex.h" @@ -39,17 +41,18 @@ namespace xla { // A given platform's compiler will produce an Executable -- this is a uniform // interface that is used for launching compiled programs across platforms. -// -// TODO(leary) will need to extend this to support multiple streams/devices as -// we begin to compile single programs to run on multiple devices. class Executable { public: explicit Executable(std::unique_ptr hlo_module, - std::unique_ptr module_config) + HloCostAnalysis::ShapeSizeFunction shape_size_function) : hlo_module_(std::move(hlo_module)), - module_config_(std::move(module_config)) {} + shape_size_function_(std::move(shape_size_function)) {} virtual ~Executable() {} + // Dumps the executed HLO according to service-associated flags. + static void DumpExecutedHlo(const HloModule& module, const string& label, + const HloExecutionProfile* profile); + // Enqueues the compilation result on the provided stream, passing the given // arguments. This call is blocking and returns after the execution is done. // @@ -59,7 +62,7 @@ class Executable { // Returns the device memory region that a successful execution would // populate. virtual StatusOr ExecuteOnStream( - const ExecutableRunOptions* run_options, + const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments, HloExecutionProfile* hlo_execution_profile) = 0; @@ -67,22 +70,14 @@ class Executable { // Overload of ExecuteOnStream which returns and takes arguments as // ShapedBuffers. Used for LocalService execution. virtual StatusOr> ExecuteOnStream( - const ExecutableRunOptions* run_options, + const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments, HloExecutionProfile* hlo_execution_profile) = 0; - // Overload of which writes the result into a pre-allocated buffer - // (result_buffer). - virtual Status ExecuteOnStream( - const ExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice arguments, - ShapedBuffer* result_buffer, - HloExecutionProfile* hlo_execution_profile) = 0; - // Same as ExecuteOnStream(), but this call is non-blocking and returns as // soon as all of the operations are enqueued for launch on the stream. virtual StatusOr ExecuteAsyncOnStream( - const ExecutableRunOptions* run_options, + const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments) = 0; @@ -92,11 +87,22 @@ class Executable { // returned vector. 
virtual StatusOr> ExecuteOnStreams( - tensorflow::gtl::ArraySlice run_options, + tensorflow::gtl::ArraySlice + run_options, tensorflow::gtl::ArraySlice< tensorflow::gtl::ArraySlice> arguments); + // Convenience wrapper for calling Executable::ExecuteOnStream. Sets up a + // timer for the execution, sets up HLO profiling if enabled, and fills in the + // given ExecutionProfile if non-null. The ExecuteOnStream overloads have + // different argument types and return types, so this method is templated on + // argument type and return type of the execute function. + template + StatusOr ExecuteOnStreamWrapper( + const ServiceExecutableRunOptions* run_options, ExecutionProfile* profile, + const ArgT& arguments); + // Returns the ExecutionProfile from executing on the device. This includes // the number of cycles taken for the computation or the compilation time. ExecutionProfile execution_profile() const { @@ -108,15 +114,14 @@ class Executable { // enabled. If not, the caller should not expect an hlo_execution_profile // passed to ExecuteOnStream above to be populated during execution. bool hlo_profiling_enabled() const { - return module_config_->hlo_profiling_enabled(); + return hlo_module_->config().hlo_profiling_enabled(); } const HloModule& module() const { return *hlo_module_; } - const HloModuleConfig& module_config() const { return *module_config_; } + const bool has_module() const { return hlo_module_ != nullptr; } - // Returns whether this executable has an associated HloModuleConfig. - bool has_module_config() const { return module_config_ != nullptr; } + const HloModuleConfig& module_config() const { return hlo_module_->config(); } // Returns the versioned computation handle of the computation computed by // this executable. @@ -127,7 +132,7 @@ class Executable { // The shape (including layout) that results from this execution. This is the // shape of the DeviceMemoryBase result value in ExecuteOnStream above. const Shape& result_shape() const { - return module_config_->entry_computation_layout().result_shape(); + return hlo_module_->config().entry_computation_layout().result_shape(); } // Dumping helpers. @@ -139,10 +144,14 @@ class Executable { Status DumpSessionModule(); // Dump session_module to directory_path/filename. - static Status DumpToDirectory(const string& directory_path, - const string& filename, + static Status DumpToDirectory(const string& directory_path, string filename, const SessionModule& session_module); + // Return a reference to a function that computes the size of a given Shape. + const HloCostAnalysis::ShapeSizeFunction& shape_size_function() const { + return shape_size_function_; + } + protected: mutable tensorflow::mutex mutex_; @@ -154,9 +163,10 @@ class Executable { // around. std::unique_ptr hlo_module_; - // The configuration used to build this executable (parameter layouts, result - // layout, profiling enabled, etc). - std::unique_ptr module_config_; + // Function to compute the size of a given Shape, in bytes. This is + // provided to the Executable when it is constructed, and used to produce + // data for profiling the execution. + HloCostAnalysis::ShapeSizeFunction shape_size_function_; // SessionModule this was compiled from. Null if not dumping executions. 
std::unique_ptr session_module_; @@ -166,6 +176,76 @@ class Executable { int64 execution_count_ = 0; }; +template +StatusOr Executable::ExecuteOnStreamWrapper( + const ServiceExecutableRunOptions* run_options, ExecutionProfile* profile, + const ArgT& arguments) { + perftools::gputools::Stream* stream = run_options->stream(); + std::unique_ptr timer; + if (profile != nullptr) { + timer.reset(new perftools::gputools::Timer(stream->parent())); + stream->InitTimer(timer.get()).ThenStartTimer(timer.get()); + } + + VLOG(1) << "enqueueing executable on stream..."; + // If the profiling flag isn't enabled, we pass nullptr as the profile to + // indicate profiling is not requested. + HloExecutionProfile hlo_execution_profile; + legacy_flags::ServiceFlags* flags = legacy_flags::GetServiceFlags(); + HloExecutionProfile* profile_ptr = + flags->xla_hlo_profile && hlo_profiling_enabled() ? &hlo_execution_profile + : nullptr; + + auto return_value = ExecuteOnStream(run_options, arguments, profile_ptr); + + if (profile != nullptr) { + VLOG(1) << "enqueueing 'stop timer' and blocking host until done..."; + stream->ThenStopTimer(timer.get()).BlockHostUntilDone(); + VLOG(1) << "done with block-host-until-done"; + + // Merge in run-time profile information from execution_profile. + profile->MergeFrom(execution_profile()); + + // Overall execution time (in nanoseconds) from the executor timer. + profile->set_compute_and_transfer_time_ns(timer->Nanoseconds()); + + // TODO(b/28123297): On GPU we end up including transfer time in + // the compute time this way. Instead, we should get the correct + // value by measuring it. Setting the field here at least lets + // benchmarks provide *some* value for GPU computations. + // + // TODO(b/28447609): The value in compute_and_transfer_time_ns is actually + // the compute time without the transfer time, so this way we get the + // correct compute time. We should instead have the correct value for + // compute_and_transfer_time and set compute_time to the compute time. + if (profile->compute_time_ns() == 0) { + profile->set_compute_time_ns(profile->compute_and_transfer_time_ns()); + } + } + + if (profile_ptr != nullptr) { + std::unordered_set profiled_computations = + profile_ptr->profiled_computations(); + // To ensure we have print the profiles in a stable order, iterate over the + // computations in post order. + std::list all_computations = + module().MakeComputationPostOrder(); + for (xla::HloComputation* computation : all_computations) { + if (profiled_computations.count(computation) > 0) { + string profile_string = profile_ptr->ToString( + *computation, stream->parent()->GetDeviceDescription(), + shape_size_function_); + if (!profile_string.empty()) { + XLA_LOG_LINES(tensorflow::INFO, profile_string); + } + } + } + DumpExecutedHlo(module(), "Service::Execute", profile_ptr); + } + + return return_value; +} + } // namespace xla #endif // TENSORFLOW_COMPILER_XLA_SERVICE_EXECUTABLE_H_ diff --git a/tensorflow/compiler/xla/service/execution_tracker.cc b/tensorflow/compiler/xla/service/execution_tracker.cc index cf1870580c4..c225e62e3e1 100644 --- a/tensorflow/compiler/xla/service/execution_tracker.cc +++ b/tensorflow/compiler/xla/service/execution_tracker.cc @@ -24,25 +24,19 @@ limitations under the License. 
namespace xla { -AsyncExecution::AsyncExecution( - Backend* backend, - std::vector> streams, - const ExecutionProfile& profile, GlobalDataHandle result) +AsyncExecution::AsyncExecution(Backend* backend, + std::vector streams, + const ExecutionProfile& profile, + GlobalDataHandle result) : backend_(CHECK_NOTNULL(backend)), streams_(std::move(streams)), profile_(profile), - result_(result) { + result_(std::move(result)) { for (const auto& stream : streams_) { CHECK(stream != nullptr); } } -AsyncExecution::~AsyncExecution() { - for (auto& stream : streams_) { - backend_->ReleaseStream(std::move(stream)); - } -} - tensorflow::Status AsyncExecution::BlockUntilDone() const { for (auto& stream : streams_) { if (!stream->BlockHostUntilDone()) { @@ -55,8 +49,7 @@ tensorflow::Status AsyncExecution::BlockUntilDone() const { ExecutionTracker::ExecutionTracker() : next_handle_(1) {} ExecutionHandle ExecutionTracker::Register( - Backend* backend, - std::vector> streams, + Backend* backend, std::vector streams, const ExecutionProfile& profile, GlobalDataHandle result) { tensorflow::mutex_lock lock(execution_mutex_); int64 handle = next_handle_++; diff --git a/tensorflow/compiler/xla/service/execution_tracker.h b/tensorflow/compiler/xla/service/execution_tracker.h index 99a5bb5ad99..5b6bddf9f16 100644 --- a/tensorflow/compiler/xla/service/execution_tracker.h +++ b/tensorflow/compiler/xla/service/execution_tracker.h @@ -22,6 +22,7 @@ limitations under the License. #include "tensorflow/compiler/xla/executable_run_options.h" #include "tensorflow/compiler/xla/service/backend.h" +#include "tensorflow/compiler/xla/service/pool.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -39,12 +40,9 @@ namespace xla { // the stream when destructed. class AsyncExecution { public: - AsyncExecution( - Backend* backend, - std::vector> streams, - const ExecutionProfile& profile, GlobalDataHandle result); + AsyncExecution(Backend* backend, std::vector streams, + const ExecutionProfile& profile, GlobalDataHandle result); - ~AsyncExecution(); tensorflow::Status BlockUntilDone() const; const GlobalDataHandle& result() const { return result_; } @@ -56,7 +54,7 @@ class AsyncExecution { Backend* backend_; // Stream on which the execution is launched. - std::vector> streams_; + std::vector streams_; // Profile object of the execution to be returned to the user. ExecutionProfile profile_; @@ -73,10 +71,10 @@ class ExecutionTracker { // Registers an execution with its backend, streams, and data handle to the // execution result. Returns a handle for the registered execution. - ExecutionHandle Register( - Backend* backend, - std::vector> stream, - const ExecutionProfile& profile, GlobalDataHandle data); + ExecutionHandle Register(Backend* backend, + std::vector stream, + const ExecutionProfile& profile, + GlobalDataHandle data); // Unregisters the execution for the given handle. tensorflow::Status Unregister(const ExecutionHandle& handle); diff --git a/tensorflow/compiler/xla/service/flatten_call_graph.cc b/tensorflow/compiler/xla/service/flatten_call_graph.cc new file mode 100644 index 00000000000..297a4f7599f --- /dev/null +++ b/tensorflow/compiler/xla/service/flatten_call_graph.cc @@ -0,0 +1,112 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/flatten_call_graph.h" + +#include "tensorflow/compiler/xla/service/call_graph.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/lib/core/errors.h" + +namespace xla { + +namespace { + +// Helper to replace the called computation at a while- or call-instruction. +void ReplaceCalledComputation(HloInstruction* instruction, + HloComputation* computation, + HloComputation* new_computation) { + switch (instruction->opcode()) { + case HloOpcode::kWhile: { + if (computation == instruction->while_condition()) { + instruction->set_while_condition(new_computation); + } else { + CHECK_EQ(computation, instruction->while_body()); + instruction->set_while_body(new_computation); + } + break; + } + case HloOpcode::kCall: { + CHECK_EQ(instruction->to_apply(), computation); + instruction->set_to_apply(new_computation); + break; + } + default: + LOG(FATAL) << "unexpected opcode: " + << HloOpcodeString(instruction->opcode()); + } +} + +// Flatten a single call graph node. Expects to visit nodes in postorder. +Status FlattenNode(const CallGraphNode& node) { + HloComputation* computation = node.computation(); + HloModule* module = computation->parent(); + // Clone callee for all call-sites except the first one. + for (int i = 0; i < node.caller_callsites().size(); ++i) { + CallSite call_site = node.caller_callsites()[i]; + // Only consider sequential call contexts. + if (call_site.context() == CallContext::kParallel) { + continue; + } + CHECK_EQ(call_site.context(), CallContext::kSequential); + + // Skip first element if this computation is only called from a sequential + // context. + if (node.context() != CallContext::kBoth && i == 0) { + continue; + } + + // Clone computation for the remaining sequential context call sites. + HloComputation* clone = + module->AddEmbeddedComputation(computation->Clone()); + ReplaceCalledComputation(call_site.instruction(), computation, clone); + // Clone the sub-tree of all computations called from this node. 
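+    // The clone is walked with an explicit worklist: every freshly cloned
+    // computation is pushed, and each of its sequentially called computations
+    // is cloned in turn, so the cloned sub-tree shares nothing with the
+    // original tree.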
+    std::vector<HloComputation*> worklist;
+    worklist.push_back(clone);
+    while (!worklist.empty()) {
+      auto current = worklist.back();
+      worklist.pop_back();
+      for (auto& instruction : current->instructions()) {
+        if (GetInstructionCallContext(instruction.get()) !=
+            CallContext::kSequential) {
+          continue;
+        }
+        for (auto callee : instruction->called_computations()) {
+          HloComputation* callee_clone =
+              module->AddEmbeddedComputation(callee->Clone());
+          ReplaceCalledComputation(instruction.get(), callee, callee_clone);
+          worklist.push_back(callee_clone);
+        }
+      }
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
+StatusOr<bool> FlattenCallGraph::Run(HloModule* module) {
+  XLA_VLOG_LINES(3, "Before flatten call graph:\n" + module->ToString());
+
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
+  TF_RETURN_IF_ERROR(call_graph->VisitNodes(FlattenNode));
+
+  XLA_VLOG_LINES(3, "After flatten call graph:\n" + module->ToString());
+  return true;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/flatten_call_graph.h b/tensorflow/compiler/xla/service/flatten_call_graph.h
new file mode 100644
index 00000000000..d3efab36149
--- /dev/null
+++ b/tensorflow/compiler/xla/service/flatten_call_graph.h
@@ -0,0 +1,40 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Flatten the call graph for an HLO module into a tree.
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_FLATTEN_CALL_GRAPH_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_FLATTEN_CALL_GRAPH_H_
+
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/compiler/xla/statusor.h"
+
+namespace xla {
+
+// Flattening associates each call site with a unique computation (for
+// sequential calling contexts). This simplifies buffer assignment and
+// points-to analysis (see b/36865746 for details).
+class FlattenCallGraph : public HloPassInterface {
+ public:
+  tensorflow::StringPiece name() const override { return "flatten-call-graph"; }
+
+  // Duplicates computations called from multiple call- or while-nodes to
+  // flatten the call graph.
+  StatusOr<bool> Run(HloModule* module) override;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_FLATTEN_CALL_GRAPH_H_
diff --git a/tensorflow/compiler/xla/service/flatten_call_graph_test.cc b/tensorflow/compiler/xla/service/flatten_call_graph_test.cc
new file mode 100644
index 00000000000..bb4712c86f6
--- /dev/null
+++ b/tensorflow/compiler/xla/service/flatten_call_graph_test.cc
@@ -0,0 +1,231 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/flatten_call_graph.h"
+
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/call_graph.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+namespace {
+
+class FlattenCallGraphTest : public HloTestBase {
+ protected:
+  // Build and return a trivial computation taking and returning a scalar.
+  std::unique_ptr<HloComputation> MakeScalarComputation() {
+    HloComputation::Builder builder(TestName() + ".ScalarComputation");
+    HloInstruction* param0 = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, kScalarShape, "param0"));
+    builder.AddInstruction(
+        HloInstruction::CreateUnary(kScalarShape, HloOpcode::kNegate, param0));
+    return builder.Build();
+  }
+
+  // Build and return a computation which takes a scalar and maps (kMap) the
+  // given computation to the value 'callsites' number of times.
+  std::unique_ptr<HloComputation> MakeMappingComputation(
+      HloComputation* map_computation, int64 callsites) {
+    HloComputation::Builder builder(TestName() + ".MappingComputation");
+    HloInstruction* param0 = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, kScalarShape, "param0"));
+    HloInstruction* last_value = param0;
+    for (int64 i = 0; i < callsites; ++i) {
+      last_value = builder.AddInstruction(HloInstruction::CreateMap(
+          kScalarShape, {last_value}, map_computation));
+    }
+    return builder.Build();
+  }
+
+  // Build and return a computation which takes a scalar and calls (kCall) the
+  // given computation with value 'callsites' number of times.
+  std::unique_ptr<HloComputation> MakeCallingComputation(
+      HloComputation* callee_computation, int64 callsites,
+      const string& suffix = ".CallingComputation") {
+    HloComputation::Builder builder(TestName() + suffix);
+    HloInstruction* param0 = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, kScalarShape, "param0"));
+    HloInstruction* last_value = param0;
+    for (int64 i = 0; i < callsites; ++i) {
+      last_value = builder.AddInstruction(HloInstruction::CreateCall(
+          kScalarShape, {last_value}, callee_computation));
+    }
+    return builder.Build();
+  }
+
+  // Build and return a computation which takes a scalar and returns a PRED
+  // value.
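+  // (The PRED result lets the tests below use this computation as a
+  // while-loop condition.)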
+  std::unique_ptr<HloComputation> MakeConditionComputation() {
+    HloComputation::Builder builder(TestName() + ".ConditionComputation");
+    HloInstruction* param0 = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, kScalarShape, "param0"));
+    HloInstruction* zero = builder.AddInstruction(
+        HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0f)));
+    builder.AddInstruction(HloInstruction::CreateBinary(
+        ShapeUtil::MakeShape(PRED, {}), HloOpcode::kGt, param0, zero));
+    return builder.Build();
+  }
+
+  StatusOr<bool> RunFlattenCallGraph(HloModule* module) {
+    FlattenCallGraph flatten;
+    TF_ASSIGN_OR_RETURN(bool result, flatten.Run(module));
+    return result;
+  }
+
+  const Shape kScalarShape = ShapeUtil::MakeShape(F32, {});
+};
+
+TEST_F(FlattenCallGraphTest, ComplexGraph) {
+  // Test a call graph of a module with several computations called in various
+  // contexts. The call graph looks like:
+  //
+  //      entry
+  //      /  |
+  //     a   |
+  //   / | \ |
+  //  b  |  cond
+  //   \ |
+  //    c
+  //
+  // Calls are made via kCall, kWhile, and kMap instructions.
+  auto module = CreateNewModule();
+  HloComputation* cond_computation =
+      module->AddEmbeddedComputation(MakeConditionComputation());
+  HloComputation* c_computation =
+      module->AddEmbeddedComputation(MakeScalarComputation());
+  HloComputation* b_computation = module->AddEmbeddedComputation(
+      MakeMappingComputation(c_computation, /*callsites=*/1));
+
+  HloComputation* a_computation;
+  {
+    HloComputation::Builder builder(TestName() + ".a");
+    HloInstruction* param0 = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, kScalarShape, "param0"));
+    HloInstruction* call = builder.AddInstruction(
+        HloInstruction::CreateCall(kScalarShape, {param0}, c_computation));
+    builder.AddInstruction(HloInstruction::CreateWhile(
+        kScalarShape, cond_computation, b_computation, call));
+    a_computation = module->AddEmbeddedComputation(builder.Build());
+  }
+
+  HloComputation* entry_computation;
+  {
+    HloComputation::Builder builder(TestName() + ".entry");
+    HloInstruction* param0 = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, kScalarShape, "param0"));
+    builder.AddInstruction(HloInstruction::CreateWhile(
+        kScalarShape, cond_computation, a_computation, param0));
+    entry_computation = module->AddEntryComputation(builder.Build());
+  }
+
+  {
+    TF_ASSIGN_OR_ASSERT_OK(bool result, RunFlattenCallGraph(module.get()));
+    EXPECT_TRUE(result);
+    std::unique_ptr<CallGraph> flat_call_graph = CallGraph::Build(module.get());
+    const CallGraphNode& c_node = flat_call_graph->GetNode(c_computation);
+    EXPECT_EQ(1, c_node.caller_callsites().size());
+  }
+}
+
+// Test corner case of a computation used as a body and a loop condition.
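+// Flattening must clone the shared computation so that the while condition
+// and the while body end up as two distinct computations.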
+TEST_F(FlattenCallGraphTest, SharedWhileConditionAndBody) {
+  auto module = CreateNewModule();
+  HloComputation* cond_computation;
+  {
+    HloComputation::Builder builder(TestName() + ".cond");
+    HloInstruction* param0 =
+        builder.AddInstruction(HloInstruction::CreateParameter(
+            0, ShapeUtil::MakeShape(PRED, {}), "param0"));
+    HloInstruction* false_constant = builder.AddInstruction(
+        HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+    builder.AddInstruction(
+        HloInstruction::CreateBinary(ShapeUtil::MakeShape(PRED, {}),
+                                     HloOpcode::kEq, param0, false_constant));
+    cond_computation = module->AddEmbeddedComputation(builder.Build());
+  }
+
+  HloComputation* entry_computation;
+  {
+    HloComputation::Builder builder(TestName() + ".entry");
+    HloInstruction* false_constant = builder.AddInstruction(
+        HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+    builder.AddInstruction(HloInstruction::CreateWhile(
+        ShapeUtil::MakeShape(PRED, {}), cond_computation, cond_computation,
+        false_constant));
+    entry_computation = module->AddEntryComputation(builder.Build());
+  }
+
+  {
+    std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
+    const CallGraphNode& cond_node = call_graph->GetNode(cond_computation);
+    EXPECT_EQ(2, cond_node.caller_callsites().size());
+  }
+
+  {
+    TF_ASSIGN_OR_ASSERT_OK(bool result, RunFlattenCallGraph(module.get()));
+    EXPECT_TRUE(result);
+    std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
+    const CallGraphNode& cond_node = call_graph->GetNode(cond_computation);
+    EXPECT_EQ(1, cond_node.caller_callsites().size());
+  }
+}
+
+// Test flattening of nested calling computations.
+//
+//  Entry
+//   / \
+//   \ /
+//    B
+//   / \
+//   \ /
+//    C
+//
+TEST_F(FlattenCallGraphTest, FlattenCalls) {
+  auto module = CreateNewModule();
+  HloComputation* c_computation =
+      module->AddEmbeddedComputation(MakeScalarComputation());
+
+  HloComputation* b_computation = module->AddEmbeddedComputation(
+      MakeCallingComputation(c_computation, /*callsites=*/2, ".B"));
+
+  module->AddEntryComputation(
+      MakeCallingComputation(b_computation, /*callsites=*/2, ".Entry"));
+
+  TF_ASSIGN_OR_ASSERT_OK(bool result, RunFlattenCallGraph(module.get()));
+  EXPECT_TRUE(result);
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
+  EXPECT_EQ(7, module->computations().size());
+
+  const CallGraphNode& c_node = call_graph->GetNode(c_computation);
+  EXPECT_EQ(1, c_node.caller_callsites().size());
+
+  const CallGraphNode& b_node = call_graph->GetNode(b_computation);
+  EXPECT_EQ(1, b_node.caller_callsites().size());
+}
+
+}  // namespace
+}  // namespace xla
+
+int main(int argc, char** argv) {
+  return xla::ParseDebugOptionsFlagsAndRunTests(argc, argv);
+}
diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.cc b/tensorflow/compiler/xla/service/generic_transfer_manager.cc
index 8f39ba8b1d2..eb8b93330fb 100644
--- a/tensorflow/compiler/xla/service/generic_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/generic_transfer_manager.cc
@@ -118,10 +118,10 @@ GenericTransferManager::ShallowCopyTupleFromDevice(
  // Create a DeviceMemoryBase from each void* pointer.
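  // (A null element pointer is legal only for a tuple element with zero
  // elements; any other null trips the precondition check below.)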
  std::vector<se::DeviceMemoryBase> destination;
-  for (int i = 0; i < element_pointers.size(); ++i) {
+  for (size_t i = 0; i < element_pointers.size(); ++i) {
    if (element_pointers[i] == nullptr &&
        !ShapeUtil::HasZeroElements(shape.tuple_shapes(i))) {
-      return FailedPrecondition("tuple contains nullptr at element %d", i);
+      return FailedPrecondition("tuple contains nullptr at element %lu", i);
    }
    int64 buffer_size = ShapeUtil::ByteSizeOf(shape.tuple_shapes(i),
                                              /*pointer_size=*/sizeof(void*));
@@ -162,6 +162,12 @@ Status GenericTransferManager::TransferLiteralToInfeed(
  return Unimplemented("Infeed is not supported on GPU (b/30467474)");
}

+Status GenericTransferManager::TransferLiteralFromOutfeed(
+    perftools::gputools::StreamExecutor* executor, const Shape& literal_shape,
+    Literal* literal) {
+  return Unimplemented("Outfeed is not supported on CPU/GPU (b/30467474)");
+}
+
Status GenericTransferManager::ResetDevices(
    tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
        executors) {
@@ -174,14 +180,3 @@ int64 GenericTransferManager::GetByteSizeRequirement(const Shape& shape) {
}

}  // namespace xla
-
-static xla::TransferManager* CreateGenericTransferManager() {
-  return new xla::GenericTransferManager(se::cuda::kCudaPlatformId);
-}
-
-static bool InitModule() {
-  xla::TransferManager::RegisterTransferManager(se::cuda::kCudaPlatformId,
-                                                CreateGenericTransferManager);
-  return true;
-}
-static bool module_initialized = InitModule();
diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.h b/tensorflow/compiler/xla/service/generic_transfer_manager.h
index 06819d65c70..2fbdb94f06f 100644
--- a/tensorflow/compiler/xla/service/generic_transfer_manager.h
+++ b/tensorflow/compiler/xla/service/generic_transfer_manager.h
@@ -55,6 +55,10 @@ class GenericTransferManager : public TransferManager {
  Status TransferLiteralToInfeed(perftools::gputools::StreamExecutor* executor,
                                 const Literal& literal) override;

+  Status TransferLiteralFromOutfeed(
+      perftools::gputools::StreamExecutor* executor, const Shape& literal_shape,
+      Literal* literal) override;
+
  Status ResetDevices(
      tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
          executors) override;
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index c1abf2237bd..86986934117 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -76,6 +76,7 @@ cc_library(

cc_test(
    name = "stream_assignment_test",
+    size = "small",
    srcs = [
        "stream_assignment_test.cc",
    ],
@@ -86,7 +87,6 @@
        "//tensorflow/compiler/xla/service:hlo",
        "//tensorflow/compiler/xla/tests:hlo_test_base",
        "//tensorflow/core:lib",
-        "//tensorflow/core:test_main",
    ],
)

@@ -96,7 +96,6 @@ cc_library(
    srcs = ["hlo_to_ir_bindings.cc"],
    hdrs = ["hlo_to_ir_bindings.h"],
    deps = [
        ":ir_emission_utils",
-        ":temp_buffer_offsets",
        "//tensorflow/compiler/xla:util",
        "//tensorflow/compiler/xla/service:buffer_assignment",
        "//tensorflow/compiler/xla/service:hlo",
@@ -127,7 +126,6 @@
        ":ir_emission_utils",
        ":parallel_loop_emitter",
        ":partition_assignment",
-        ":temp_buffer_offsets",
        ":while_transformer",
        "//tensorflow/compiler/xla:literal_util",
        "//tensorflow/compiler/xla:shape_util",
@@ -197,23 +195,11 @@
    ],
)

-cc_library(
-    name = "temp_buffer_offsets",
-    srcs = ["temp_buffer_offsets.cc"],
-    hdrs = ["temp_buffer_offsets.h"],
-    deps = [
-        "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla/service:buffer_assignment",
-        "//tensorflow/core:lib",
-    ],
-)
-
cc_library(
    name = "buffer_allocations",
    srcs = ["buffer_allocations.cc"],
    hdrs = ["buffer_allocations.h"],
    deps = [
-
":temp_buffer_offsets", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", @@ -233,6 +219,7 @@ cc_library( "for_thunk.cc", "gemm_thunk.cc", "gpu_executable.cc", + "infeed_thunk.cc", "kernel_thunk.cc", "sequential_thunk.cc", "thunk_schedule.cc", @@ -245,6 +232,7 @@ cc_library( "for_thunk.h", "gemm_thunk.h", "gpu_executable.h", + "infeed_thunk.h", "kernel_thunk.h", "sequential_thunk.h", "thunk.h", @@ -254,9 +242,9 @@ cc_library( ], deps = [ ":buffer_allocations", + ":infeed_manager", ":partition_assignment", ":stream_assignment", - ":temp_buffer_offsets", "//tensorflow/compiler/xla:array2d", "//tensorflow/compiler/xla:shape_tree", "//tensorflow/compiler/xla:shape_util", @@ -271,13 +259,14 @@ cc_library( "//tensorflow/compiler/xla/service:executable", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_execution_profile", - "//tensorflow/compiler/xla/service:hlo_module_config", "//tensorflow/compiler/xla/service:logical_buffer", "//tensorflow/compiler/xla/service:shaped_buffer", "//tensorflow/compiler/xla/service:transfer_manager", "//tensorflow/compiler/xla/service:tuple_points_to_analysis", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform/default/build_config:cublas_plugin", + "//tensorflow/core/platform/default/build_config:cudnn_plugin", "//tensorflow/core/platform/default/build_config:stream_executor_cuda", ], ) @@ -316,6 +305,7 @@ cc_library( cc_test( name = "convolution_folding_test", + size = "small", srcs = ["convolution_folding_test.cc"], deps = [ ":convolution_folding", @@ -324,7 +314,6 @@ cc_test( "//tensorflow/compiler/xla/service:shape_inference", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/core:test", - "//tensorflow/core:test_main", ], ) @@ -342,12 +331,11 @@ cc_library( cc_test( name = "instruction_fusion_test", + size = "small", srcs = ["instruction_fusion_test.cc"], deps = [ ":instruction_fusion", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/core:test", - "//tensorflow/core:test_main", ], ) @@ -370,31 +358,26 @@ cc_library( srcs = ["fusion_merger.cc"], hdrs = ["fusion_merger.h"], deps = [ + ":instruction_fusion", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_cost_analysis", "//tensorflow/compiler/xla/service:hlo_pass", - "//tensorflow/compiler/xla/service:instruction_fusion", "//tensorflow/core:lib", ], ) cc_test( name = "fusion_merger_test", + size = "small", srcs = ["fusion_merger_test.cc"], deps = [ ":fusion_merger", ":instruction_fusion", - "//tensorflow/compiler/xla:literal_util", - "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test_helpers", - "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/core:test_main", ], ) @@ -430,7 +413,7 @@ cc_library( ":pad_insertion", ":partition_assignment", ":stream_assignment", - ":temp_buffer_offsets", + "//tensorflow/compiler/xla:protobuf_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", @@ -441,13 +424,17 @@ cc_library( "//tensorflow/compiler/xla/service:buffer_liveness", "//tensorflow/compiler/xla/service:compiler", "//tensorflow/compiler/xla/service:executable", + 
"//tensorflow/compiler/xla/service:flatten_call_graph", "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_constant_folding", "//tensorflow/compiler/xla/service:hlo_cse", "//tensorflow/compiler/xla/service:hlo_dce", - "//tensorflow/compiler/xla/service:hlo_module_config", "//tensorflow/compiler/xla/service:hlo_pass", "//tensorflow/compiler/xla/service:hlo_pass_pipeline", + "//tensorflow/compiler/xla/service:hlo_proto", + "//tensorflow/compiler/xla/service:hlo_proto_util", "//tensorflow/compiler/xla/service:hlo_subcomputation_unification", + "//tensorflow/compiler/xla/service:hlo_verifier", "//tensorflow/compiler/xla/service:reshape_mover", "//tensorflow/compiler/xla/service:transpose_folding", "//tensorflow/compiler/xla/service/gpu/llvm_gpu_backend", @@ -461,6 +448,18 @@ cc_library( alwayslink = True, # Contains compiler registration ) +cc_library( + name = "infeed_manager", + srcs = ["infeed_manager.cc"], + hdrs = ["infeed_manager.h"], + deps = [ + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", + "//tensorflow/core:lib", + "//tensorflow/core:stream_executor_no_cuda", + ], +) + cc_library( name = "layout_assignment", srcs = ["layout_assignment.cc"], @@ -479,6 +478,7 @@ cc_library( cc_test( name = "layout_assignment_test", + size = "small", srcs = ["layout_assignment_test.cc"], deps = [ ":layout_assignment", @@ -488,7 +488,6 @@ cc_test( "//tensorflow/compiler/xla/service:computation_layout", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/core:test_main", ], ) @@ -508,6 +507,7 @@ cc_library( cc_test( name = "hlo_schedule_test", + size = "small", srcs = [ "hlo_schedule_test.cc", ], @@ -518,7 +518,6 @@ cc_test( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/core:test_main", ], ) @@ -539,19 +538,15 @@ cc_library( cc_test( name = "while_transformer_test", + size = "small", srcs = ["while_transformer_test.cc"], deps = [ ":instruction_fusion", ":while_transformer", - "//tensorflow/compiler/xla:literal_util", - "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla/service:copy_insertion", - "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/core:lib", - "//tensorflow/core:test", - "//tensorflow/core:test_main", ], ) diff --git a/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc b/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc index a9975de3f17..9fdf717b5d4 100644 --- a/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc +++ b/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc @@ -38,28 +38,12 @@ void BufferAllocations::Builder::RegisterBuffer(BufferAllocation::Index index, } StatusOr> BufferAllocations::Builder::Build( - const BufferAssignment& buffer_assignment, - const TempBufferOffsets& temp_buffer_offsets, int device_ordinal, + const BufferAssignment& buffer_assignment, int device_ordinal, DeviceMemoryAllocator* memory_allocator) { - se::DeviceMemoryBase temp_buffer_base; - if (temp_buffer_offsets.TotalSizeInBytes() > 0) { - TF_ASSIGN_OR_RETURN( - temp_buffer_base, - memory_allocator->Allocate(device_ordinal, - temp_buffer_offsets.TotalSizeInBytes())); - if (temp_buffer_base == nullptr) { - return ResourceExhausted( - "Out of memory when allocating %s bytes for temporary buffers.", - 
-          tensorflow::strings::HumanReadableNumBytes(
-              temp_buffer_offsets.TotalSizeInBytes())
-              .c_str());
-    }
-  }
-  auto buffer_allocations = WrapUnique(new BufferAllocations(
-      buffer_assignment.Allocations().size(), temp_buffer_base, device_ordinal,
-      memory_allocator));
+  const int64 num_buffers = buffer_assignment.Allocations().size();
+  auto buffer_allocations = WrapUnique(
+      new BufferAllocations(num_buffers, device_ordinal, memory_allocator));

-  int64 num_buffers = buffer_assignment.Allocations().size();
  for (BufferAllocation::Index i = 0; i < num_buffers; ++i) {
    // If buffer #i's address is already registered (e.g. external arguments or
    // result buffers), use that registered buffer.
@@ -68,13 +52,13 @@ StatusOr<std::unique_ptr<BufferAllocations>> BufferAllocations::Builder::Build(
      continue;
    }

+    // Allocate each allocation that might escape, or is the temp buffer.
+    bool seen_temp_buffer = false;
    const BufferAllocation& allocation = buffer_assignment.GetAllocation(i);
-    if (allocation.maybe_live_out()) {
-      auto buffer_size = allocation.size();
+    if (allocation.maybe_live_out() || allocation.IsPreallocatedTempBuffer()) {
+      const int64 buffer_size = allocation.size();
      se::DeviceMemoryBase buffer_address;
      if (buffer_size > 0) {
-        // If the buffer escapes, we need to allocate it separately instead of
-        // merging it into the memory block for temporary buffers.
        TF_ASSIGN_OR_RETURN(buffer_address, memory_allocator->Allocate(
                                                device_ordinal, buffer_size));
        if (buffer_address == nullptr) {
@@ -85,13 +69,14 @@ StatusOr<std::unique_ptr<BufferAllocations>> BufferAllocations::Builder::Build(
        }
      }
      buffer_allocations->SetBuffer(i, buffer_address);
-    } else if (allocation.IsPreallocatedTempBuffer()) {
-      se::DeviceMemoryBase temp_buffer_address(
-          /*opaque=*/static_cast<char*>(
-              buffer_allocations->GetTempBufferBase().opaque()) +
-              temp_buffer_offsets.GetOffset(i),
-          /*size=*/allocation.size());
-      buffer_allocations->SetBuffer(i, temp_buffer_address);
+      if (allocation.IsPreallocatedTempBuffer()) {
+        if (seen_temp_buffer) {
+          LOG(FATAL) << "Multiple temporary buffers detected. BufferAssigner "
+                     << "must guarantee at most one temporary buffer.";
+        }
+        seen_temp_buffer = true;
+        buffer_allocations->temp_buffer_base_ = buffer_address;
+      }
    }
  }

@@ -102,22 +87,19 @@ tensorflow::Status BufferAllocations::TearDown(
    const std::set<se::DeviceMemoryBase>& live_addresses,
    const BufferAssignment& buffer_assignment) {
  // Deallocate temporary buffers.
-  for (auto i = 0; i < buffer_assignment.Allocations().size(); ++i) {
+  const int64 num_buffers = buffer_assignment.Allocations().size();
+  for (BufferAllocation::Index i = 0; i < num_buffers; ++i) {
    const BufferAllocation& allocation = buffer_assignment.GetAllocation(i);
    se::DeviceMemoryBase buffer_address = GetDeviceAddress(allocation.index());
-    if (allocation.maybe_live_out() && !live_addresses.count(buffer_address)) {
-      // Deallocate buffers that marked "maybe_live_out" but is not actually
-      // live out.
+    // Deallocate buffers that are marked "maybe_live_out" but aren't actually
+    // live out, and temp buffers.
+    if ((allocation.maybe_live_out() &&
+         !live_addresses.count(buffer_address)) ||
+        allocation.IsPreallocatedTempBuffer()) {
      TF_RETURN_IF_ERROR(
          memory_allocator_->Deallocate(device_ordinal_, &buffer_address));
    }
  }
-
-  // Deallocate the memory block for temporary buffers.
-  if (temp_buffer_base_ != nullptr) {
-    TF_RETURN_IF_ERROR(
-        memory_allocator_->Deallocate(device_ordinal_, &temp_buffer_base_));
-  }
  return tensorflow::Status::OK();
}

@@ -128,6 +110,16 @@ se::DeviceMemoryBase BufferAllocations::GetDeviceAddress(
  return buffers_[buffer_index];
}

+se::DeviceMemoryBase BufferAllocations::GetDeviceAddress(
+    const BufferAllocation::Slice& buffer_slice) const {
+  se::DeviceMemoryBase base = GetDeviceAddress(buffer_slice.index());
+  CHECK_LE(buffer_slice.offset(), base.size());
+  CHECK_LE(buffer_slice.offset() + buffer_slice.size(), base.size());
+  return se::DeviceMemoryBase(
+      static_cast<char*>(base.opaque()) + buffer_slice.offset(),
+      buffer_slice.size(), /*is_sub_buffer=*/true);
+}
+
void BufferAllocations::SetBuffer(BufferAllocation::Index buffer_index,
                                  se::DeviceMemoryBase buffer) {
  CHECK_GE(buffer_index, 0);
diff --git a/tensorflow/compiler/xla/service/gpu/buffer_allocations.h b/tensorflow/compiler/xla/service/gpu/buffer_allocations.h
index a0cd6cac016..ea7f0eb3745 100644
--- a/tensorflow/compiler/xla/service/gpu/buffer_allocations.h
+++ b/tensorflow/compiler/xla/service/gpu/buffer_allocations.h
@@ -22,7 +22,6 @@ limitations under the License.

#include "tensorflow/compiler/xla/service/buffer_assignment.h"
#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
-#include "tensorflow/compiler/xla/service/gpu/temp_buffer_offsets.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
@@ -49,8 +48,7 @@ class BufferAllocations {
  // `device_ordinal` is the number of the device this function allocates
  // memory on.
  StatusOr<std::unique_ptr<BufferAllocations>> Build(
-      const BufferAssignment& buffer_assignment,
-      const TempBufferOffsets& temp_buffer_offsets, int device_ordinal,
+      const BufferAssignment& buffer_assignment, int device_ordinal,
      DeviceMemoryAllocator* memory_allocator);

 private:
@@ -70,6 +68,11 @@ class BufferAllocations {
  perftools::gputools::DeviceMemoryBase GetDeviceAddress(
      BufferAllocation::Index buffer_index) const;

+  // Same as above, but also adjusts the returned address for the offset and
+  // size contained in the given slice.
+  perftools::gputools::DeviceMemoryBase GetDeviceAddress(
+      const BufferAllocation::Slice& buffer_slice) const;
+
  perftools::gputools::DeviceMemoryBase GetTempBufferBase() const {
    return temp_buffer_base_;
  }
@@ -81,12 +84,9 @@ class BufferAllocations {
                              const BufferAssignment& buffer_assignment);

 private:
-  BufferAllocations(BufferAllocation::Index buffer_count,
-                    perftools::gputools::DeviceMemoryBase temp_buffer_base,
-                    int device_ordinal, DeviceMemoryAllocator* memory_allocator)
+  BufferAllocations(BufferAllocation::Index buffer_count, int device_ordinal,
+                    DeviceMemoryAllocator* memory_allocator)
      : buffers_(buffer_count),
-        temp_buffer_base_(
-            perftools::gputools::DeviceMemory<uint8>(temp_buffer_base)),
        device_ordinal_(device_ordinal),
        memory_allocator_(memory_allocator) {}

@@ -100,7 +100,7 @@ class BufferAllocations {
  std::vector<perftools::gputools::DeviceMemoryBase> buffers_;

  // The base address of the memory block that contains all temporary buffers.
-  perftools::gputools::DeviceMemory<uint8> temp_buffer_base_;
+  perftools::gputools::DeviceMemoryBase temp_buffer_base_;

  int device_ordinal_;
diff --git a/tensorflow/compiler/xla/service/gpu/convolution_folding.cc b/tensorflow/compiler/xla/service/gpu/convolution_folding.cc
index b407a01f0af..16febea14de 100644
--- a/tensorflow/compiler/xla/service/gpu/convolution_folding.cc
+++ b/tensorflow/compiler/xla/service/gpu/convolution_folding.cc
@@ -106,7 +106,7 @@ MatchBackwardFilter(HloInstruction* conv) {
  //
  // Compute the window of the backward convolution.
  Window backward_conv_window;
-  for (int i = 0; i < 2; ++i) {
+  for (int i = 0; i < spatial_dims.size(); ++i) {
    WindowDimension* dim = backward_conv_window.add_dimensions();
    // The window size of the backward convolution equals the output size of the
    // forward convolution.
@@ -185,7 +185,7 @@ MatchBackwardFilter(HloInstruction* conv) {
  ConvolutionDimensionNumbers backward_conv_dnums;
  backward_conv_dnums.set_batch_dimension(feature_dim);
  backward_conv_dnums.set_feature_dimension(batch_dim);
-  for (int i = 0; i < 2; ++i) {
+  for (int i = 0; i < spatial_dims.size(); ++i) {
    backward_conv_dnums.add_spatial_dimensions(spatial_dims[i]);
  }
  // The dimension numbering of the output of the forward convolution (before
@@ -201,7 +201,7 @@ MatchBackwardFilter(HloInstruction* conv) {
      PositionInContainer(transpose->dimensions(), batch_dim));
  backward_conv_dnums.set_kernel_output_feature_dimension(
      PositionInContainer(transpose->dimensions(), feature_dim));
-  for (int i = 0; i < 2; ++i) {
+  for (int i = 0; i < spatial_dims.size(); ++i) {
    backward_conv_dnums.add_kernel_spatial_dimensions(
        PositionInContainer(transpose->dimensions(), spatial_dims[i]));
  }
diff --git a/tensorflow/compiler/xla/service/gpu/convolution_folding_test.cc b/tensorflow/compiler/xla/service/gpu/convolution_folding_test.cc
index 83922cbe14a..ba9c70ded36 100644
--- a/tensorflow/compiler/xla/service/gpu/convolution_folding_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/convolution_folding_test.cc
@@ -97,10 +97,10 @@ TEST_F(ConvolutionFoldingTest, BackwardFilterConvolveWithoutTranspose) {
      activations, gradients, conv_window,
      tf_default_dnums_for_backward_filter_));

-  HloModule module(TestName());
+  auto module = CreateNewModule();
  HloComputation* entry_computation =
-      module.AddEntryComputation(builder.Build());
-  EXPECT_TRUE(FoldConvolution(&module));
+      module->AddEntryComputation(builder.Build());
+  EXPECT_TRUE(FoldConvolution(module.get()));
  EXPECT_EQ(HloOpcode::kFusion,
            entry_computation->root_instruction()->opcode());
  EXPECT_TRUE(HloInstruction::FusionKind::kConvBackwardFilter ==
@@ -126,9 +126,9 @@ TEST_F(ConvolutionFoldingTest,
      activations, gradients, conv_window,
      tf_default_dnums_for_backward_filter_));

-  HloModule module(TestName());
-  module.AddEntryComputation(builder.Build());
-  EXPECT_FALSE(FoldConvolution(&module));
+  auto module = CreateNewModule();
+  module->AddEntryComputation(builder.Build());
+  EXPECT_FALSE(FoldConvolution(module.get()));
}

// Extracted from block35 training.
@@ -155,10 +155,10 @@ TEST_F(ConvolutionFoldingTest, BackwardFilterConvolveWithPaddedActivations) { builder.AddInstruction(HloInstruction::CreateTranspose( ShapeUtil::MakeShape(F32, {3, 3, 32, 32}), convolution, {1, 2, 3, 0})); - HloModule module(TestName()); + auto module = CreateNewModule(); HloComputation* entry_computation = - module.AddEntryComputation(builder.Build()); - EXPECT_TRUE(FoldConvolution(&module)); + module->AddEntryComputation(builder.Build()); + EXPECT_TRUE(FoldConvolution(module.get())); EXPECT_EQ(HloOpcode::kFusion, entry_computation->root_instruction()->opcode()); EXPECT_TRUE(HloInstruction::FusionKind::kConvBackwardFilter == @@ -189,10 +189,10 @@ TEST_F(ConvolutionFoldingTest, BackwardFilterConvolveWithPaddedGradients) { builder.AddInstruction(HloInstruction::CreateTranspose( ShapeUtil::MakeShape(F32, {3, 3, 192, 320}), convolution, {1, 2, 3, 0})); - HloModule module(TestName()); + auto module = CreateNewModule(); HloComputation* entry_computation = - module.AddEntryComputation(builder.Build()); - EXPECT_TRUE(FoldConvolution(&module)); + module->AddEntryComputation(builder.Build()); + EXPECT_TRUE(FoldConvolution(module.get())); EXPECT_EQ(HloOpcode::kFusion, entry_computation->root_instruction()->opcode()); EXPECT_TRUE(HloInstruction::FusionKind::kConvBackwardFilter == @@ -222,10 +222,10 @@ TEST_F(ConvolutionFoldingTest, BackwardFilterConvolveWithUnevenPadding) { builder.AddInstruction(HloInstruction::CreateTranspose( ShapeUtil::MakeShape(F32, {2, 2, 32, 32}), convolution, {1, 2, 3, 0})); - HloModule module(TestName()); + auto module = CreateNewModule(); HloComputation* entry_computation = - module.AddEntryComputation(builder.Build()); - EXPECT_TRUE(FoldConvolution(&module)); + module->AddEntryComputation(builder.Build()); + EXPECT_TRUE(FoldConvolution(module.get())); EXPECT_EQ(HloOpcode::kFusion, entry_computation->root_instruction()->opcode()); EXPECT_TRUE(HloInstruction::FusionKind::kConvBackwardFilter == @@ -269,10 +269,10 @@ TEST_F(ConvolutionFoldingTest, BackwardInputConvolveEvenPadding) { output->shape(), reverse_kernel->shape(), conv_window, conv_dnums) .ValueOrDie())); - HloModule module(TestName()); + auto module = CreateNewModule(); HloComputation* entry_computation = - module.AddEntryComputation(builder.Build()); - EXPECT_TRUE(FoldConvolution(&module)); + module->AddEntryComputation(builder.Build()); + EXPECT_TRUE(FoldConvolution(module.get())); EXPECT_EQ(HloOpcode::kFusion, entry_computation->root_instruction()->opcode()); EXPECT_TRUE(HloInstruction::FusionKind::kConvBackwardInput == @@ -313,10 +313,10 @@ TEST_F(ConvolutionFoldingTest, BackwardInputConvolve1x1Filter) { /*lhs=*/output, /*rhs=*/kernel, conv_window, tf_default_dnums_for_backward_input_)); - HloModule module(TestName()); + auto module = CreateNewModule(); HloComputation* entry_computation = - module.AddEntryComputation(builder.Build()); - EXPECT_TRUE(FoldConvolution(&module)); + module->AddEntryComputation(builder.Build()); + EXPECT_TRUE(FoldConvolution(module.get())); EXPECT_EQ(HloOpcode::kFusion, entry_computation->root_instruction()->opcode()); EXPECT_TRUE(HloInstruction::FusionKind::kConvBackwardInput == @@ -346,9 +346,9 @@ TEST_F(ConvolutionFoldingTest, /*lhs=*/output, /*rhs=*/kernel, default_conv_window_, tf_default_dnums_for_backward_input_)); - HloModule module(TestName()); - module.AddEntryComputation(builder.Build()); - EXPECT_FALSE(FoldConvolution(&module)); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); + 
EXPECT_FALSE(FoldConvolution(module.get())); } // Extracted from Inception V3 training. @@ -394,10 +394,10 @@ TEST_F(ConvolutionFoldingTest, BackwardInputConvolveUnevenPaddingOnGradients) { tf_default_dnums_for_backward_input_) .ValueOrDie())); - HloModule module(TestName()); + auto module = CreateNewModule(); HloComputation* entry_computation = - module.AddEntryComputation(builder.Build()); - EXPECT_TRUE(FoldConvolution(&module)); + module->AddEntryComputation(builder.Build()); + EXPECT_TRUE(FoldConvolution(module.get())); EXPECT_EQ(HloOpcode::kFusion, entry_computation->root_instruction()->opcode()); EXPECT_TRUE(HloInstruction::FusionKind::kConvBackwardInput == @@ -441,9 +441,9 @@ TEST_F(ConvolutionFoldingTest, BackwardInputConvolveLowPaddingTooLarge) { tf_default_dnums_for_backward_input_) .ValueOrDie())); - HloModule module(TestName()); - module.AddEntryComputation(builder.Build()); - EXPECT_FALSE(FoldConvolution(&module)); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); + EXPECT_FALSE(FoldConvolution(module.get())); } // Extracted from //learning/brain/google/xla/benchmarks/resnet.py @@ -490,10 +490,10 @@ TEST_F(ConvolutionFoldingTest, tf_default_dnums_for_backward_input_) .ValueOrDie())); - HloModule module(TestName()); + auto module = CreateNewModule(); const HloComputation* entry_computation = - module.AddEntryComputation(builder.Build()); - EXPECT_TRUE(FoldConvolution(&module)); + module->AddEntryComputation(builder.Build()); + EXPECT_TRUE(FoldConvolution(module.get())); const HloInstruction* backward_conv = entry_computation->root_instruction(); EXPECT_EQ(HloOpcode::kFusion, backward_conv->opcode()); EXPECT_TRUE(HloInstruction::FusionKind::kConvBackwardInput == @@ -543,10 +543,14 @@ TEST_F(ConvolutionFoldingTest, tf_default_dnums_for_backward_input_) .ValueOrDie())); - HloModule module(TestName()); - module.AddEntryComputation(builder.Build()); - EXPECT_FALSE(FoldConvolution(&module)); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); + EXPECT_FALSE(FoldConvolution(module.get())); } } // namespace gpu } // namespace xla + +int main(int argc, char** argv) { + return xla::ParseDebugOptionsFlagsAndRunTests(argc, argv); +} diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc index 30a92ab3130..9a0b14eb733 100644 --- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc @@ -29,7 +29,6 @@ namespace se = ::perftools::gputools; namespace xla { namespace gpu { -using Index = BufferAllocation::Index; using se::dnn::BatchDescriptor; using se::dnn::ConvolutionDescriptor; using se::dnn::DataLayout; @@ -92,12 +91,15 @@ string ConvolutionKindToString( case ConvolutionThunk::ConvolutionKind::kBackwardInput: return "backward_input"; } + return "unknown convolution kind"; } ConvolutionThunk::ConvolutionThunk( - ConvolutionKind convolution_kind, Index input_buffer, Index filter_buffer, - Index output_buffer, const Shape& input_shape, const Shape& filter_shape, - const Shape& output_shape, const Window& window, + ConvolutionKind convolution_kind, + const BufferAllocation::Slice& input_buffer, + const BufferAllocation::Slice& filter_buffer, + const BufferAllocation::Slice& output_buffer, const Shape& input_shape, + const Shape& filter_shape, const Shape& output_shape, const Window& window, const ConvolutionDimensionNumbers& dim_nums, const HloInstruction* hlo) : Thunk(Kind::kConvolution, 
hlo), convolution_kind_(convolution_kind), @@ -119,50 +121,78 @@ tensorflow::Status ConvolutionThunk::ExecuteOnStream( VLOG(3) << "Dim nums: { " << dim_nums_.ShortDebugString() << " }"; VLOG(3) << "Window: { " << window_.ShortDebugString() << " }"; + const int num_dimensions = window_.dimensions_size(); + CHECK_LE(num_dimensions, 3); + // cuDNN does not support 1D convolutions. We therefore express 1D + // convolutions as 2D convolutions where the first spatial dimension is 1. + // This matches the behavior of TF (see definition of conv1d in + // tensorflow/python/ops/nn_ops.py). + const int effective_num_dimensions = std::max(2, num_dimensions); + CHECK_EQ(F32, output_shape_.element_type()); - CHECK_EQ(2, window_.dimensions_size()); + CHECK_EQ(num_dimensions, dim_nums_.spatial_dimensions_size()); + CHECK_EQ(num_dimensions, dim_nums_.kernel_spatial_dimensions_size()); for (const WindowDimension& dim : window_.dimensions()) { CHECK_EQ(dim.padding_low(), dim.padding_high()); } - const WindowDimension& height = window_.dimensions(0); - const WindowDimension& width = window_.dimensions(1); // cuDNN's convolution APIs support the BDYX layout for activations/output and // the OIYX layout for weights. - // TODO(b/29399649): Be more flexible about handling layouts of cuDNN calls - // when we switch to cuDNN v5. - BatchDescriptor input_descriptor; + BatchDescriptor input_descriptor(effective_num_dimensions); input_descriptor.set_layout(DataLayout::kBatchDepthYX) - .set_height(input_shape_.dimensions(dim_nums_.spatial_dimensions(0))) - .set_width(input_shape_.dimensions(dim_nums_.spatial_dimensions(1))) .set_feature_map_count( input_shape_.dimensions(dim_nums_.feature_dimension())) .set_count(input_shape_.dimensions(dim_nums_.batch_dimension())); + for (int dim = 0; dim < num_dimensions; ++dim) { + // Note that the dimensions are reversed. The same holds below. 
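+    // (Descriptor spatial dimension 0 is the innermost dimension, so XLA's
+    // spatial dimension `dim` maps to `effective_num_dimensions - dim - 1`.)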
+    input_descriptor.set_spatial_dim(
+        static_cast<se::dnn::DimIndex>(effective_num_dimensions - dim - 1),
+        input_shape_.dimensions(dim_nums_.spatial_dimensions(dim)));
+  }

-  FilterDescriptor filter_descriptor;
+  FilterDescriptor filter_descriptor(effective_num_dimensions);
  filter_descriptor.set_layout(FilterLayout::kOutputInputYX)
      .set_input_feature_map_count(
          filter_shape_.dimensions(dim_nums_.kernel_input_feature_dimension()))
-      .set_output_feature_map_count(
-          filter_shape_.dimensions(dim_nums_.kernel_output_feature_dimension()))
-      .set_input_filter_height(
-          filter_shape_.dimensions(dim_nums_.kernel_spatial_dimensions(0)))
-      .set_input_filter_width(
-          filter_shape_.dimensions(dim_nums_.kernel_spatial_dimensions(1)));
+      .set_output_feature_map_count(filter_shape_.dimensions(
+          dim_nums_.kernel_output_feature_dimension()));
+  for (int dim = 0; dim < num_dimensions; ++dim) {
+    filter_descriptor.set_spatial_dim(
+        static_cast<se::dnn::DimIndex>(effective_num_dimensions - dim - 1),
+        filter_shape_.dimensions(dim_nums_.kernel_spatial_dimensions(dim)));
+  }

-  ConvolutionDescriptor convolution_descriptor;
-  convolution_descriptor.set_zero_padding_width(width.padding_low())
-      .set_zero_padding_height(height.padding_low())
-      .set_horizontal_filter_stride(width.stride())
-      .set_vertical_filter_stride(height.stride());
+  ConvolutionDescriptor convolution_descriptor(effective_num_dimensions);
+  for (int dim = 0; dim < num_dimensions; ++dim) {
+    convolution_descriptor
+        .set_zero_padding(
+            static_cast<se::dnn::DimIndex>(effective_num_dimensions - dim - 1),
+            window_.dimensions(dim).padding_low())
+        .set_filter_stride(
+            static_cast<se::dnn::DimIndex>(effective_num_dimensions - dim - 1),
+            window_.dimensions(dim).stride());
+  }

-  BatchDescriptor output_descriptor;
+  BatchDescriptor output_descriptor(effective_num_dimensions);
  output_descriptor.set_layout(DataLayout::kBatchDepthYX)
-      .set_height(output_shape_.dimensions(dim_nums_.spatial_dimensions(0)))
-      .set_width(output_shape_.dimensions(dim_nums_.spatial_dimensions(1)))
      .set_feature_map_count(
          output_shape_.dimensions(dim_nums_.feature_dimension()))
      .set_count(output_shape_.dimensions(dim_nums_.batch_dimension()));
+  for (int dim = 0; dim < num_dimensions; ++dim) {
+    output_descriptor.set_spatial_dim(
+        static_cast<se::dnn::DimIndex>(effective_num_dimensions - dim - 1),
+        output_shape_.dimensions(dim_nums_.spatial_dimensions(dim)));
+  }
+
+  // Add a singleton dimension in the 1D convolution case.
+  if (num_dimensions == 1) {
+    input_descriptor.set_spatial_dim(static_cast<se::dnn::DimIndex>(0), 1);
+    output_descriptor.set_spatial_dim(static_cast<se::dnn::DimIndex>(0), 1);
+    filter_descriptor.set_spatial_dim(static_cast<se::dnn::DimIndex>(0), 1);
+    convolution_descriptor
+        .set_zero_padding(static_cast<se::dnn::DimIndex>(0), 0)
+        .set_filter_stride(static_cast<se::dnn::DimIndex>(0), 1);
+  }

  se::DeviceMemory<float> input_data(
      buffer_allocations.GetDeviceAddress(input_buffer_));
@@ -228,15 +258,21 @@ tensorflow::Status ConvolutionThunk::Convolve(
std::vector<se::dnn::AlgorithmType> ConvolutionThunk::GetAlgorithms(
    se::StreamExecutor* stream_exec) const {
  std::vector<se::dnn::AlgorithmType> algorithms;
+  // TODO(yangzihao): Currently disable the use of winograd nonfused in XLA
+  // by default. Should send in conv parameters and enable it when
+  // ShouldIncludeWinogradNonfusedAlgo() returns true.
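+  // Each case below asks the StreamExecutor for the candidate algorithm list
+  // of the matching convolution kind, with the winograd-nonfused algorithms
+  // excluded per the TODO above.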
switch (convolution_kind_) { case ConvolutionKind::kBackwardFilter: - CHECK(stream_exec->GetConvolveBackwardFilterAlgorithms(&algorithms)); + CHECK(stream_exec->GetConvolveBackwardFilterAlgorithms( + /*with_winograd_nonfused=*/false, &algorithms)); break; case ConvolutionKind::kBackwardInput: - CHECK(stream_exec->GetConvolveBackwardDataAlgorithms(&algorithms)); + CHECK(stream_exec->GetConvolveBackwardDataAlgorithms( + /*with_winograd_nonfused=*/false, &algorithms)); break; case ConvolutionKind::kForward: - CHECK(stream_exec->GetConvolveAlgorithms(&algorithms)); + CHECK(stream_exec->GetConvolveAlgorithms(/*with_winograd_nonfused=*/false, + &algorithms)); break; } return algorithms; diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h index cd9568f6a25..aaf72935e61 100644 --- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h @@ -70,9 +70,9 @@ class ConvolutionThunk : public Thunk { // Constructs a thunk for launching a DNN convolution. // Semantics of null hlo_instruction argument are as in Thunk. ConvolutionThunk(ConvolutionKind convolution_kind, - BufferAllocation::Index input_buffer, - BufferAllocation::Index filter_buffer, - BufferAllocation::Index output_buffer, + const BufferAllocation::Slice& input_buffer, + const BufferAllocation::Slice& filter_buffer, + const BufferAllocation::Slice& output_buffer, const Shape& input_shape, const Shape& filter_shape, const Shape& output_shape, const Window& window, const ConvolutionDimensionNumbers& dnums, @@ -125,19 +125,19 @@ class ConvolutionThunk : public Thunk { // the best algorithm from some heuristics based on its parameters. perftools::gputools::dnn::AlgorithmConfig best_algorithm_; - ConvolutionKind convolution_kind_; + const ConvolutionKind convolution_kind_; - BufferAllocation::Index input_buffer_; - BufferAllocation::Index filter_buffer_; - BufferAllocation::Index output_buffer_; + const BufferAllocation::Slice input_buffer_; + const BufferAllocation::Slice filter_buffer_; + const BufferAllocation::Slice output_buffer_; - Shape input_shape_; - Shape filter_shape_; - Shape output_shape_; + const Shape input_shape_; + const Shape filter_shape_; + const Shape output_shape_; - Window window_; + const Window window_; - ConvolutionDimensionNumbers dim_nums_; + const ConvolutionDimensionNumbers dim_nums_; }; string ConvolutionKindToString( diff --git a/tensorflow/compiler/xla/service/gpu/copy_thunk.cc b/tensorflow/compiler/xla/service/gpu/copy_thunk.cc index 76fb079bd4d..87858e94090 100644 --- a/tensorflow/compiler/xla/service/gpu/copy_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/copy_thunk.cc @@ -21,7 +21,7 @@ namespace xla { namespace gpu { CopyThunk::CopyThunk(const void* source_address, - BufferAllocation::Index destination_buffer, + const BufferAllocation::Slice& destination_buffer, uint64 mem_size, const HloInstruction* hlo_instruction) : Thunk(Kind::kCopy, hlo_instruction), source_address_(source_address), diff --git a/tensorflow/compiler/xla/service/gpu/copy_thunk.h b/tensorflow/compiler/xla/service/gpu/copy_thunk.h index 803e699bfdd..6b8c432715f 100644 --- a/tensorflow/compiler/xla/service/gpu/copy_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/copy_thunk.h @@ -34,7 +34,7 @@ class CopyThunk : public Thunk { // device buffer `destination_buffer`. `mem_size` is the size of the data in // bytes. 
  CopyThunk(const void* source_address,
-            BufferAllocation::Index destination_buffer, uint64 mem_size,
+            const BufferAllocation::Slice& destination_buffer, uint64 mem_size,
            const HloInstruction* hlo_instruction);

  CopyThunk(const CopyThunk&) = delete;
@@ -46,8 +46,8 @@
 private:
  const void* source_address_;
-  BufferAllocation::Index destination_buffer_;
-  uint64 mem_size_;
+  const BufferAllocation::Slice destination_buffer_;
+  const uint64 mem_size_;
};

}  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
index 67c80bf93b1..2987c8913d7 100644
--- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
@@ -113,7 +113,7 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitMathCall(
    tensorflow::gtl::ArraySlice<llvm::Value*> operands,
    tensorflow::gtl::ArraySlice<PrimitiveType> input_types,
    PrimitiveType output_type) const {
-  // Binary math functions tranform are of type [T] -> T.
+  // Binary math functions are of type [T] -> T.
  for (PrimitiveType input_type : input_types) {
    if (output_type != input_type) {
      return Unimplemented("Input type ≠ output type: %s ≠ %s",
@@ -175,7 +175,7 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitPowerOp(
    return make_sqrt();
  }

-  if (!hlo_module_config_.fast_math_disabled() &&
+  if (hlo_module_config_.debug_options().xla_enable_fast_math() &&
      IsFPLiteralWithValue(rhs, -.5)) {
    VLOG(10) << "emitting pow(A, -.5) as 1/sqrt(A): " << op->ToString();
    // LLVM's NVPTX backend knows how to transform 1/sqrt(A) into the NVPTX
@@ -270,69 +270,6 @@ llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator(
    const HloInstruction* hlo,
    const HloToElementGeneratorMap& operand_to_generator) const {
  switch (hlo->opcode()) {
-    case HloOpcode::kPad:
-      return [=, &operand_to_generator](
-                 const IrArray::Index& padded_index) -> StatusOr<llvm::Value*> {
-        auto index = padded_index;
-        llvm::Value* in_bounds =
-            llvm::ConstantInt::get(ir_builder_->getInt1Ty(), 1);
-        for (int i = 0; i < index.size(); ++i) {
-          auto index_typed_const = [=](int64 n) {
-            return llvm::ConstantInt::get(index[i]->getType(), n);
-          };
-          const auto& pad_dim = hlo->padding_config().dimensions(i);
-          index[i] = ir_builder_->CreateSub(
-              index[i], index_typed_const(pad_dim.edge_padding_low()));
-          in_bounds = ir_builder_->CreateAnd(
-              in_bounds,
-              ir_builder_->CreateICmpSGE(index[i], index_typed_const(0)),
-              "in_bounds");
-          in_bounds = ir_builder_->CreateAnd(
-              in_bounds,
-              ir_builder_->CreateICmpEQ(
-                  index_typed_const(0),
-                  ir_builder_->CreateURem(
-                      index[i],
-                      index_typed_const(pad_dim.interior_padding() + 1))),
-              "in_bounds");
-          index[i] = ir_builder_->CreateSDiv(
-              index[i], index_typed_const(pad_dim.interior_padding() + 1));
-          in_bounds = ir_builder_->CreateAnd(
-              in_bounds,
-              ir_builder_->CreateICmpSLT(
-                  index[i],
-                  index_typed_const(hlo->operand(0)->shape().dimensions(i))),
-              "in_bounds");
-        }
-
-        // if (in_bounds) {
-        //   ret_value = operand0[index];  // source
-        // } else {
-        //   ret_value = *operand1;  // padding
-        // }
-        llvm::Value* ret_value_addr = llvm_ir::EmitAllocaAtFunctionEntry(
-            llvm_ir::PrimitiveTypeToIrType(hlo->shape().element_type(),
-                                           ir_builder_),
-            "pad_result_addr", ir_builder_);
-        llvm_ir::LlvmIfData if_data =
-            llvm_ir::EmitIfThenElse(in_bounds, "in_bounds", ir_builder_);
-        SetToFirstInsertPoint(if_data.true_block, ir_builder_);
-        TF_ASSIGN_OR_RETURN(llvm::Value * operand_value,
-                            operand_to_generator.at(hlo->operand(0))(index));
-        ir_builder_->CreateStore(operand_value, ret_value_addr);
-
-        SetToFirstInsertPoint(if_data.false_block, ir_builder_);
-        TF_ASSIGN_OR_RETURN(llvm::Value * padding_value,
-                            operand_to_generator.at(hlo->operand(1))({}));
-        ir_builder_->CreateStore(padding_value, ret_value_addr);
-
-        SetToFirstInsertPoint(if_data.after_block, ir_builder_);
-        // Don't create phi(operand_value, padding_value) here, because invoking
-        // operand_to_generator may create new basic blocks, making the parent
-        // of operand_value or padding_value no longer a predecessor of
-        // if_data.after_block.
-        return ir_builder_->CreateLoad(ret_value_addr);
-      };
    case HloOpcode::kMap:
      return [=, &operand_to_generator](
                 const IrArray::Index& index) -> StatusOr<llvm::Value*> {
diff --git a/tensorflow/compiler/xla/service/gpu/fusion_merger.cc b/tensorflow/compiler/xla/service/gpu/fusion_merger.cc
index fb053b62a75..afb78b8300b 100644
--- a/tensorflow/compiler/xla/service/gpu/fusion_merger.cc
+++ b/tensorflow/compiler/xla/service/gpu/fusion_merger.cc
@@ -16,9 +16,10 @@ limitations under the License.

#include "tensorflow/compiler/xla/service/gpu/fusion_merger.h"

#include <algorithm>
+#include <vector>

+#include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h"
#include "tensorflow/compiler/xla/service/hlo_cost_analysis.h"
-#include "tensorflow/compiler/xla/service/instruction_fusion.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/core/lib/core/errors.h"
@@ -98,8 +99,9 @@ double CalculateFlopsToBytesRatio(HloInstruction* fusion) {
  double bytes = CalculateBytesReadByFusionInstruction(fusion);
  // Add bytes written to root instructions buffer.
  bytes += ShapeUtil::ByteSizeOf(fusion->fused_expression_root()->shape());
-  // Calculate flops for all fused instructions.
-  HloCostAnalysis analysis;
+  // Calculate flops for all fused instructions. Use a null shape size function
+  // because we don't care about bytes accessed by the ops.
+  HloCostAnalysis analysis([](const Shape& shape) { return 0; });
  TF_CHECK_OK(fusion->fused_expression_root()->Accept(&analysis));
  // Return flops / bytes.
  return bytes > 0.0 ? analysis.flop_count() / bytes : analysis.flop_count();
@@ -219,7 +221,7 @@ Status FusionInstructionMerger::HandleFusion(HloInstruction* fusion) {
                  fusion->fused_instructions().end(),
                  [](const std::unique_ptr<HloInstruction>& instruction) {
                    if (instruction->opcode() != HloOpcode::kParameter &&
-                        IsExpensive(*instruction)) {
+                        GpuInstructionFusion::IsExpensive(*instruction)) {
                      return false;
                    }
                    return true;
@@ -248,7 +250,7 @@ Status FusionInstructionMerger::HandleFusion(HloInstruction* fusion) {
    return Status::OK();
  }
  // Merge fused instructions from 'fusion' into each user.
-  std::set<HloInstruction*> users = fusion->users();
+  std::vector<HloInstruction*> users = fusion->users();
  for (HloInstruction* user : users) {
    user->MergeFusionInstruction(fusion);
    changed_ = true;
diff --git a/tensorflow/compiler/xla/service/gpu/fusion_merger.h b/tensorflow/compiler/xla/service/gpu/fusion_merger.h
index 9a989d26f93..bd720f8584f 100644
--- a/tensorflow/compiler/xla/service/gpu/fusion_merger.h
+++ b/tensorflow/compiler/xla/service/gpu/fusion_merger.h
@@ -25,7 +25,7 @@ namespace gpu {
// An HLO pass that attempts to merge fusion instructions to reduce kernel
// launch overhead and improve data locality.
//
-// Fusion instructions are merged into their users if two conditons are met:
+// Fusion instructions are merged into their users if two conditions are met:
//
// 1) The flops_to_bytes ratio of the fusion instruction is below the threshold
//    value of 1.0.
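As an aside (not part of the patch): the merge gate described in the comment above reduces to a small arithmetic-intensity check. A minimal sketch, assuming `flops` and `bytes` stand in for the values fusion_merger.cc derives from `HloCostAnalysis::flop_count()` and its byte-counting helpers:

```c++
// Illustrative only; mirrors the ratio logic in CalculateFlopsToBytesRatio.
constexpr double kFlopsToBytesThreshold = 1.0;

// Returns true when a fusion with the given flop count and bytes
// read/written stays below the merge threshold. A zero-byte fusion
// falls back to comparing the raw flop count, as the pass does.
bool BelowFlopsToBytesThreshold(double flops, double bytes) {
  const double ratio = bytes > 0.0 ? flops / bytes : flops;
  return ratio < kFlopsToBytesThreshold;
}
```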
diff --git a/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc b/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc index a87e66ca869..8afc32dea97 100644 --- a/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc +++ b/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc @@ -25,7 +25,7 @@ namespace { class FusionMergerTest : public HloTestBase { protected: - FusionMergerTest() : module_(TestName()) {} + FusionMergerTest() : module_(CreateNewModule()) {} // Builds the following computation: // @@ -86,7 +86,7 @@ class FusionMergerTest : public HloTestBase { // Create output Tuple. builder.AddInstruction(HloInstruction::CreateTuple({out0, out1, out2})); - return module_.AddEntryComputation(builder.Build()); + return module_->AddEntryComputation(builder.Build()); } // Builds the following computation: @@ -154,7 +154,7 @@ class FusionMergerTest : public HloTestBase { // Create output Tuple. builder.AddInstruction(HloInstruction::CreateTuple({out0, out1})); - return module_.AddEntryComputation(builder.Build()); + return module_->AddEntryComputation(builder.Build()); } // Builds the following computation: @@ -225,7 +225,7 @@ class FusionMergerTest : public HloTestBase { // Create output Tuple. builder.AddInstruction(HloInstruction::CreateTuple({out0, out1})); - return module_.AddEntryComputation(builder.Build()); + return module_->AddEntryComputation(builder.Build()); } Shape data_shape_ = ShapeUtil::MakeShape(F32, {4}); @@ -235,7 +235,7 @@ class FusionMergerTest : public HloTestBase { Shape tuple_shape4_ = ShapeUtil::MakeTupleShape( {data_shape_, data_shape_, data_shape_, data_shape_}); - HloModule module_; + std::unique_ptr module_; }; // Tests that we can merge a fusion instruction that is below threshold. @@ -278,13 +278,15 @@ class FusionMergerTest : public HloTestBase { TEST_F(FusionMergerTest, MergeSharedFusionInstruction) { auto computation = BuildComputation0(); // Run standard fusion passes. - EXPECT_TRUE( - GpuInstructionFusion(/*may_duplicate=*/false).Run(&module_).ValueOrDie()); - EXPECT_FALSE( - GpuInstructionFusion(/*may_duplicate=*/true).Run(&module_).ValueOrDie()); + EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/false) + .Run(module_.get()) + .ValueOrDie()); + EXPECT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true) + .Run(module_.get()) + .ValueOrDie()); // Run fusion merger pass, which should merge the shared fusion instruction // into its two users. - EXPECT_TRUE(FusionMerger().Run(&module_).ValueOrDie()); + EXPECT_TRUE(FusionMerger().Run(module_.get()).ValueOrDie()); auto* root = computation->root_instruction(); EXPECT_EQ(HloOpcode::kTuple, root->opcode()); @@ -338,14 +340,16 @@ TEST_F(FusionMergerTest, MergeSharedFusionInstruction) { TEST_F(FusionMergerTest, FlopsToBytesRatioThresholdExceeded) { BuildComputation1(); // Run standard fusion passes. - EXPECT_TRUE( - GpuInstructionFusion(/*may_duplicate=*/false).Run(&module_).ValueOrDie()); - EXPECT_FALSE( - GpuInstructionFusion(/*may_duplicate=*/true).Run(&module_).ValueOrDie()); + EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/false) + .Run(module_.get()) + .ValueOrDie()); + EXPECT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true) + .Run(module_.get()) + .ValueOrDie()); // Run fusion merger pass, which should detect that the flops/bytes of the // shared fusion instruction exceeds the threshold ratio, and therefore // cannot be merged with other fusion instructions. 
- EXPECT_FALSE(FusionMerger().Run(&module_).ValueOrDie()); + EXPECT_FALSE(FusionMerger().Run(module_.get()).ValueOrDie()); } // Tests that threshold for bytes transferred if merged is exceeded. @@ -388,13 +392,15 @@ TEST_F(FusionMergerTest, FlopsToBytesRatioThresholdExceeded) { TEST_F(FusionMergerTest, BytesTransferredThresholdExeceeded) { BuildComputation2(/*add_extra_input=*/true); // Run standard fusion passes. - EXPECT_TRUE( - GpuInstructionFusion(/*may_duplicate=*/false).Run(&module_).ValueOrDie()); - EXPECT_FALSE( - GpuInstructionFusion(/*may_duplicate=*/true).Run(&module_).ValueOrDie()); + EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/false) + .Run(module_.get()) + .ValueOrDie()); + EXPECT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true) + .Run(module_.get()) + .ValueOrDie()); // Run fusion merger pass, which should detect that the net bytes transferred // (if merged) would increase. - EXPECT_FALSE(FusionMerger().Run(&module_).ValueOrDie()); + EXPECT_FALSE(FusionMerger().Run(module_.get()).ValueOrDie()); } // Tests that threshold for bytes transferred if merged is not exceeded. @@ -442,15 +448,21 @@ TEST_F(FusionMergerTest, BytesTransferredThresholdExeceeded) { TEST_F(FusionMergerTest, BytesTransferredThresholdNotExeceeded) { BuildComputation2(/*add_extra_input=*/false); // Run standard fusion passes. - EXPECT_TRUE( - GpuInstructionFusion(/*may_duplicate=*/false).Run(&module_).ValueOrDie()); - EXPECT_FALSE( - GpuInstructionFusion(/*may_duplicate=*/true).Run(&module_).ValueOrDie()); + EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/false) + .Run(module_.get()) + .ValueOrDie()); + EXPECT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true) + .Run(module_.get()) + .ValueOrDie()); // Run fusion merger pass, which should detect that the net bytes transferred // (if merged) would not increase. - EXPECT_TRUE(FusionMerger().Run(&module_).ValueOrDie()); + EXPECT_TRUE(FusionMerger().Run(module_.get()).ValueOrDie()); } } // namespace } // namespace gpu } // namespace xla + +int main(int argc, char** argv) { + return xla::ParseDebugOptionsFlagsAndRunTests(argc, argv); +} diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc index 98a8a4a2b1c..e784046450e 100644 --- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc @@ -27,8 +27,6 @@ namespace se = ::perftools::gputools; namespace xla { namespace gpu { -using Index = BufferAllocation::Index; - namespace { // This struct contains the metadata of a matrix, e.g., its base address and @@ -47,63 +45,171 @@ struct MatrixDescriptor { int64 num_cols; }; -// Performs a gemm call on lhs_matrix and rhs_matrix and stores the result to -// output_matrix. +// Performs a gemm call without an explicit algorithm on lhs_matrix and +// rhs_matrix, and stores the result to output_matrix. template -tensorflow::Status DoGemm(MatrixDescriptor lhs_matrix, - MatrixDescriptor rhs_matrix, - MatrixDescriptor output_matrix, se::Stream* stream) { +bool DoGemm(MatrixDescriptor lhs_matrix, MatrixDescriptor rhs_matrix, + MatrixDescriptor output_matrix, se::Stream* stream) { DCHECK(!output_matrix.transpose); se::DeviceMemory lhs_data(lhs_matrix.data); se::DeviceMemory rhs_data(rhs_matrix.data); se::DeviceMemory output_data(output_matrix.data); - bool launch_ok = - stream - ->ThenBlasGemm( - lhs_matrix.transpose ? se::blas::Transpose::kTranspose - : se::blas::Transpose::kNoTranspose, - rhs_matrix.transpose ? 
se::blas::Transpose::kTranspose - : se::blas::Transpose::kNoTranspose, - output_matrix.num_rows, output_matrix.num_cols, - lhs_matrix.transpose - ? lhs_matrix.num_rows - : lhs_matrix.num_cols, // Size of the reduce dimension. - /*alpha=*/1.0, - lhs_data, - lhs_matrix.num_rows, // The leading dimension of LHS. - rhs_data, - rhs_matrix.num_rows, // The leading dimension of RHS. - /*beta=*/0.0, &output_data, - output_matrix - .num_rows) // The leading dimension of the output matrix. - .ok(); - if (!launch_ok) { - return InternalError("Unable to launch cuBLAS gemm on stream %p", stream); - } - return tensorflow::Status::OK(); + auto lhs_transpose = lhs_matrix.transpose ? se::blas::Transpose::kTranspose : se::blas::Transpose::kNoTranspose; + auto rhs_transpose = rhs_matrix.transpose ? se::blas::Transpose::kTranspose : se::blas::Transpose::kNoTranspose; + auto k = lhs_matrix.transpose ? lhs_matrix.num_rows : lhs_matrix.num_cols; + + return stream + ->ThenBlasGemm( + lhs_transpose, rhs_transpose, output_matrix.num_rows, + output_matrix.num_cols, /*size of reduce dim=*/k, /*alpha=*/1.0, + lhs_data, /*leading dim of LHS=*/lhs_matrix.num_rows, rhs_data, + /*leading dim of RHS=*/rhs_matrix.num_rows, /*beta=*/0.0, + &output_data, /*leading dim of output=*/output_matrix.num_rows) + .ok(); } -// Return, if the given type is a valid Gemm elemental type, the executor for -// that type, else null. -// TODO(b/27202055): consider more element types. -std::function -FindGemmExecutor(PrimitiveType type) { +// Like DoGemm, but takes an explicit computation type and algorithm. +// computation_type specifies the type of intermediate values generated during +// the matmul (e.g. your input/output matrices could be f16s but you could do +// computations with f32s). algorithm is an opaque identifier which functions +// as a hint to cublas. +// +// Not all algorithms are valid for all matrix sizes, and not all CUDA versions +// and GPUs even support gemm-with-algorithm. So expect that this may fail +// unless you've already checked that it works for this particular GPU + input +// size. +// +// If you pass a non-null ProfileResult, this will always return true (assuming +// the Stream was valid to begin with); check the is_valid property of the +// ProfileResult to see whether the call actually succeeded. +template +bool DoGemmWithAlgorithm(MatrixDescriptor lhs_matrix, + MatrixDescriptor rhs_matrix, + MatrixDescriptor output_matrix, + se::blas::ComputationType computation_type, + se::blas::AlgorithmType algorithm, se::Stream* stream, + se::blas::ProfileResult* output_profile_result) { + DCHECK(!output_matrix.transpose); + + se::DeviceMemory lhs_data(lhs_matrix.data); + se::DeviceMemory rhs_data(rhs_matrix.data); + se::DeviceMemory output_data(output_matrix.data); + + auto lhs_transpose = lhs_matrix.transpose ? se::blas::Transpose::kTranspose + : se::blas::Transpose::kNoTranspose; + auto rhs_transpose = rhs_matrix.transpose ? se::blas::Transpose::kTranspose + : se::blas::Transpose::kNoTranspose; + auto k = lhs_matrix.transpose ?
lhs_matrix.num_rows : lhs_matrix.num_cols; + + return stream + ->ThenBlasGemmWithAlgorithm( + lhs_transpose, rhs_transpose, output_matrix.num_rows, + output_matrix.num_cols, /*size of reduce dim=*/k, /*alpha=*/1.0, + lhs_data, /*leading dim of LHS=*/lhs_matrix.num_rows, rhs_data, + /*leading dim of RHS=*/rhs_matrix.num_rows, /*beta=*/0.0, + &output_data, /*leading dim of output=*/output_matrix.num_rows, + computation_type, algorithm, output_profile_result) + .ok(); +} + +// Experimentally tries to pick the best algorithm for the given gemm. +// +// This may fail under perfectly normal circumstances. In particular, it will +// fail if the program was built with < CUDA 8 or if we're using a gpu older +// than sm_50 -- in both cases, cublas doesn't support gemm-with-algorithm at +// all. +template +StatusOr DoGemmAutotune( + MatrixDescriptor lhs_matrix, MatrixDescriptor rhs_matrix, + MatrixDescriptor output_matrix, se::blas::ComputationType computation_type, + se::Stream* stream) { + std::vector algorithms; + CHECK(stream->parent()->GetBlasGemmAlgorithms(&algorithms)); + + se::blas::ProfileResult best_result; + for (auto algorithm : algorithms) { + se::blas::ProfileResult profile_result; + // We expect GemmWithAlgorithm to fail sometimes -- in fact, it will fail + // for all algorithms if we're targeting < sm_50. But because we pass a + // non-null ProfileResult, DoGemmWithAlgorithm should always return true, + // and the actual success-ness is returned in ProfileResult::is_valid. + DCHECK(DoGemmWithAlgorithm(lhs_matrix, rhs_matrix, output_matrix, + computation_type, algorithm, stream, + &profile_result)); + + if (profile_result.is_valid() && profile_result.elapsed_time_in_ms() < + best_result.elapsed_time_in_ms()) { + best_result = profile_result; + } + } + + if (best_result.is_valid()) { + return best_result.algorithm(); + } + + return InternalError( + "Unable to autotune cuBLAS gemm on stream %p; none of the %zu algorithms " + "ran successfully", + stream, algorithms.size()); +} + +// Helper functions to go from a PrimitiveType to a templated version of +// DoGemm/DoGemmWithAlgorithm/DoGemmAutotune. +auto GetGemmFn(PrimitiveType type) -> decltype(&DoGemm) { switch (type) { case F32: return &DoGemm; case F64: return &DoGemm; default: - return nullptr; + LOG(FATAL) << "Unsupported type."; + } +} +auto GetGemmWithAlgorithmFn(PrimitiveType type) + -> decltype(&DoGemmWithAlgorithm) { + switch (type) { + case F32: + return &DoGemmWithAlgorithm; + case F64: + return &DoGemmWithAlgorithm; + default: + LOG(FATAL) << "Unsupported type."; + } +} +auto GetGemmAutotuneFn(PrimitiveType type) -> decltype(&DoGemmAutotune) { + switch (type) { + case F32: + return &DoGemmAutotune; + case F64: + return &DoGemmAutotune; + default: + LOG(FATAL) << "Unsupported type."; + } +} + +// Converts from an XLA PrimitiveType to a blas::ComputationType, which is used +// to specify the precision with which matmul computations should be performed, +// separately from the precision of the inputs and result. 
+se::blas::ComputationType GetBlasComputationType(PrimitiveType type) { + switch (type) { + case F32: + return se::blas::ComputationType::kF32; + case F64: + return se::blas::ComputationType::kF64; + default: + LOG(FATAL) << "Unsupported type."; } } } // namespace -GemmThunk::GemmThunk(Index lhs_buffer, Index rhs_buffer, Index output_buffer, +GemmThunk::GemmThunk(const BufferAllocation::Slice& lhs_buffer, + const BufferAllocation::Slice& rhs_buffer, + const BufferAllocation::Slice& output_buffer, const Shape& lhs_shape, const Shape& rhs_shape, const Shape& output_shape, bool transpose_lhs, bool transpose_rhs, const HloInstruction* hlo_instruction) @@ -120,8 +226,6 @@ GemmThunk::GemmThunk(Index lhs_buffer, Index rhs_buffer, Index output_buffer, tensorflow::Status GemmThunk::ExecuteOnStream( const BufferAllocations& buffer_allocations, se::Stream* stream) { VLOG(2) << "Executing a GemmThunk"; - auto executor = FindGemmExecutor(output_shape_.element_type()); - DCHECK(executor != nullptr); se::DeviceMemoryBase lhs_data = buffer_allocations.GetDeviceAddress(lhs_buffer_); @@ -141,7 +245,7 @@ tensorflow::Status GemmThunk::ExecuteOnStream( // Therefore, we need to convert dot between row-major matrices to that // between column-major matrices. The key insight for the conversion is that, // in linear storage, matrix M in column-major order is identical to the - // tranpose of M in row-major order. In other words, + // transpose of M in row-major order. In other words, // // column-major(M) = row-major(M^T). // @@ -172,17 +276,66 @@ tensorflow::Status GemmThunk::ExecuteOnStream( make_descriptor(lhs_data, lhs_shape_, transpose_lhs_); const MatrixDescriptor rhs_descriptor = make_descriptor(rhs_data, rhs_shape_, transpose_rhs_); + + // Dispatches to a regular cublas gemm, a gemm-with-algorithm, or attempts to + // autotune this gemm to figure out the best algorithm. + auto launch = [this](MatrixDescriptor lhs_matrix, MatrixDescriptor rhs_matrix, + MatrixDescriptor output_matrix, se::Stream* stream) { + PrimitiveType element_type = output_shape_.element_type(); + se::blas::ComputationType computation_type = + GetBlasComputationType(element_type); + + const string& device_name = stream->parent()->GetDeviceDescription().name(); + auto autotune_it = autotune_results_.find(device_name); + if (autotune_it == autotune_results_.end()) { + StatusOr best_algorithm = + GetGemmAutotuneFn(element_type)(lhs_matrix, rhs_matrix, output_matrix, + computation_type, stream); + autotune_it = + autotune_results_.insert({device_name, best_algorithm}).first; + + if (autotune_it->second.ok()) { + VLOG(2) << "Autotune on GemmThunk " << this + << " successful; best algorithm is " + << best_algorithm.ValueOrDie(); + } else { + VLOG(2) << "Autotune on GemmThunk " << this + << " unsuccessful. 
Will use generic gemm."; + } + } + + const StatusOr& best_algorithm = + autotune_it->second; + if (best_algorithm.ok()) { + auto algorithm = best_algorithm.ValueOrDie(); + VLOG(2) << "Using algorithm " << algorithm + << " chosen by autotuning on GemmThunk " << this; + return GetGemmWithAlgorithmFn(element_type)( + lhs_matrix, rhs_matrix, output_matrix, computation_type, algorithm, + stream, + /*output_profile_result=*/nullptr); + } + return GetGemmFn(element_type)(lhs_matrix, rhs_matrix, output_matrix, + stream); + }; + + bool launch_ok; if (output_shape_.layout().minor_to_major(0) == 0) { - return executor( + launch_ok = launch( lhs_descriptor, rhs_descriptor, MatrixDescriptor(output_data, false, output_num_rows, output_num_cols), stream); } else { - return executor( + launch_ok = launch( rhs_descriptor, lhs_descriptor, MatrixDescriptor(output_data, false, output_num_cols, output_num_rows), stream); } + + if (!launch_ok) { + return InternalError("Unable to launch cuBLAS gemm on stream %p", stream); + } + return tensorflow::Status::OK(); } } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.h b/tensorflow/compiler/xla/service/gpu/gemm_thunk.h index 7c8574d2752..983cb872924 100644 --- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.h @@ -37,11 +37,11 @@ class GemmThunk : public Thunk { // Constructs a thunk that computes "output = lhs rhs" using BLAS gemm. // transpose_lhs and transpose_rhs indicate whether gemm should transpose the // lhs and rhs operand. hlo_instruction is as in Thunk. - GemmThunk(BufferAllocation::Index lhs_buffer, - BufferAllocation::Index rhs_buffer, - BufferAllocation::Index output_buffer, const Shape& lhs_shape, - const Shape& rhs_shape, const Shape& output_shape, - bool transpose_lhs, bool transpose_rhs, + GemmThunk(const BufferAllocation::Slice& lhs_buffer, + const BufferAllocation::Slice& rhs_buffer, + const BufferAllocation::Slice& output_buffer, + const Shape& lhs_shape, const Shape& rhs_shape, + const Shape& output_shape, bool transpose_lhs, bool transpose_rhs, const HloInstruction* hlo_instruction); GemmThunk(const GemmThunk&) = delete; @@ -53,16 +53,24 @@ class GemmThunk : public Thunk { perftools::gputools::Stream* stream) override; private: - BufferAllocation::Index lhs_buffer_; - BufferAllocation::Index rhs_buffer_; - BufferAllocation::Index output_buffer_; + const BufferAllocation::Slice lhs_buffer_; + const BufferAllocation::Slice rhs_buffer_; + const BufferAllocation::Slice output_buffer_; - Shape lhs_shape_; - Shape rhs_shape_; - Shape output_shape_; + const Shape lhs_shape_; + const Shape rhs_shape_; + const Shape output_shape_; - bool transpose_lhs_; - bool transpose_rhs_; + const bool transpose_lhs_; + const bool transpose_rhs_; + + // Maps device names (StreamExecutor::DeviceDescription::name()) to autotune + // results. The map's value is the best algorithm we've found for this thunk + // on this device, or an error if none of the algorithms worked and we should + // use the regular gemm without an algorithm. + std::unordered_map> + autotune_results_; }; } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc index a7e5c5226f9..86137a569f9 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc @@ -24,10 +24,12 @@ limitations under the License. 
#include "external/llvm/include/llvm/IR/LLVMContext.h" #include "external/llvm/include/llvm/IR/Module.h" #include "tensorflow/compiler/xla/legacy_flags/gpu_compiler_flags.h" +#include "tensorflow/compiler/xla/protobuf_util.h" #include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/algebraic_simplifier.h" #include "tensorflow/compiler/xla/service/buffer_assignment.h" #include "tensorflow/compiler/xla/service/buffer_liveness.h" +#include "tensorflow/compiler/xla/service/flatten_call_graph.h" #include "tensorflow/compiler/xla/service/gpu/convolution_folding.h" #include "tensorflow/compiler/xla/service/gpu/copy_insertion.h" #include "tensorflow/compiler/xla/service/gpu/fusion_merger.h" @@ -42,15 +44,18 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/pad_insertion.h" #include "tensorflow/compiler/xla/service/gpu/partition_assignment.h" #include "tensorflow/compiler/xla/service/gpu/stream_assignment.h" -#include "tensorflow/compiler/xla/service/gpu/temp_buffer_offsets.h" #include "tensorflow/compiler/xla/service/gpu/thunk_schedule.h" +#include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_constant_folding.h" #include "tensorflow/compiler/xla/service/hlo_cse.h" #include "tensorflow/compiler/xla/service/hlo_dce.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_pass_fix.h" #include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h" +#include "tensorflow/compiler/xla/service/hlo_proto_util.h" #include "tensorflow/compiler/xla/service/hlo_subcomputation_unification.h" +#include "tensorflow/compiler/xla/service/hlo_verifier.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" #include "tensorflow/compiler/xla/service/reshape_mover.h" #include "tensorflow/compiler/xla/service/transpose_folding.h" @@ -79,6 +84,13 @@ const char* kTargetTriple = "nvptx64-nvidia-cuda"; // NVPTXTargetMachine.cpp. const char* kDataLayout = "e-i64:64-v16:16-v32:32-n16:32:64"; +// Any address of a variable residing in global memory or returned by one of the +// memory allocation routines from the driver or runtime API is always aligned +// to at least 256 bytes. +// +// http://docs.nvidia.com/cuda/cuda-c-programming-guide/#device-memory-accesses +constexpr int64 kMemoryAlignment = 256; + // Returns the directory containing nvvm libdevice files. This function is // called in GpuCompiler's constructor, so can't return an error. But // GpuCompiler::Compile will return an error when the wanted libdevice file @@ -114,6 +126,7 @@ tensorflow::Status OptimizeHloModule(HloModule* hlo_module, const se::DeviceDescription& device_desc) { { HloPassPipeline pipeline("optimization", dump_hlo); + pipeline.AddInvariantChecker(); { auto& pass = pipeline.AddPass>( "simplification", dump_hlo); @@ -121,10 +134,16 @@ tensorflow::Status OptimizeHloModule(HloModule* hlo_module, /*is_layout_sensitive=*/false, [](const Shape&, const Shape&) { return false; }); pass.AddPass(); + pass.AddPass(); } pipeline.AddPass(); - pipeline.AddPass(ImplementedAsGemm); - pipeline.AddPass(); + pipeline.AddPass( + [](const HloInstruction& dot, + const TransposeFolding::OperandIndices& candidate_operands) { + return ImplementedAsGemm(dot) ? 
candidate_operands + : TransposeFolding::OperandIndices{}; + }, + TransposeFolding::NeverFoldTranspose); pipeline.AddPass(/*is_layout_sensitive=*/false); pipeline.AddPass(); TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status()); @@ -141,17 +160,17 @@ tensorflow::Status OptimizeHloModule(HloModule* hlo_module, // Modifies the given HLO module so that it will be accepted by IrEmitter. // Unlike optimization passes, the passes are necessary for correctness. tensorflow::Status PrepareHloModuleForIrEmitting( - const Compiler::HloDumper& dump_hlo, HloModule* hlo_module, - HloModuleConfig* module_config) { + const Compiler::HloDumper& dump_hlo, HloModule* hlo_module) { // In some cases, we have to place the result of an instruction in a temporary // buffer. For instance, the buffer that holds an external parameter is // assumed immutable at this point, and should not be reused for output // (b/27180329). Therefore, in that case, we set the output to be a copy of // the parameter. HloPassPipeline pipeline("GPU-ir-emit-prepare", dump_hlo); + pipeline.AddInvariantChecker(); pipeline.AddPass(); pipeline.AddPass( - module_config->mutable_entry_computation_layout()); + hlo_module->mutable_entry_computation_layout()); // The LayoutAssignment pass may leave behind kCopy instructions which are // duplicate or NOPs, so remove them with algebraic simplification and CSE. pipeline.AddPass>( @@ -161,16 +180,20 @@ tensorflow::Status PrepareHloModuleForIrEmitting( // Copy insertion should be performed immediately before IR emission to avoid // inserting unnecessary copies (later pass adds an instruction which // materializes the value) or missing a necessary copy (later pass removes an - // instruction which materializes a value). + // instruction which materializes a value). DCE must be run immediately before + // (and sometime after) copy insertion, to avoid dead code from interfering + // with the rewrites. + pipeline.AddPass(); pipeline.AddPass(); pipeline.AddPass(); + pipeline.AddPass(); return pipeline.Run(hlo_module).status(); } // Invokes the ptxas tool on the given PTX string, and dumps its output. void DumpPtxasInfo(const string& ptx) { - legacy_flags::GpuCompilerFlags* flags = legacy_flags::GetGpuCompilerFlags(); - const string ptxas_path = flags->xla_ptxas_path; + const string ptxas_path = + tensorflow::io::JoinPath(tensorflow::CudaRoot(), "bin/ptxas"); // Do not log PTX stats if ptxas is not found at the given path. 
if (!tensorflow::Env::Default()->FileExists(ptxas_path).ok()) { LOG(WARNING) @@ -206,18 +229,18 @@ void DumpPtxasInfo(const string& ptx) { } // namespace -GpuCompiler::GpuCompiler() : libdevice_dir_(GetLibdeviceDir()) {} +GpuCompiler::GpuCompiler() + : libdevice_dir_(GetLibdeviceDir()), + pointer_size_(llvm::DataLayout(kDataLayout).getPointerSize()) {} StatusOr> GpuCompiler::Compile( - std::unique_ptr hlo_module, - std::unique_ptr module_config, HloDumper dump_hlo, + std::unique_ptr module, HloDumper dump_hlo, se::StreamExecutor* stream_exec) { TF_RET_CHECK(stream_exec != nullptr); - TF_RETURN_IF_ERROR(OptimizeHloModule(hlo_module.get(), dump_hlo, + TF_RETURN_IF_ERROR(OptimizeHloModule(module.get(), dump_hlo, stream_exec->GetDeviceDescription())); - TF_RETURN_IF_ERROR(PrepareHloModuleForIrEmitting(dump_hlo, hlo_module.get(), - module_config.get())); + TF_RETURN_IF_ERROR(PrepareHloModuleForIrEmitting(dump_hlo, module.get())); llvm::LLVMContext llvm_context; std::string buffer; @@ -230,42 +253,45 @@ StatusOr> GpuCompiler::Compile( }; llvm_context.setDiagnosticHandler(DiagnosticHandler, &printer); - llvm::Module llvm_module(hlo_module->name().c_str(), llvm_context); + llvm::Module llvm_module(module->name().c_str(), llvm_context); // Set the target triple and the data layout. llvm_module.setTargetTriple(kTargetTriple); llvm_module.setDataLayout(kDataLayout); - const llvm::DataLayout& data_layout = llvm_module.getDataLayout(); - int64 pointer_size = data_layout.getPointerSize(); // Determine the HLO schedule, which is an ordering of HLO instructions. This // is used by buffer assignment to enable buffer reuse, and the same ordering // must also be used to determine the thunk launch schedule. - std::unique_ptr stream_assignment = - AssignStreams(*hlo_module); - TF_ASSIGN_OR_RETURN(std::unique_ptr hlo_schedule, - HloSchedule::Build(*hlo_module, *stream_assignment)); + std::unique_ptr stream_assignment = AssignStreams(*module); + TF_ASSIGN_OR_RETURN( + std::unique_ptr hlo_schedule, + HloSchedule::Build(*module, *stream_assignment, pointer_size_)); // Run buffer analysis on the HLO graph. This analysis figures out which // temporary buffers are required to run the computation. 
TF_ASSIGN_OR_RETURN( std::unique_ptr buffer_assignment, - BufferAssigner::Run(hlo_module.get(), hlo_schedule->ConsumeHloOrdering(), - pointer_size)); - auto temp_buffer_offsets = MakeUnique(*buffer_assignment); + BufferAssigner::Run(module.get(), hlo_schedule->ConsumeHloOrdering(), + BufferSizeBytesFunction(), kMemoryAlignment)); - IrEmitterContext ir_emitter_context( - hlo_module.get(), buffer_assignment.get(), temp_buffer_offsets.get(), - &stream_exec->GetDeviceDescription(), &llvm_module); + legacy_flags::GpuCompilerFlags* flags = legacy_flags::GetGpuCompilerFlags(); + if (!flags->xla_gpu_dump_debug_json_to.empty()) { + HloProto proto = MakeHloProto(*module, *buffer_assignment); + TF_RETURN_IF_ERROR(protobuf_util::DumpJsonToDirectory( + proto, flags->xla_gpu_dump_debug_json_to, module->name())); + } - HloComputation* entry_computation = hlo_module->entry_computation(); - IrEmitterUnnested ir_emitter(*module_config, entry_computation, - module_config->has_hybrid_result(), + IrEmitterContext ir_emitter_context(module.get(), buffer_assignment.get(), + &stream_exec->GetDeviceDescription(), + &llvm_module); + + HloComputation* entry_computation = module->entry_computation(); + IrEmitterUnnested ir_emitter(module->config(), entry_computation, + module->config().has_hybrid_result(), &ir_emitter_context); TF_RETURN_IF_ERROR( entry_computation->root_instruction()->Accept(&ir_emitter)); string ir_module_string_before_opt; - legacy_flags::GpuCompilerFlags* flags = legacy_flags::GetGpuCompilerFlags(); if (VLOG_IS_ON(2) || flags->xla_gpu_embed_ir) { ir_module_string_before_opt = llvm_ir::DumpModuleToString(llvm_module); VLOG(2) << "LLVM module before optimizations:"; @@ -279,8 +305,16 @@ StatusOr> GpuCompiler::Compile( generated_ptxes_.emplace_back(MakeUnique()); ptx = generated_ptxes_.back().get(); } - TF_ASSIGN_OR_RETURN( - *ptx, CompileToPtx(&llvm_module, *module_config, libdevice_dir_)); + int cc_major, cc_minor; + if (!stream_exec->GetDeviceDescription().cuda_compute_capability(&cc_major, + &cc_minor)) { + LOG(WARNING) + << "Couldn't get compute capability for device; assuming sm_20."; + cc_major = 2; + cc_minor = 0; + } + TF_ASSIGN_OR_RETURN(*ptx, CompileToPtx(&llvm_module, {cc_major, cc_minor}, + module->config(), libdevice_dir_)); VLOG(2) << "LLVM module after optimizations:"; XLA_VLOG_LINES(2, llvm_ir::DumpModuleToString(llvm_module)); @@ -297,9 +331,8 @@ StatusOr> GpuCompiler::Compile( XLA_VLOG_LINES(2, thunk_schedule->ToString()); auto* gpu_executable = - new GpuExecutable(*ptx, std::move(thunk_schedule), std::move(hlo_module), - std::move(module_config), std::move(buffer_assignment), - std::move(temp_buffer_offsets)); + new GpuExecutable(*ptx, std::move(thunk_schedule), std::move(module), + std::move(buffer_assignment), ShapeSizeBytesFunction()); if (flags->xla_gpu_embed_ir) { DCHECK_NE("", ir_module_string_before_opt); gpu_executable->set_ir_module_string(ir_module_string_before_opt); @@ -308,9 +341,8 @@ StatusOr> GpuCompiler::Compile( } StatusOr>> GpuCompiler::Compile( - std::vector> hlo_modules, - std::vector> module_configs, - HloDumper dump_hlos, std::vector stream_execs) { + std::vector> modules, HloDumper dump_hlos, + std::vector stream_execs) { return Unimplemented( "Compilation of multiple HLO modules is not yet supported on GPU."); } @@ -318,7 +350,6 @@ StatusOr>> GpuCompiler::Compile( StatusOr>> GpuCompiler::CompileAheadOfTime( std::vector> module, - std::vector> module_config, HloDumper dump_hlo, const AotCompilationOptions& options) { return Unimplemented("not yet 
implemented: GpuCompiler::CompileAheadOfTime"); } diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h index a074607760f..da52f5ab1f8 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h @@ -23,7 +23,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/compiler.h" #include "tensorflow/compiler/xla/service/executable.h" #include "tensorflow/compiler/xla/service/hlo_module.h" -#include "tensorflow/compiler/xla/service/hlo_module_config.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/lib/gtl/array_slice.h" @@ -42,24 +41,28 @@ class GpuCompiler : public Compiler { ~GpuCompiler() override {} StatusOr> Compile( - std::unique_ptr hlo_module, - std::unique_ptr module_config, HloDumper dump_hlo, + std::unique_ptr module, HloDumper dump_hlo, perftools::gputools::StreamExecutor* stream_exec) override; StatusOr>> Compile( - std::vector> hlo_module, - std::vector> module_config, - HloDumper dump_hlo, + std::vector> modules, HloDumper dump_hlo, std::vector stream_exec) override; StatusOr>> CompileAheadOfTime( std::vector> module, - std::vector> module_config, HloDumper dump_hlo, AotCompilationOptions const& options) override; perftools::gputools::Platform::Id PlatformId() const override; + HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override { + // Capture just the pointer size, not the entire GpuCompiler object. + int64 pointer_size = pointer_size_; + return [pointer_size](const Shape& shape) { + return ShapeUtil::ByteSizeOf(shape, pointer_size); + }; + } + private: // The parent directory of libdevice IR libraries. const string libdevice_dir_; @@ -70,6 +73,9 @@ class GpuCompiler : public Compiler { tensorflow::mutex mutex_; std::vector> generated_ptxes_ GUARDED_BY(mutex_); + // The size in bytes of a pointer. Used by ShapeSizeBytesFunction. + int64 pointer_size_; + TF_DISALLOW_COPY_AND_ASSIGN(GpuCompiler); }; diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc index f654ffd22d5..7f9e60460c2 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc @@ -47,8 +47,12 @@ class HloExecutionProfiler { public: // If profiling is enabled, start an execution timer running. 
explicit HloExecutionProfiler(bool do_profile, HloExecutionProfile* profile, - se::Stream* stream) - : do_profile_(do_profile), profile_(profile), stream_(stream) { + se::Stream* stream, + const HloComputation* computation) + : do_profile_(do_profile), + profile_(profile), + stream_(stream), + computation_(computation) { if (do_profile_) { clock_rate_ghz_ = stream->parent()->GetDeviceDescription().clock_rate_ghz(); @@ -66,8 +70,8 @@ class HloExecutionProfiler { if (do_profile_) { stream_->ThenStopTimer(execution_timer_.get()); stream_->BlockHostUntilDone(); - profile_->set_total_cycles_executed(execution_timer_->Nanoseconds() * - clock_rate_ghz_); + profile_->set_total_cycles_executed( + *computation_, execution_timer_->Nanoseconds() * clock_rate_ghz_); } } @@ -94,6 +98,7 @@ class HloExecutionProfiler { double clock_rate_ghz_; HloExecutionProfile* profile_; se::Stream* stream_; + const HloComputation* computation_; std::unique_ptr execution_timer_; std::unique_ptr per_op_timer_; }; @@ -105,30 +110,33 @@ class HloExecutionProfiler { GpuExecutable::GpuExecutable( tensorflow::StringPiece ptx, std::unique_ptr thunk_schedule, std::unique_ptr hlo_module, - std::unique_ptr module_config, std::unique_ptr assignment, - std::unique_ptr temp_buffer_offsets) - : Executable(std::move(hlo_module), std::move(module_config)), + HloCostAnalysis::ShapeSizeFunction shape_size_function) + : Executable(std::move(hlo_module), std::move(shape_size_function)), ptx_(ptx), thunk_schedule_(std::move(thunk_schedule)), - assignment_(std::move(assignment)), - temp_buffer_offsets_(std::move(temp_buffer_offsets)) {} + assignment_(std::move(assignment)) {} Status GpuExecutable::ExecuteThunks( - se::Stream* main_stream, const BufferAllocations& buffer_allocations, + const ServiceExecutableRunOptions* run_options, + const BufferAllocations& buffer_allocations, bool block_host_until_done, HloExecutionProfile* hlo_execution_profile) { + se::Stream* main_stream = run_options->stream(); + bool do_profile = hlo_execution_profile != nullptr; if (do_profile) { LOG(WARNING) << "PROFILING: profiling is enabled"; } - HloExecutionProfiler profiler(do_profile, hlo_execution_profile, main_stream); + HloExecutionProfiler profiler(do_profile, hlo_execution_profile, main_stream, + hlo_module_->entry_computation()); - std::vector> sub_streams; // Stream 0 indicates `main_stream` and substreams start from stream 1. - for (int32 i = 1; i < thunk_schedule_->StreamCount(); ++i) { - auto sub_stream = MakeUnique(main_stream->parent()); - sub_stream->Init(); - sub_streams.emplace_back(std::move(sub_stream)); + std::vector::SmartPtr> sub_streams; + while (sub_streams.size() + 1 < thunk_schedule_->StreamCount()) { + sub_streams.emplace_back(); + TF_ASSIGN_OR_RETURN( + sub_streams.back(), + run_options->BorrowStream(main_stream->parent()->device_ordinal())); } std::map> thunk_to_finish_event; @@ -160,7 +168,7 @@ Status GpuExecutable::ExecuteThunks( // Make sure kernels are completed before deallocating temporary buffers. // TODO(b/30100571): we could potentially postpone deallocating the temp // buffers until a different computation is executed. 
- if (!main_stream->BlockHostUntilDone()) { + if (block_host_until_done && !main_stream->BlockHostUntilDone()) { return InternalError("Failed to complete all kernels launched on stream %p", main_stream); } @@ -169,7 +177,7 @@ Status GpuExecutable::ExecuteThunks( } StatusOr GpuExecutable::ExecuteOnStream( - const ExecutableRunOptions* run_options, + const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments, HloExecutionProfile* hlo_execution_profile) { se::Stream* stream = run_options->stream(); @@ -188,19 +196,22 @@ StatusOr GpuExecutable::ExecuteOnStream( } } se::StreamExecutor* executor = stream->parent(); - TF_ASSIGN_OR_RETURN(auto buffer_allocations, - buffer_allocations_builder.Build( - *assignment_, *temp_buffer_offsets_, - executor->device_ordinal(), memory_allocator)); + TF_ASSIGN_OR_RETURN( + auto buffer_allocations, + buffer_allocations_builder.Build(*assignment_, executor->device_ordinal(), + memory_allocator)); - TF_RETURN_IF_ERROR( - ExecuteThunks(stream, *buffer_allocations, hlo_execution_profile)); + bool block_host_until_done = + !memory_allocator->AllowsAsynchronousDeallocation(); + TF_RETURN_IF_ERROR(ExecuteThunks(run_options, *buffer_allocations, + block_host_until_done, + hlo_execution_profile)); HloInstruction* root = hlo_module_->entry_computation()->root_instruction(); - TF_ASSIGN_OR_RETURN(const BufferAllocation* output_allocation, - assignment_->GetUniqueTopLevelOutputAllocation()); + TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice output_slice, + assignment_->GetUniqueTopLevelOutputSlice()); se::DeviceMemoryBase output_buffer_address = - buffer_allocations->GetDeviceAddress(output_allocation->index()); + buffer_allocations->GetDeviceAddress(output_slice.index()); if (ShapeUtil::IsTuple(root->shape())) { std::set referred_by_output; @@ -217,21 +228,21 @@ StatusOr GpuExecutable::ExecuteOnStream( // The points-to set of the root is unambiguous so it's known statically // which buffers are in the result. Gather these buffers using the root's // points-to set. - TF_RETURN_IF_ERROR(GetRootPointsToSet().ForEachElement( + TF_RETURN_IF_ERROR(GetRootPointsToSet().ForEachElementWithStatus( [&referred_by_output, &buffer_allocations, this]( - const ShapeIndex& /*index*/, bool /*is_leaf*/, + const ShapeIndex& /*index*/, const std::vector& buffers) { // The points to set is unambiguous so the set should be a // singleton. That is, we know exactly which instruction produced // the array at this element. 
CHECK_EQ(1, buffers.size()); HloInstruction* hlo = buffers[0]->instruction(); - TF_ASSIGN_OR_RETURN(const BufferAllocation* allocation, - this->assignment_->GetUniqueAllocation( - hlo, buffers[0]->index())); - CHECK(!allocation->is_entry_computation_parameter()); + TF_ASSIGN_OR_RETURN( + const BufferAllocation::Slice slice, + this->assignment_->GetUniqueSlice(hlo, buffers[0]->index())); + CHECK(!slice.allocation()->is_entry_computation_parameter()); referred_by_output.insert( - buffer_allocations->GetDeviceAddress(allocation->index())); + buffer_allocations->GetDeviceAddress(slice.index())); return Status::OK(); })); } @@ -247,10 +258,9 @@ StatusOr GpuExecutable::ExecuteOnStream( } StatusOr> GpuExecutable::ExecuteOnStream( - const ExecutableRunOptions* run_options, + const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments, HloExecutionProfile* hlo_execution_profile) { - se::Stream* stream = run_options->stream(); DeviceMemoryAllocator* memory_allocator = run_options->allocator(); // This ExecuteOnStream overload should only be called by the LocalService // which sets has_hybrid_result to true. @@ -273,14 +283,17 @@ StatusOr> GpuExecutable::ExecuteOnStream( i, arguments[param_no]->buffer(/*index=*/{})); } } - se::StreamExecutor* executor = stream->parent(); - TF_ASSIGN_OR_RETURN(auto buffer_allocations, - buffer_allocations_builder.Build( - *assignment_, *temp_buffer_offsets_, - executor->device_ordinal(), memory_allocator)); + se::StreamExecutor* executor = run_options->stream()->parent(); + TF_ASSIGN_OR_RETURN( + auto buffer_allocations, + buffer_allocations_builder.Build(*assignment_, executor->device_ordinal(), + memory_allocator)); - TF_RETURN_IF_ERROR( - ExecuteThunks(stream, *buffer_allocations, hlo_execution_profile)); + bool block_host_until_done = + !memory_allocator->AllowsAsynchronousDeallocation(); + TF_RETURN_IF_ERROR(ExecuteThunks(run_options, *buffer_allocations, + block_host_until_done, + hlo_execution_profile)); HloInstruction* root = hlo_module_->entry_computation()->root_instruction(); auto device_ordinal = executor->device_ordinal(); @@ -293,10 +306,10 @@ StatusOr> GpuExecutable::ExecuteOnStream( std::set buffers_in_result; TF_RETURN_IF_ERROR( shaped_buffer->mutable_shape_index_to_buffer_entry() - ->ForEachMutableElement( + ->ForEachMutableElementWithStatus( [&buffer_allocations, &buffers_in_result, &shaped_buffer, this]( - const ShapeIndex& index, bool is_leaf, size_t* buffer_entry) { - if (is_leaf) { + const ShapeIndex& index, size_t* buffer_entry) { + if (ShapeUtil::IsLeafIndex(shaped_buffer->shape(), index)) { const std::vector& sources = this->GetRootPointsToSet().element(index); // The points to set is unambiguous so the set should be a @@ -309,13 +322,13 @@ StatusOr> GpuExecutable::ExecuteOnStream( // The source instruction should have a non-parameter buffer // assigned. 
- TF_ASSIGN_OR_RETURN(const BufferAllocation* allocation, - this->assignment_->GetUniqueAllocation( + TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice slice, + this->assignment_->GetUniqueSlice( src_hlo, sources[0]->index())); - CHECK(!allocation->is_entry_computation_parameter()); + CHECK(!slice.allocation()->is_entry_computation_parameter()); perftools::gputools::DeviceMemoryBase src_base = - buffer_allocations->GetDeviceAddress(allocation->index()); + buffer_allocations->GetDeviceAddress(slice.index()); CHECK(!src_base.is_null() || src_base.size() == 0); shaped_buffer->mutable_buffers()->push_back(src_base); *buffer_entry = shaped_buffer->mutable_buffers()->size() - 1; @@ -330,115 +343,8 @@ StatusOr> GpuExecutable::ExecuteOnStream( return std::move(shaped_buffer); } -Status GpuExecutable::ExecuteOnStream( - const ExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice arguments, - ShapedBuffer* result_buffer, HloExecutionProfile* hlo_execution_profile) { - se::Stream* stream = run_options->stream(); - DeviceMemoryAllocator* memory_allocator = run_options->allocator(); - // This ExecuteOnStream overload should only be called by the LocalService - // which sets has_hybrid_result to true. - TF_RET_CHECK(module_config().has_hybrid_result()); - - // Every array element in the result of the computation must be unambiguously - // produced by a single instruction. - // This ensures that the buffers inside result_buffer can be assigned without - // conflict to the respective instructions because there is a one-to-one - // correspondence between hlo instructions and array buffers in the result. - if (GetRootPointsToSet().IsAmbiguous()) { - return Unimplemented( - "Points-to set of root instruction is ambiguous or not distinct"); - } - - DCHECK(ShapeUtil::Compatible(result_buffer->shape(), result_shape())); - - BufferAllocations::Builder buffer_allocations_builder; - for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size(); - ++i) { - const BufferAllocation& allocation = assignment_->GetAllocation(i); - if (allocation.is_entry_computation_parameter()) { - auto param_no = allocation.parameter_number(); - if (ShapeUtil::IsTuple(arguments[param_no]->shape())) { - return Unimplemented("Tuple ShapedBuffer arguments not supported"); - } - buffer_allocations_builder.RegisterBuffer( - i, arguments[param_no]->buffer(/*index=*/{})); - } - } - - // If two tuple elements point to the same buffer, one of the results in the - // result buffer is considered the canonical location while the other result - // points to it (instead of, say, making a copy of the result). - // buffer_index_to_shape_index maps a buffer index to its canonical location - // in the result buffer. - std::unordered_map - buffer_index_to_shape_index; - - // Register DeviceMemoryBase values in result_buffer to their corresponding - // buffer indices. These buffers will not be allocated in the call to - // BufferAllocationsBuilder::Build. - std::set buffers_in_result; - TF_RETURN_IF_ERROR( - result_buffer->mutable_shape_index_to_buffer_entry() - ->ForEachMutableElement( - [&buffer_allocations_builder, &buffers_in_result, - &buffer_index_to_shape_index, result_buffer, this]( - const ShapeIndex& index, bool is_leaf, size_t* buffer_entry) { - if (is_leaf) { - const std::vector& sources = - this->GetRootPointsToSet().element(index); - // The points to set is unambiguous so the set should be a - // singleton. That is, we know exactly which instruction - // produced the array at this element. 
- CHECK_EQ(1, sources.size()); - auto src_hlo = sources[0]->instruction(); - - VLOG(4) << "Looking at: " << sources[0]; - - // The source instruction should have a non-parameter buffer - // assigned. - TF_ASSIGN_OR_RETURN(const BufferAllocation* allocation, - this->assignment_->GetUniqueAllocation( - src_hlo, sources[0]->index())); - CHECK(!allocation->is_entry_computation_parameter()); - - auto insert_result = buffer_index_to_shape_index.emplace( - allocation->index(), *buffer_entry); - if (insert_result.second) { - // The points-to set is distinct so this buffer should not - // have been assigned in a previous invocation of this - // lambda. - perftools::gputools::DeviceMemoryBase memory_base = - result_buffer->buffer(index); - CHECK(!memory_base.is_null()); - buffer_allocations_builder.RegisterBuffer( - allocation->index(), memory_base); - buffers_in_result.insert(memory_base); - } else { - // Record the fact that this tuple element is identical to - // some - // prior result. - *buffer_entry = insert_result.first->second; - } - } - return Status::OK(); - })); - - se::StreamExecutor* executor = stream->parent(); - auto device_ordinal = executor->device_ordinal(); - TF_ASSIGN_OR_RETURN( - auto buffer_allocations, - buffer_allocations_builder.Build(*assignment_, *temp_buffer_offsets_, - device_ordinal, memory_allocator)); - - TF_RETURN_IF_ERROR( - ExecuteThunks(stream, *buffer_allocations, hlo_execution_profile)); - - return buffer_allocations->TearDown(buffers_in_result, *assignment_); -} - StatusOr GpuExecutable::ExecuteAsyncOnStream( - const ExecutableRunOptions* run_options, + const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments) { // TODO(b/30671675): Implement asynchronous execution mode. return Unimplemented( diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h index 2343d264dee..e1a55118fc7 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h @@ -24,12 +24,10 @@ limitations under the License. #include "tensorflow/compiler/xla/service/executable.h" #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h" #include "tensorflow/compiler/xla/service/gpu/stream_assignment.h" -#include "tensorflow/compiler/xla/service/gpu/temp_buffer_offsets.h" #include "tensorflow/compiler/xla/service/gpu/thunk.h" #include "tensorflow/compiler/xla/service/gpu/thunk_schedule.h" #include "tensorflow/compiler/xla/service/hlo_execution_profile.h" #include "tensorflow/compiler/xla/service/hlo_module.h" -#include "tensorflow/compiler/xla/service/hlo_module_config.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h" #include "tensorflow/compiler/xla/statusor.h" @@ -52,9 +50,8 @@ class GpuExecutable : public Executable { GpuExecutable(tensorflow::StringPiece ptx, std::unique_ptr thunk_schedule, std::unique_ptr hlo_module, - std::unique_ptr module_config, std::unique_ptr assignment, - std::unique_ptr temp_buffer_offsets); + HloCostAnalysis::ShapeSizeFunction shape_size_function); // This should be called after set_ir_module_string. 
const string& ir_module_string() const { return ir_module_string_; } @@ -68,30 +65,30 @@ class GpuExecutable : public Executable { tensorflow::StringPiece ptx() const { return ptx_; } StatusOr ExecuteOnStream( - const ExecutableRunOptions* run_options, + const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments, HloExecutionProfile* hlo_execution_profile) override; StatusOr> ExecuteOnStream( - const ExecutableRunOptions* run_options, + const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments, HloExecutionProfile* hlo_execution_profile) override; - Status ExecuteOnStream( - const ExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice arguments, - ShapedBuffer* result_buffer, - HloExecutionProfile* hlo_execution_profile) override; - StatusOr ExecuteAsyncOnStream( - const ExecutableRunOptions* run_options, + const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments) override; private: - Status ExecuteThunks(perftools::gputools::Stream* stream, + // If `block_host_until_done` is false, execution will not block the host + // until the kernels have completed. This is used as an optimization for + // clients, such as Tensorflow, that use a single stream of execution for + // computations, and allow host-side deallocation from the allocator before + // GPU execution completes. + Status ExecuteThunks(const ServiceExecutableRunOptions* run_options, const BufferAllocations& buffer_allocations, + bool block_host_until_done, HloExecutionProfile* hlo_execution_profile); // Returns the points-to set of the root instruction of the entry @@ -117,10 +114,6 @@ class GpuExecutable : public Executable { // memory for every output/temp buffers. const std::unique_ptr assignment_; - // Owns the mapping from temporary buffers to their offsets in the temp-buffer - // memory block. - const std::unique_ptr temp_buffer_offsets_; - TF_DISALLOW_COPY_AND_ASSIGN(GpuExecutable); }; diff --git a/tensorflow/compiler/xla/service/gpu/hlo_schedule.cc b/tensorflow/compiler/xla/service/gpu/hlo_schedule.cc index 404a53e13b7..d16a1d4ee5b 100644 --- a/tensorflow/compiler/xla/service/gpu/hlo_schedule.cc +++ b/tensorflow/compiler/xla/service/gpu/hlo_schedule.cc @@ -36,26 +36,42 @@ class GpuHloOrdering : public PredecessorHloOrdering { const std::vector& thunk_launch_order); ~GpuHloOrdering() override = default; + // Only the entry computation can possibly be sequentially ordered, and only + // if we've assigned all instructions to a single stream. + const std::vector* SequentialOrder( + const HloComputation& computation) const override { + return &computation == module_->entry_computation() ? entry_sequence_.get() + : nullptr; + } + string ToString() const override { return ToStringHelper("GpuHloOrdering"); } + + private: + std::unique_ptr> entry_sequence_; }; GpuHloOrdering::GpuHloOrdering( const HloModule* module, const StreamAssignment& stream_assignment, const std::vector& thunk_launch_order) : PredecessorHloOrdering(module) { + // The entry computation has a total order when there's only one stream. + if (stream_assignment.StreamCount() == 1) { + entry_sequence_ = + MakeUnique>(thunk_launch_order); + } + // The ordering of instructions for the entry computation is determined by the // total order of thunk launches, and stream assignment. Instructions are // sequential within a stream and concurrent across streams. 
In addition, the // GpuExecutable adds cross-stream dependency edges to ensure each instruction // waits for its operands before executing. // - // The predecessor map is built incrementally, in thunk launch - // order. We record the instructions already visited per stream in - // 'instructions_per_stream'. This lets us quickly determine the - // same-stream predecessors of each instruction. To capture - // cross-stream dependency edges, we use the predecessor map to - // insert each operand as well as its transitive closure of - // dependencies. + // The predecessor map is built incrementally, in thunk launch order. We + // record the instructions already visited per stream in + // 'instructions_per_stream'. This lets us quickly determine the same-stream + // predecessors of each instruction. To capture cross-stream dependency edges, + // we use the predecessor map to insert each operand as well as its transitive + // closure of dependencies. // Compute the set of all instructions we will want to set reachability on auto predecessor_map = MakeUnique( @@ -98,12 +114,9 @@ GpuHloOrdering::GpuHloOrdering( // dependencies. I.e. the strict predecessors of each subcomputation // instruction is its transitive operands. // - // TODO(toddw): Each subcomputation is actually emitted as a function in - // DFS - // postorder, so we can do better and establish the total order here. We - // don't - // do that yet since it's hard to ensure that the order here is the order - // used + // TODO(toddw): Each subcomputation is actually emitted as a function in DFS + // postorder, so we can do better and establish the total order here. We don't + // do that yet since it's hard to ensure that the order here is the order used // by IrEmitterNested. And mismatched ordering bugs would be hard to find. for (auto& computation : module->computations()) { if (computation.get() != module->entry_computation()) { @@ -113,20 +126,6 @@ GpuHloOrdering::GpuHloOrdering( } } -// Computes a topological launch_order based on depth-first order, visiting -// operands in essentially an arbitrary order. -// -// TODO(b/32006145): Use an ordering that minimizes memory pressure. -tensorflow::Status DFSLaunchOrder( - const HloComputation* computation, - std::vector* launch_order) { - return computation->root_instruction()->Accept( - [launch_order](HloInstruction* hlo) { - launch_order->push_back(hlo); - return tensorflow::Status::OK(); - }); -} - // Computes a topological launch_order that is close to a breadth-first // order. This heuristic works well for graphs where concurrent kernels are // located at the same layer. It can often reduce dependency between concurrent @@ -187,19 +186,24 @@ HloSchedule::HloSchedule() {} /* static */ StatusOr> HloSchedule::Build( - const HloModule& module, const StreamAssignment& stream_assignment) { + const HloModule& module, const StreamAssignment& stream_assignment, + int64 pointer_size) { std::unique_ptr schedule(new HloSchedule); // Initialize thunk_launch_order_, the total order of thunk launches. - const HloComputation* computation = module.entry_computation(); + const HloComputation* entry_computation = module.entry_computation(); if (stream_assignment.StreamCount() == 1) { - // DFS tends to increase buffer reuse, reducing memory usage. All kernels - // are launched on a single stream, so there's no loss of concurrency. 
- TF_RETURN_IF_ERROR( - DFSLaunchOrder(computation, &schedule->thunk_launch_order_)); + // All kernels are launched on a single stream, so there's no loss of + // concurrency by optimizing for minimal memory usage. + TF_ASSIGN_OR_RETURN( + schedule->thunk_launch_order_, + CreateMemoryMinimizingSequence( + *entry_computation, [pointer_size](const LogicalBuffer& buffer) { + return ShapeUtil::ByteSizeOf(buffer.shape(), pointer_size); + })); } else { // BFS tends to increase concurrency, but also increases memory usage. - BFSLaunchOrder(computation, &schedule->thunk_launch_order_); + BFSLaunchOrder(entry_computation, &schedule->thunk_launch_order_); } schedule->hlo_ordering_ = MakeUnique( diff --git a/tensorflow/compiler/xla/service/gpu/hlo_schedule.h b/tensorflow/compiler/xla/service/gpu/hlo_schedule.h index 42d9051aede..773973010a4 100644 --- a/tensorflow/compiler/xla/service/gpu/hlo_schedule.h +++ b/tensorflow/compiler/xla/service/gpu/hlo_schedule.h @@ -39,7 +39,8 @@ class HloSchedule { // Constructs an HloSchedule for the given module, based on the given stream // assignment. static StatusOr> Build( - const HloModule& module, const StreamAssignment& stream_assignment); + const HloModule& module, const StreamAssignment& stream_assignment, + int64 pointer_size); // Returns the total order of thunk launches, represented in terms of HLO // instructions. diff --git a/tensorflow/compiler/xla/service/gpu/hlo_schedule_test.cc b/tensorflow/compiler/xla/service/gpu/hlo_schedule_test.cc index 70628f11917..118ef18c44b 100644 --- a/tensorflow/compiler/xla/service/gpu/hlo_schedule_test.cc +++ b/tensorflow/compiler/xla/service/gpu/hlo_schedule_test.cc @@ -15,6 +15,9 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/hlo_schedule.h" +#include +#include + #include "tensorflow/compiler/xla/service/gpu/stream_assignment.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -28,10 +31,27 @@ namespace gpu { class HloScheduleTest : public HloTestBase { protected: - typedef std::vector hlovec; + using HloVec = std::vector; // Pre-canned shapes. Shape f32_2x2_ = ShapeUtil::MakeShape(F32, {2, 2}); + + static std::unique_ptr BuildHloSchedule( + const HloModule& module, const StreamAssignment& streams) { + return HloSchedule::Build(module, streams, /*pointer_size=*/8) + .ConsumeValueOrDie(); + } + + HloVec RemoveHlo(const HloVec& input, + const std::unordered_set& remove) { + HloVec result(input); + result.erase(std::remove_if(result.begin(), result.end(), + [&remove](const HloInstruction* x) { + return remove.count(x) > 0; + }), + result.end()); + return result; + } }; // Test of a single stream, where data dependencies fully determine the @@ -49,15 +69,17 @@ TEST_F(HloScheduleTest, SequentialMatMul) { HloInstruction* dot2 = builder.AddInstruction( HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, dot1, z)); - HloModule module(TestName()); - module.AddEntryComputation(builder.Build(dot2)); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build(dot2)); - std::unique_ptr streams = AssignStreams(module); + std::unique_ptr streams = AssignStreams(*module); EXPECT_EQ(streams->StreamNumberForHlo(*dot1), streams->StreamNumberForHlo(*dot2)); - auto schedule = HloSchedule::Build(module, *streams).ConsumeValueOrDie(); - EXPECT_EQ(schedule->ThunkLaunchOrder(), hlovec({x, y, dot1, z, dot2})); + auto schedule = BuildHloSchedule(*module, *streams); + // Remove parameters, which are unordered. 
+ EXPECT_EQ(RemoveHlo(schedule->ThunkLaunchOrder(), {x, y, z}), + HloVec({dot1, dot2})); // Parameters x,y,z are mutually unordered, while dot1 and dot2 are // transitively ordered by operands. @@ -107,17 +129,19 @@ TEST_F(HloScheduleTest, SequentialAdd) { HloInstruction* add3 = builder.AddInstruction( HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kAdd, add1, add2)); - HloModule module(TestName()); - module.AddEntryComputation(builder.Build(add3)); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build(add3)); - std::unique_ptr streams = AssignStreams(module); + std::unique_ptr streams = AssignStreams(*module); EXPECT_EQ(streams->StreamNumberForHlo(*add1), streams->StreamNumberForHlo(*add2)); EXPECT_EQ(streams->StreamNumberForHlo(*add1), streams->StreamNumberForHlo(*add3)); - auto schedule = HloSchedule::Build(module, *streams).ConsumeValueOrDie(); - EXPECT_EQ(schedule->ThunkLaunchOrder(), hlovec({x, y, add1, z, add2, add3})); + auto schedule = BuildHloSchedule(*module, *streams); + // Remove parameters, which are unordered. + EXPECT_EQ(RemoveHlo(schedule->ThunkLaunchOrder(), {x, y, z}), + HloVec({add1, add2, add3})); // Parameters x,y,z are mutually unordered, while add1, add2 and add3 are // transitively ordered by operands. @@ -175,16 +199,18 @@ TEST_F(HloScheduleTest, ConcurrentMatMul) { HloInstruction* add = builder.AddInstruction( HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kAdd, dot1, dot2)); - HloModule module(TestName()); - module.AddEntryComputation(builder.Build(add)); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build(add)); - std::unique_ptr streams = AssignStreams(module); + std::unique_ptr streams = AssignStreams(*module); EXPECT_NE(streams->StreamNumberForHlo(*dot1), streams->StreamNumberForHlo(*dot2)); - auto schedule = HloSchedule::Build(module, *streams).ConsumeValueOrDie(); - EXPECT_TRUE(schedule->ThunkLaunchOrder() == hlovec({x, y, dot1, dot2, add}) || - schedule->ThunkLaunchOrder() == hlovec({x, y, dot2, dot1, add})); + auto schedule = BuildHloSchedule(*module, *streams); + // Remove parameters, which are unordered. + HloVec thunk_launch_order = RemoveHlo(schedule->ThunkLaunchOrder(), {x, y}); + EXPECT_TRUE(thunk_launch_order == HloVec({dot1, dot2, add}) || + thunk_launch_order == HloVec({dot2, dot1, add})); // Parameters x,y are mutually unordered, while dot1, dot2 and add are // transitively ordered by operands. @@ -228,6 +254,7 @@ TEST_F(HloScheduleTest, LatticeMatMul) { // d40 -- layer 4 HloComputation::Builder builder("entry_computation"); std::vector params; + params.reserve(6); for (int i = 0; i < 6; ++i) { params.push_back(builder.AddInstruction(HloInstruction::CreateParameter( i, f32_2x2_, /*name=*/tensorflow::strings::Printf("param%d", i)))); @@ -251,10 +278,10 @@ TEST_F(HloScheduleTest, LatticeMatMul) { HloInstruction* d40 = builder.AddInstruction( HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, d30, d31)); - HloModule module(TestName()); - module.AddEntryComputation(builder.Build(d40)); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build(d40)); - std::unique_ptr streams = AssignStreams(module); + std::unique_ptr streams = AssignStreams(*module); // The two dots on layer 1 are concurrent. EXPECT_NE(streams->StreamNumberForHlo(*d10), streams->StreamNumberForHlo(*d11)); @@ -271,12 +298,12 @@ TEST_F(HloScheduleTest, LatticeMatMul) { // We don't check the thunk launch order, since there are many valid total // orders, and it's annoying to express. 
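Where two launch orders are equally valid, as with the two concurrent dots above, the tests enumerate the acceptable sequences. An alternative for cases with many valid total orders, shown here purely as an illustrative sketch with toy node ids, is to assert the dependency partial order directly:

```cpp
// Sketch: verify that a schedule respects operand-before-user edges instead
// of comparing it against one exact launch order.
#include <cassert>
#include <unordered_map>
#include <utility>
#include <vector>

bool RespectsDependencies(
    const std::vector<int>& order,
    const std::vector<std::pair<int, int>>& operand_user_edges) {
  std::unordered_map<int, int> position;
  for (int i = 0; i < static_cast<int>(order.size()); ++i) {
    position[order[i]] = i;
  }
  for (const auto& edge : operand_user_edges) {
    if (position[edge.first] >= position[edge.second]) return false;
  }
  return true;
}

int main() {
  // Two dots on different streams may appear in either order, as long as
  // both precede the add that consumes them.
  std::vector<std::pair<int, int>> edges = {{0, 2}, {1, 2}};
  assert(RespectsDependencies({0, 1, 2}, edges));
  assert(RespectsDependencies({1, 0, 2}, edges));
  assert(!RespectsDependencies({2, 0, 1}, edges));
  return 0;
}
```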
- auto schedule = HloSchedule::Build(module, *streams).ConsumeValueOrDie(); + auto schedule = BuildHloSchedule(*module, *streams); auto order = schedule->ConsumeHloOrdering(); - const hlovec all_params( + const HloVec all_params( {params[0], params[1], params[2], params[3], params[4], params[5]}); - const hlovec all_ops({d00, d10, d11, d20, d21, d22, d30, d31, d40}); + const HloVec all_ops({d00, d10, d11, d20, d21, d22, d30, d31, d40}); // Parameters are mutually unordered, and never execute before ops. for (const HloInstruction* param : all_params) { @@ -366,3 +393,7 @@ TEST_F(HloScheduleTest, LatticeMatMul) { } // namespace gpu } // namespace xla + +int main(int argc, char** argv) { + return xla::ParseDebugOptionsFlagsAndRunTests(argc, argv); +} diff --git a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc index accc406c76f..1a61eec3537 100644 --- a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc +++ b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc @@ -41,7 +41,7 @@ void HloToIrBindings::EmitBasePointersForHlos( // operand HLOs are already bound to avoid rebinding the same HLO. std::set already_bound_for_this_function; auto arg_iter = function->arg_begin(); - for (const auto* io_hlo : io_hlos) { + for (const HloInstruction* io_hlo : io_hlos) { if (!already_bound_for_this_function.count(io_hlo)) { if (!is_nested_ && io_hlo->opcode() == HloOpcode::kGetTupleElement) { BindHloToIrValue(*io_hlo, EmitGetTupleElement(io_hlo, &*arg_iter)); @@ -56,7 +56,7 @@ void HloToIrBindings::EmitBasePointersForHlos( temp_buffer_base_ = &*arg_iter; temp_buffer_base_->setName("temp_buffer"); - for (auto* non_io_hlo : non_io_hlos) { + for (const HloInstruction* non_io_hlo : non_io_hlos) { if (already_bound_for_this_function.count(non_io_hlo)) { continue; } @@ -65,13 +65,13 @@ void HloToIrBindings::EmitBasePointersForHlos( if (non_io_hlo->opcode() == HloOpcode::kGetTupleElement) { if (!is_nested_) { // Lookup allocation GetTupleElement operand. - const BufferAllocation* allocation = + const BufferAllocation::Slice slice = buffer_assignment_ - ->GetUniqueTopLevelAllocation(LatestNonGteAncestor(non_io_hlo)) + ->GetUniqueTopLevelSlice(LatestNonGteAncestor(non_io_hlo)) .ConsumeValueOrDie(); // We are not in a nested context, so check non-thread-local allocation. - CHECK(!allocation->is_thread_local()); - int64 offset = temp_buffer_offsets_->GetOffset(allocation->index()); + CHECK(!slice.allocation()->is_thread_local()); + const int64 offset = slice.offset(); CHECK_NE(nullptr, temp_buffer_base_); // Emit IR for GetTupleElement instruction and bind to emitted value. llvm::Value* base_ptr = ir_builder_->CreateInBoundsGEP( @@ -89,15 +89,15 @@ void HloToIrBindings::EmitBasePointersForHlos( // A non-IO HLO with a buffer is bound to // (1) an alloca if it is thread-local, or // (2) an internal pointer in temp_buffer_base according to its offset. 
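These hlo_to_ir_bindings hunks replace a BufferAllocation plus a separate TempBufferOffsets table with BufferAllocation::Slice, which carries its own offset, so the bound address is simply base pointer plus slice offset. A toy sketch of the idea; the struct below is a hypothetical stand-in, not the real BufferAssignment API:

```cpp
// Hedged sketch: a slice names an allocation plus an (offset, size) range
// inside it, so callers no longer consult a separate offset table.
#include <cassert>
#include <cstdint>
#include <vector>

struct Slice {
  int allocation_index;
  int64_t offset;
  int64_t size;
};

int main() {
  std::vector<char> temp_buffer(1024);  // stands in for temp_buffer_base_
  Slice slice{/*allocation_index=*/0, /*offset=*/128, /*size=*/64};

  // The CreateInBoundsGEP in the real code computes base + slice.offset().
  char* bound_address = temp_buffer.data() + slice.offset;
  assert(bound_address - temp_buffer.data() == 128);
  return 0;
}
```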
- const BufferAllocation* allocation = - buffer_assignment_->GetUniqueTopLevelAllocation(non_io_hlo) + const BufferAllocation::Slice slice = + buffer_assignment_->GetUniqueTopLevelSlice(non_io_hlo) .ConsumeValueOrDie(); - if (allocation->is_thread_local()) { + if (slice.allocation()->is_thread_local()) { llvm::Type* pointee_type = llvm_ir::ShapeToIrType(non_io_hlo->shape(), ir_builder_); BindHloToIrValue(*non_io_hlo, ir_builder_->CreateAlloca(pointee_type)); } else { - int64 offset = temp_buffer_offsets_->GetOffset(allocation->index()); + const int64 offset = slice.offset(); CHECK_NE(nullptr, temp_buffer_base_); BindHloToIrValue(*non_io_hlo, ir_builder_->CreateInBoundsGEP( diff --git a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h index 1e3b2684239..5be2150801f 100644 --- a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h +++ b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h @@ -22,7 +22,6 @@ limitations under the License. #include "external/llvm/include/llvm/IR/Value.h" #include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/service/buffer_assignment.h" -#include "tensorflow/compiler/xla/service/gpu/temp_buffer_offsets.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h" #include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h" @@ -37,10 +36,8 @@ class HloToIrBindings { public: HloToIrBindings(const HloModule& module, const BufferAssignment* buffer_assignment, - const TempBufferOffsets* temp_buffer_offsets, llvm::IRBuilder<>* ir_builder, bool is_nested) : buffer_assignment_(buffer_assignment), - temp_buffer_offsets_(temp_buffer_offsets), is_nested_(is_nested), ir_builder_(ir_builder), alias_analysis_(module, *buffer_assignment_, @@ -88,8 +85,6 @@ class HloToIrBindings { const BufferAssignment* buffer_assignment_; - const TempBufferOffsets* temp_buffer_offsets_; - const bool is_nested_; llvm::IRBuilder<>* ir_builder_; diff --git a/tensorflow/compiler/xla/service/gpu/infeed_manager.cc b/tensorflow/compiler/xla/service/gpu/infeed_manager.cc new file mode 100644 index 00000000000..120a3f7fba2 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/infeed_manager.cc @@ -0,0 +1,92 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/ptr_util.h" +#include "tensorflow/compiler/xla/service/gpu/infeed_manager.h" +#include "tensorflow/core/platform/logging.h" + +namespace se = ::perftools::gputools; + +namespace xla { +namespace gpu { + +InfeedManager::InfeedManager() + : current_buffer_(nullptr), + host_to_device_executor_(nullptr) {} + +void InfeedManager::Reset() { + tensorflow::mutex_lock l(mu_); + CHECK(!current_buffer_); + for (auto buffer : enqueued_buffer_) { + buffer->Done(); + } + enqueued_buffer_.clear(); +} + +void InfeedManager::EnqueueBuffer(InfeedBuffer* buffer) { + tensorflow::mutex_lock l(mu_); + bool was_empty = enqueued_buffer_.empty(); + enqueued_buffer_.push_back(buffer); + if (was_empty) { + // This has the potential to suffer from the notified thread + // immediately trying and failing to acquire mu_, but seems + // preferable to the alternative of notifying outside the lock + // on every enqueue. + cv_.notify_one(); + } +} + +InfeedBuffer* InfeedManager::BlockingDequeueBuffer() { + tensorflow::mutex_lock l(mu_); + while (enqueued_buffer_.empty()) { + cv_.wait(l); + } + CHECK(!current_buffer_); + current_buffer_ = enqueued_buffer_.front(); + enqueued_buffer_.pop_front(); + return current_buffer_; +} + +void InfeedManager::ReleaseCurrentBuffer(se::DeviceMemoryBase* device_memory) { + tensorflow::mutex_lock l(mu_); + CHECK(current_buffer_); + CHECK(device_memory->IsSameAs(*current_buffer_->device_memory())); + current_buffer_->Done(); + current_buffer_ = nullptr; +} + +se::Stream* InfeedManager::GetStream(se::StreamExecutor* executor) { + if (host_to_device_executor_ == nullptr) { + host_to_device_executor_ = executor; + host_to_device_stream_ = MakeUnique(executor); + host_to_device_stream_->Init(); + } + + if (executor != host_to_device_executor_) { + // The requested executor must be the same as the one for which + // the stream is cached. + return nullptr; + } + + return host_to_device_stream_.get(); +} + +InfeedManager* GetOrCreateInfeedManager() { + static InfeedManager* manager = new InfeedManager; + return manager; +} + +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/infeed_manager.h b/tensorflow/compiler/xla/service/gpu/infeed_manager.h new file mode 100644 index 00000000000..50d0ce340f3 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/infeed_manager.h @@ -0,0 +1,134 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This header declares classes for the infeed manager and the infeed +// buffer that are used by the GPU runtime to transfer buffers into an +// executing GPU computation, e.g., to feed data into a while loop. 
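The InfeedManager implementation above is a classic mutex/condition-variable producer-consumer queue that notifies only on the empty-to-non-empty transition. A minimal standalone sketch of that discipline, with std:: primitives standing in for tensorflow::mutex and tensorflow::condition_variable:

```cpp
// Minimal sketch: enqueue under a mutex, notify only when the queue goes
// from empty to non-empty, and block in dequeue until an element arrives.
#include <condition_variable>
#include <deque>
#include <mutex>
#include <utility>

template <typename T>
class BlockingQueue {
 public:
  void Enqueue(T value) {
    std::unique_lock<std::mutex> lock(mu_);
    const bool was_empty = queue_.empty();
    queue_.push_back(std::move(value));
    if (was_empty) {
      // Waiters only sleep while the queue is empty, so notifying on every
      // enqueue would be wasted work.
      cv_.notify_one();
    }
  }

  T BlockingDequeue() {
    std::unique_lock<std::mutex> lock(mu_);
    while (queue_.empty()) {
      cv_.wait(lock);
    }
    T value = std::move(queue_.front());
    queue_.pop_front();
    return value;
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  std::deque<T> queue_;
};

int main() {
  BlockingQueue<int> queue;
  queue.Enqueue(42);
  return queue.BlockingDequeue() == 42 ? 0 : 1;
}
```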
+ +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_INFEED_MANAGER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_INFEED_MANAGER_H_ + +#include + +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/stream_executor_no_cuda.h" + +namespace xla { +namespace gpu { + +// TODO(b/30467474) Once GPU infeed implementation settles, consider +// folding back the cpu and gpu infeed implementations into a generic +// one if possible. +// +// Current limitations: +// * Does not handle multiple devices/replicas. +// +// * Buffer space on GPU is allocated on every infeed enqueue request, +// and it does not handle the case when it runs out of +// memory. Potential solution is to pre-allocate a fixed amount of +// memory and block when that memory is full. + +// Defines an infeed buffer that is passed to the runtime by +// the client. The client manages the memory of the buffer. +class InfeedBuffer { + public: + InfeedBuffer(perftools::gputools::StreamExecutor* executor, int64 length) + : executor_(executor), length_(length) { + device_memory_ = executor_->AllocateArray(length); + CHECK(!device_memory_.is_null()); + } + + ~InfeedBuffer() { executor_->Deallocate(&device_memory_); } + + int64 length() const { return length_; } + + // Callback to signal that this buffer is consumed. This helps the + // client to manage memory for the infeed buffers. + void Done() { delete this; } + + perftools::gputools::DeviceMemoryBase* device_memory() { + return &device_memory_; + } + + private: + perftools::gputools::StreamExecutor* executor_; // Not owned. + const int64 length_; + perftools::gputools::DeviceMemoryBase device_memory_; +}; + +// Client-side class used to enqueue infeed buffers. +class InfeedManager { + public: + InfeedManager(); + + // Calls the completion callback for any enqueued buffers that have + // not been dequeued by the runtime, and empties the infeed + // queue. Reset may not be called while a runtime computation is + // processing a dequeued buffer. The only safe way to ensure this + // condition is to call Reset when no computation is taking place. + void Reset(); + + // Adds buffer to the infeed queue. buffer->Done will be called when + // the buffer will no longer be accessed by the InfeedManager, + // either as a result of a call to Reset or because the runtime has + // dequeued and used the buffer. + void EnqueueBuffer(InfeedBuffer* buffer); + + // Blocks until the infeed queue is non-empty, then returns the + // buffer at the head of the queue. Sets the current buffer to be + // the returned buffer. It is an error to call BlockingDequeueBuffer + // if there is an unreleased current buffer, i.e., + // ReleaseCurrentBuffer must be called between calls to + // BlockingDequeueBuffer. + InfeedBuffer* BlockingDequeueBuffer(); + + // Releases the current buffer, which is the last buffer returned by + // BlockingDequeueBuffer and not yet released. device_memory must + // match that of the current buffer. + void ReleaseCurrentBuffer( + perftools::gputools::DeviceMemoryBase* device_memory); + + // Returns a cached stream associated with an executor. Allocates a + // new stream on the first invocation. On subsequent invocations, if + // the cached executor is not the same as the requested executor, + // returns null. 
+ perftools::gputools::Stream* GetStream( + perftools::gputools::StreamExecutor* executor); + + private: + tensorflow::mutex mu_; + // Condition variable that is signaled every time a buffer is + // enqueued to an empty queue. + tensorflow::condition_variable cv_; + // InfeedBuffer* queue contents are not owned, but buffer->Done must + // be called when the buffer is no longer needed by the runtime. + std::deque enqueued_buffer_; + // If non-NULL, the buffer that is currently being processed by the + // runtime. Not owned. + InfeedBuffer* current_buffer_; + // Cached host to device stream for queuing infeed data. + std::unique_ptr host_to_device_stream_; + // Executor that the host_to_device_stream belongs to. Not owned. + perftools::gputools::StreamExecutor* host_to_device_executor_; +}; + +// Singleton creator-or-accessor: Returns the GPU infeed manager. +InfeedManager* GetOrCreateInfeedManager(); + +} // namespace gpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_INFEED_MANAGER_H_ diff --git a/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc b/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc new file mode 100644 index 00000000000..6f144c7273e --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc @@ -0,0 +1,53 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/infeed_manager.h" +#include "tensorflow/compiler/xla/service/gpu/infeed_thunk.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/platform/stream_executor_no_cuda.h" + +namespace xla { +namespace gpu { + +InfeedThunk::InfeedThunk(const BufferAllocation::Slice& destination_buffer, + uint64 mem_size, const HloInstruction* hlo_instruction) + : Thunk(Kind::kInfeed, hlo_instruction), + destination_buffer_(destination_buffer), + mem_size_(mem_size) {} + +tensorflow::Status InfeedThunk::ExecuteOnStream( + const BufferAllocations& buffer_allocations, + perftools::gputools::Stream* stream) { + VLOG(2) << "Infeeding to GPU "; + perftools::gputools::DeviceMemoryBase destination_data = + buffer_allocations.GetDeviceAddress(destination_buffer_); + + InfeedManager* infeed_manager = GetOrCreateInfeedManager(); + InfeedBuffer* buffer = infeed_manager->BlockingDequeueBuffer(); + CHECK_EQ(buffer->length(), mem_size_); + stream->ThenMemcpy(&destination_data, *(buffer->device_memory()), + buffer->length()); + if (!stream->BlockHostUntilDone()) { + return InternalError("Failed to complete data transfer on stream %p", + stream); + } + // Since Infeeds are totally ordered, no other infeed should sneak + // in and we should be able to release the same buffer we dequeued. 
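GetOrCreateInfeedManager, declared above as a "singleton creator-or-accessor", is a create-on-first-use singleton that is intentionally never destroyed. A compact sketch of the pattern with a toy Manager class; since C++11 the local-static initialization is thread-safe, and leaking the instance sidesteps destruction-order problems at process shutdown:

```cpp
// Sketch of the "create on first use, never destroy" accessor.
#include <cstdio>

class Manager {
 public:
  int Sequence() { return ++count_; }

 private:
  int count_ = 0;
};

Manager* GetOrCreateManager() {
  static Manager* manager = new Manager;  // initialized once, never deleted
  return manager;
}

int main() {
  std::printf("%d\n", GetOrCreateManager()->Sequence());  // 1
  std::printf("%d\n", GetOrCreateManager()->Sequence());  // 2: same instance
  return 0;
}
```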
+ infeed_manager->ReleaseCurrentBuffer(buffer->device_memory()); + return tensorflow::Status::OK(); +} + +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/infeed_thunk.h b/tensorflow/compiler/xla/service/gpu/infeed_thunk.h new file mode 100644 index 00000000000..0a808186c21 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/infeed_thunk.h @@ -0,0 +1,56 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_INFEED_THUNK_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_INFEED_THUNK_H_ + +#include "tensorflow/compiler/xla/service/buffer_assignment.h" +#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h" +#include "tensorflow/compiler/xla/service/gpu/thunk.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/core/platform/stream_executor_no_cuda.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { +namespace gpu { + +// A thunk that infeeds data. Data must be already resident on the +// device. This thunk performs an intra-device copy from that location +// to the buffer allocated for the infeed op. +class InfeedThunk : public Thunk { + public: + // Constructs a InfeedThunk that copies data from the on-device + // infeed queue to the device buffer + // `destination_buffer`. `mem_size` is the size of the data in + // bytes. + InfeedThunk(const BufferAllocation::Slice& destination_buffer, + uint64 mem_size, const HloInstruction* hlo_instruction); + + InfeedThunk(const InfeedThunk&) = delete; + InfeedThunk& operator=(const InfeedThunk&) = delete; + + tensorflow::Status ExecuteOnStream( + const BufferAllocations& buffer_allocations, + perftools::gputools::Stream* stream) override; + + private: + const BufferAllocation::Slice destination_buffer_; + const uint64 mem_size_; +}; + +} // namespace gpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_INFEED_THUNK_H_ diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc index 91fd7ae77a9..a36dcbbd2fa 100644 --- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc @@ -46,6 +46,16 @@ bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer, int64 operand_index) { HloInstruction* producer = consumer->mutable_operand(operand_index); + // Output fusion is not currently supported on GPUs. + if (producer->opcode() == HloOpcode::kFusion) { + return false; + } + + // RNG operations are not currently parallel-friendly on GPU. + if (producer->opcode() == HloOpcode::kRng) { + return false; + } + // Do not fuse to-vector reduction into other consumers. They should be // unfused or the root of a kInput fusion. 
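The new ShouldFuse guards above reject kFusion producers (output fusion is not supported yet) and kRng producers (RNG parallelizes poorly on GPU). The early-exit shape of such a predicate, sketched with a stand-in opcode enum rather than the real HloOpcode:

```cpp
// Illustrative early-exit structure of a ShouldFuse-style predicate: each
// guard names one reason fusion is rejected.
#include <cassert>

enum class Opcode { kAdd, kFusion, kRng };

bool ShouldFuseProducer(Opcode producer) {
  // Fusing a fusion node into a consumer would be output fusion,
  // which this backend does not support yet.
  if (producer == Opcode::kFusion) return false;
  // RNG ops are rejected because they are not parallel-friendly on GPU.
  if (producer == Opcode::kRng) return false;
  return true;
}

int main() {
  assert(ShouldFuseProducer(Opcode::kAdd));
  assert(!ShouldFuseProducer(Opcode::kFusion));
  assert(!ShouldFuseProducer(Opcode::kRng));
  return 0;
}
```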
if (IsReductionToVector(*producer)) { diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.h b/tensorflow/compiler/xla/service/gpu/instruction_fusion.h index 21f3b542a27..bb2990e6dfc 100644 --- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.h +++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion.h @@ -25,7 +25,7 @@ namespace gpu { class GpuInstructionFusion : public InstructionFusion { public: explicit GpuInstructionFusion(bool may_duplicate) - : InstructionFusion(may_duplicate) {} + : InstructionFusion(GpuInstructionFusion::IsExpensive, may_duplicate) {} bool ShouldFuse(HloInstruction* consumer, int64 operand_index) override; diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc index c58af04bad0..896f6ea8425 100644 --- a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc +++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc @@ -16,7 +16,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" -#include "tensorflow/core/platform/test.h" namespace xla { namespace gpu { @@ -32,7 +31,7 @@ TEST_F(InstructionFusionTest, PotentialBitcastReshapeOfDotUnfused) { auto reshape2 = builder.AddInstruction(HloInstruction::CreateReshape( ShapeUtil::MakeShape(S32, {1, 1, 1}), dot1)); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(reshape2, computation->root_instruction()); EXPECT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true) @@ -49,7 +48,7 @@ TEST_F(InstructionFusionTest, PotentialBitcastTransposeOfDotUnfused) { auto transpose2 = builder.AddInstruction(HloInstruction::CreateTranspose( ShapeUtil::MakeShape(S32, {1, 1}), dot1, {0, 1})); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(transpose2, computation->root_instruction()); EXPECT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true) @@ -89,7 +88,7 @@ TEST_F(InstructionFusionTest, PotentialBitcastTransposeOfConvolutionUnfused) { builder.AddInstruction( HloInstruction::CreateReshape(ShapeUtil::MakeShape(F32, {3}), transpose)); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); EXPECT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true) .Run(module.get()) @@ -108,7 +107,7 @@ TEST_F(InstructionFusionTest, GetTupleElementFused) { HloInstruction::CreateGetTupleElement(data_shape, param, 1)); builder.AddInstruction( HloInstruction::CreateBinary(data_shape, HloOpcode::kAdd, gte0, gte1)); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true) .Run(module.get()) @@ -124,3 +123,7 @@ TEST_F(InstructionFusionTest, GetTupleElementFused) { } // namespace gpu } // namespace xla + +int main(int argc, char** argv) { + return xla::ParseDebugOptionsFlagsAndRunTests(argc, argv); +} diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc index e141179ba17..a77d3d7065c 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc @@ -22,6 +22,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -59,6 +60,11 @@ bool AreValidGemmShapes(const Shape& lhs_shape, const Shape& rhs_shape, } // namespace bool ImplementedAsGemm(const HloInstruction& hlo) { + // We can only do this if the HLO is unnested. + if (hlo.parent() != hlo.GetModule()->entry_computation()) { + return false; + } + // For certain types of Dot, we can call pre-canned BLAS gemm. if (hlo.opcode() == HloOpcode::kDot) { const Shape& lhs_shape = hlo.operand(0)->shape(); @@ -85,15 +91,19 @@ bool ImplementedAsGemm(const HloInstruction& hlo) { } bool ImplementedAsDnnConvolution(const HloInstruction& hlo) { + // We can only do this if the HLO is unnested. + if (hlo.parent() != hlo.GetModule()->entry_computation()) { + return false; + } + // Forward convolution. if (hlo.opcode() == HloOpcode::kConvolution) { const ConvolutionDimensionNumbers& dnums = hlo.convolution_dimension_numbers(); - // Only 2D convolutions are implemented. - // TODO(b/32873825): add support for 3D convolutions using CuDNN. - if (dnums.spatial_dimensions_size() != 2) { + if (dnums.spatial_dimensions_size() > 3) { return false; } + // CuDNN does not accept zero-element arguments if (ShapeUtil::HasZeroElements(hlo.operand(0)->shape()) || ShapeUtil::HasZeroElements(hlo.operand(1)->shape())) { diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h index 4d3e9b10b2e..e8c68a6ef72 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h @@ -25,16 +25,7 @@ limitations under the License. namespace xla { namespace gpu { -const int64 kWarpSize = 32; - -// Precondition: "hlo" is an operand of a Dot instruction. -// -// Returns whether "hlo" is foldable to its user. -bool IsOperandFoldableToDot(const HloInstruction& hlo); - -// Returns true if GpuCompiler can fold any operands of "dot" into "dot" for -// better performance. -bool CanFoldOperandsIntoDot(const HloInstruction& dot); +constexpr int64 kWarpSize = 32; // Returns true if `hlo` will be implemented as a call to BLAS gemm. 
bool ImplementedAsGemm(const HloInstruction& hlo); diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc index cad2c903ff3..7d5b6ed5cfa 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc @@ -54,11 +54,12 @@ IrEmitter::IrEmitter(const HloModuleConfig& hlo_module_config, : ir_emitter_context_(ir_emitter_context), ir_builder_(ir_emitter_context->llvm_module()->getContext()), bindings_(ir_emitter_context->hlo_module(), - &ir_emitter_context->buffer_assignment(), - &ir_emitter_context->temp_buffer_offsets(), &ir_builder_, + &ir_emitter_context->buffer_assignment(), &ir_builder_, is_nested), hlo_module_config_(hlo_module_config) { - ir_builder_.setFastMathFlags(llvm_ir::GetFastMathFlags(hlo_module_config)); + ir_builder_.setFastMathFlags(llvm_ir::GetFastMathFlags( + /*fast_math_enabled=*/hlo_module_config.debug_options() + .xla_enable_fast_math())); } Status IrEmitter::DefaultAction(HloInstruction* hlo) { @@ -99,7 +100,7 @@ Status IrEmitter::HandleBitcast(HloInstruction* bitcast) { // sometimes, e.g., when it's operand is a constant or a bitcast of a // constant. if (bindings_.BoundToIrValue(*operand)) { - bindings_.BindHloToIrValue(*bitcast, bindings_.GetBasePointer(*operand)); + bindings_.BindHloToIrValue(*bitcast, GetBasePointer(*operand)); } return Status::OK(); } @@ -400,7 +401,7 @@ Status IrEmitter::HandleDot(HloInstruction* dot, llvm::Type* accum_type = target_array.GetElementLlvmType(); llvm::Value* accum_address = llvm_ir::EmitAllocaAtFunctionEntry( accum_type, // The pointee type of the alloca instruction. - "accum_address", // The name of the alloca instuction. + "accum_address", // The name of the alloca instruction. &ir_builder_); // Initialize the accumulator in the preheader to zero. @@ -431,12 +432,12 @@ Status IrEmitter::HandleDot(HloInstruction* dot, // and lhs indexes with the reduction dimensions removed. The terms from the // rhs index are the lower dimensions in the index so we add them first. 
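The `int` to `size_t` loop-index changes in ir_emitter.cc below silence signed/unsigned comparisons against `.size()`. In miniature:

```cpp
// Indexing with size_t matches the unsigned type returned by .size() and
// avoids -Wsign-compare warnings.
#include <cstddef>
#include <vector>

int main() {
  std::vector<int> index = {1, 2, 3};
  int sum = 0;
  // Before: for (int i = 0; i < index.size(); ++i)  -- signed vs. unsigned.
  for (size_t i = 0; i < index.size(); ++i) {
    sum += index[i];
  }
  return sum == 6 ? 0 : 1;
}
```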
llvm_ir::IrArray::Index target_index; - for (int dimension = 0; dimension < lhs_index.size(); ++dimension) { + for (size_t dimension = 0; dimension < lhs_index.size(); ++dimension) { if (dimension != lhs_reduction_dimension) { target_index.push_back(lhs_index[dimension]); } } - for (int dimension = 0; dimension < rhs_index.size(); ++dimension) { + for (size_t dimension = 0; dimension < rhs_index.size(); ++dimension) { if (dimension != rhs_reduction_dimension) { target_index.push_back(rhs_index[dimension]); } @@ -514,7 +515,7 @@ Status IrEmitter::HandleReduce(HloInstruction* reduce, HloInstruction* arg, llvm_ir::IrArray::Index input_index = reduced_dims_index; llvm_ir::IrArray::Index::const_iterator it = index.begin(); - for (int64 i = 0; i < input_index.size(); ++i) { + for (size_t i = 0; i < input_index.size(); ++i) { if (input_index[i] == nullptr) { input_index[i] = *it++; } @@ -550,14 +551,12 @@ Status IrEmitter::HandleFusion(HloInstruction* fusion) { return EmitTargetElementLoop(*fusion, fused_emitter.GetRootGenerator()); } -Status IrEmitter::HandleCall( - HloInstruction* call, tensorflow::gtl::ArraySlice operands, - HloComputation* computation) { +Status IrEmitter::HandleCall(HloInstruction* call) { std::vector operand_addresses; - for (HloInstruction* operand : operands) { + for (HloInstruction* operand : call->operands()) { operand_addresses.push_back(GetBasePointer(*operand)); } - return EmitCallToNestedComputation(*computation, operand_addresses, + return EmitCallToNestedComputation(*call->to_apply(), operand_addresses, GetBasePointer(*call)); } @@ -615,7 +614,7 @@ llvm_ir::IrArray::Index IrEmitter::EmitOperandArrayLoopNest( llvm_ir::IrArray::Index index = loop_nest->AddLoopsForShapeOnDimensions(shape, dimensions, name_suffix); // Verify every dimension except the reduction dimension was set in the index. - for (int dimension = 0; dimension < index.size(); ++dimension) { + for (size_t dimension = 0; dimension < index.size(); ++dimension) { if (dimension == reduction_dimension) { DCHECK_EQ(nullptr, index[dimension]); } else { diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.h b/tensorflow/compiler/xla/service/gpu/ir_emitter.h index c8ca5c41b08..607a366ac67 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.h @@ -101,9 +101,7 @@ class IrEmitter : public DfsHloVisitorWithDefault { HloInstruction* on_true, HloInstruction* on_false) override; Status HandleFusion(HloInstruction* fusion) override; - Status HandleCall(HloInstruction* call, - tensorflow::gtl::ArraySlice operands, - HloComputation* computation) override; + Status HandleCall(HloInstruction* call) override; Status HandleCustomCall(HloInstruction* custom_call, tensorflow::gtl::ArraySlice operands, tensorflow::StringPiece custom_call_target) override; @@ -127,12 +125,11 @@ class IrEmitter : public DfsHloVisitorWithDefault { llvm::Value* GetBasePointer(const HloInstruction& inst) const { return bindings_.GetBasePointer(inst); } - // A convenient helper for calling BufferAssignment::GetAllocationIndex. - BufferAllocation::Index GetAllocationIndex(const HloInstruction& hlo) const { + // A convenient helper for calling BufferAssignment::GetUniqueTopLevelSlice. 
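HandleCall above now takes only the instruction and derives the operands and the called computation from accessors, instead of receiving them as extra parameters. A toy sketch of that signature simplification; the types are hypothetical stand-ins, not the HLO classes:

```cpp
// After the change the handler has one parameter, so callers cannot pass a
// mismatched operand list or the wrong computation.
#include <cstdio>
#include <string>
#include <vector>

struct Computation {
  std::string name;
};

struct Instruction {
  std::vector<const Instruction*> operands_;
  Computation* to_apply_;
  const std::vector<const Instruction*>& operands() const { return operands_; }
  Computation* to_apply() const { return to_apply_; }
};

void HandleCall(const Instruction* call) {
  std::printf("call with %zu operands -> %s\n", call->operands().size(),
              call->to_apply()->name.c_str());
}

int main() {
  Computation comp{"nested"};
  Instruction a{}, b{};
  Instruction call{{&a, &b}, &comp};
  HandleCall(&call);
  return 0;
}
```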
+ BufferAllocation::Slice GetAllocationSlice(const HloInstruction& hlo) const { return ir_emitter_context_->buffer_assignment() - .GetUniqueTopLevelAllocation(&hlo) - .ConsumeValueOrDie() - ->index(); + .GetUniqueTopLevelSlice(&hlo) + .ConsumeValueOrDie(); } // Emit a singlethreaded or multithreaded loop that computes every element in @@ -250,8 +247,8 @@ class IrEmitterUnnested : public IrEmitter { Status HandleTuple( HloInstruction* tuple, tensorflow::gtl::ArraySlice operands) override; - Status HandleWhile(HloInstruction* xla_while, HloInstruction* init, - HloComputation* condition, HloComputation* body) override; + Status HandleWhile(HloInstruction* xla_while) override; + Status HandleInfeed(HloInstruction* xla_infeed) override; Status HandleRng(HloInstruction* random, RandomDistribution distribution) override; Status HandleSelect(HloInstruction* select, HloInstruction* pred, @@ -345,6 +342,10 @@ class IrEmitterUnnested : public IrEmitter { // Returns a CopyThunk that calls host-to-device cuMemcpy to implement `inst`. std::unique_ptr BuildCopyThunk(const HloInstruction* inst); + // Returns an InfeedThunk that performs device-to-device memcpy to implement + // `inst`. + std::unique_ptr BuildInfeedThunk(const HloInstruction* inst); + // Returns a WhileThunk that invokes thunk sequences for 'condition' and // 'body' sub-computations of while instruction 'hlo'. std::unique_ptr BuildWhileThunk(const HloInstruction* hlo); diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h index b204d9625c1..454c3f9ab2d 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h @@ -19,7 +19,6 @@ limitations under the License. #include "external/llvm/include/llvm/IR/Module.h" #include "tensorflow/compiler/xla/service/buffer_assignment.h" #include "tensorflow/compiler/xla/service/gpu/partition_assignment.h" -#include "tensorflow/compiler/xla/service/gpu/temp_buffer_offsets.h" #include "tensorflow/compiler/xla/service/name_uniquer.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" @@ -33,12 +32,10 @@ class IrEmitterContext { public: IrEmitterContext(const HloModule* hlo_module, const BufferAssignment* buffer_assignment, - const TempBufferOffsets* temp_buffer_offsets, const perftools::gputools::DeviceDescription* device_desc, llvm::Module* llvm_module) : hlo_module_(hlo_module), buffer_assignment_(buffer_assignment), - temp_buffer_offsets_(temp_buffer_offsets), device_desc_(device_desc), llvm_module_(llvm_module) {} // Disallow copy and assign. 
@@ -50,9 +47,6 @@ class IrEmitterContext { const BufferAssignment& buffer_assignment() const { return *buffer_assignment_; } - const TempBufferOffsets& temp_buffer_offsets() const { - return *temp_buffer_offsets_; - } const perftools::gputools::DeviceDescription& device_description() const { return *device_desc_; } @@ -62,7 +56,6 @@ class IrEmitterContext { private: const HloModule* hlo_module_; const BufferAssignment* buffer_assignment_; - const TempBufferOffsets* temp_buffer_offsets_; const perftools::gputools::DeviceDescription* device_desc_; llvm::Module* llvm_module_; NameUniquer name_uniquer_; diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index c107f9cbbe2..5fa2bfdd7e4 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -33,6 +33,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/for_thunk.h" #include "tensorflow/compiler/xla/service/gpu/gemm_thunk.h" #include "tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h" +#include "tensorflow/compiler/xla/service/gpu/infeed_thunk.h" #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" #include "tensorflow/compiler/xla/service/gpu/ir_emitter.h" #include "tensorflow/compiler/xla/service/gpu/ir_emitter_context.h" @@ -190,14 +191,15 @@ llvm::Function* IrEmitterUnnested::BuildKernelPrototype( // The last argument is a pointer to the temporary buffer memory block. // We know that it doesn't alias any of the escaped arguments (the inputs + // the result). We also know how many bytes can be dereferenced in it. - const llvm::Argument& temp_buffer = kernel->getArgumentList().back(); - int64 temp_buffer_size = - ir_emitter_context_->temp_buffer_offsets().TotalSizeInBytes(); + const llvm::Argument& temp_buffer = *std::prev(kernel->arg_end()); int64 temp_buffer_arg_no = temp_buffer.getArgNo(); - if (temp_buffer_size > 0) { - kernel->addDereferenceableAttr(temp_buffer_arg_no + 1, temp_buffer_size); + int64 temp_allocation_total_size = + ir_emitter_context_->buffer_assignment().temp_allocation_total_size(); + if (temp_allocation_total_size != 0) { + kernel->addDereferenceableAttr(temp_buffer_arg_no + 1, + temp_allocation_total_size); } - kernel->setDoesNotAlias(temp_buffer_arg_no + 1); + kernel->addAttribute(temp_buffer_arg_no + 1, llvm::Attribute::NoAlias); // Add the declaration of this kernel to llvm.nvvm.annotations so that NVPTX // treats it as a CUDA kernel. @@ -249,6 +251,46 @@ Status IrEmitterUnnested::HandleConvolution(HloInstruction* convolution, rhs_instruction, window); } +namespace { + +// Returns the first non-GetTupleElement ancestor instruction of 'hlo'. +// If the first non-GTE ancestor is tuple-shaped, populates 'index' with the +// (possibly nested) tuple indices used on the path from ancestor to 'hlo'. +const HloInstruction* LatestNonGteAncestorAndIndex(const HloInstruction* hlo, + ShapeIndex* index) { + if (hlo->opcode() == HloOpcode::kGetTupleElement) { + const auto* operand = LatestNonGteAncestorAndIndex(hlo->operand(0), index); + index->push_back(hlo->tuple_index()); + return operand; + } + return hlo; +} + +// Checks if we can emit code for DynamicUpdateSlice to update data in-place. +// Returns true if operand 0 of DynamicUpdateSlice and its output buffer +// share the same buffer allocation. +// Returns false otherwise. 
+bool CanUpdateDynamicSliceInPlace(const BufferAssignment& assignment, + HloInstruction* fusion) { + CHECK_EQ(HloOpcode::kFusion, fusion->opcode()); + HloInstruction* fused_root = fusion->fused_expression_root(); + if (fused_root->opcode() != HloOpcode::kDynamicUpdateSlice) { + return false; + } + // Walk DynamicUpdateSlice operand(0) to fused parameter and get its + // associated operand. See if it shares an allocation with this operand. + ShapeIndex index; + auto* fusion_operand = + LatestNonGteAncestorAndIndex(fused_root->operand(0), &index); + if (fusion_operand->opcode() != HloOpcode::kParameter) { + return false; + } + auto* operand = fusion->operand(fusion_operand->parameter_number()); + return assignment.SharesSliceAtIndex(fusion, {}, operand, index); +} + +} // namespace + Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) { HloInstruction* root = fusion->fused_expression_root(); // HandleFusion specializes reduction from a multi-dimensional array to a 1D @@ -277,20 +319,19 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) { TF_RETURN_IF_ERROR(root->Accept(&fused_emitter)); Shape input_shape = root->operand(0)->shape(); - // EmitRedutionToVector requires the input shape to have a layout, but + // EmitReductionToVector requires the input shape to have a layout, but // fused instructions don't have one. So we determine its layout from // the fusion's operands. The choice of the layout only affects // performance but not correctness. auto choose_input_layout = []( tensorflow::gtl::ArraySlice operands, - Shape* input_shape) { + Shape* input_shape) -> Status { // Prefer the layout of an operand whose shape is compatible with // input_shape. for (const HloInstruction* operand : operands) { if (ShapeUtil::Compatible(*input_shape, operand->shape())) { - LayoutUtil::CopyLayoutBetweenShapes(operand->shape(), - input_shape); - return; + return LayoutUtil::CopyLayoutBetweenShapes(operand->shape(), + input_shape); } } // If no operand has a compatible shape, prefer an operand that has @@ -301,24 +342,114 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) { // Do not use CopyLayoutBetweenShapes because input_shape and // operand->shape() may be incompatible. *input_shape->mutable_layout() = operand->shape().layout(); - return; + return Status::OK(); } } // When all the above fails, which is rare, set the default layout. LayoutUtil::SetToDefaultLayout(input_shape); + return Status::OK(); }; - choose_input_layout(fusion->operands(), &input_shape); + TF_RETURN_IF_ERROR( + choose_input_layout(fusion->operands(), &input_shape)); return EmitReductionToVector( root, input_shape, fused_emitter.GetGenerator(root->operand(0)), fused_emitter.GetGenerator(root->operand(1)), root->dimensions(), root->to_apply()); - break; } default: LOG(FATAL) << "Bad opcode for input fusion: " << fusion->fused_expression_root()->opcode(); } + } else if (HloInstruction::FusionKind::kLoop == fusion->fusion_kind() && + root->opcode() == HloOpcode::kDynamicUpdateSlice && + CanUpdateDynamicSliceInPlace( + ir_emitter_context_->buffer_assignment(), fusion)) { + // Loop fusion instruction with DynamicUpdateSlice as fused root. + // DynamicUpdateSlice's operand(0) and 'fusion' output share the same + // BufferAllocation::Slice, so it is safe to emit code to update the slice + // 'in-place'. This avoids copying data outside of the slice update region. + + // Set up kernel thunk and fused ir emitter. 
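The essence of CanUpdateDynamicSliceInPlace above is an aliasing test: the fusion's output slice must be the very slice assigned to the operand being updated, otherwise writing in place would clobber live data. Reduced to toy types, with plain ints standing in for BufferAllocation::Slice:

```cpp
// Sketch of the in-place condition checked via SharesSliceAtIndex.
#include <cassert>

struct Hlo {
  int slice;  // buffer slice assigned to this instruction's value
};

bool CanUpdateInPlace(const Hlo& fusion, const Hlo& operand) {
  // The fusion output must alias the operand that DynamicUpdateSlice updates.
  return fusion.slice == operand.slice;
}

int main() {
  Hlo fusion{7};
  Hlo aliased{7};
  Hlo fresh{9};
  assert(CanUpdateInPlace(fusion, aliased));
  assert(!CanUpdateInPlace(fusion, fresh));
  return 0;
}
```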
+ thunk_sequence_->emplace_back(BuildKernelThunk(fusion)); + std::vector parameter_arrays; + for (HloInstruction* operand : fusion->operands()) { + parameter_arrays.push_back(GetIrArray(*operand)); + } + GpuElementalIrEmitter elemental_emitter(hlo_module_config_, + ir_emitter_context_->llvm_module(), + &ir_builder_, GetNestedComputer()); + FusedIrEmitter fused_emitter(parameter_arrays, &elemental_emitter); + TF_RETURN_IF_ERROR(root->Accept(&fused_emitter)); + + // Recursively lookup 'fusion_operand' for DynamicUpdateSlice operand 0. + auto* fusion_operand = LatestNonGteAncestor(root->operand(0)); + CHECK_EQ(HloOpcode::kParameter, fusion_operand->opcode()); + + // Operand(0) the input array which shares an allocation with the output. + const auto* input = root->operand(0); + llvm::Value* input_base_ptr = fused_emitter.GetIrValueForGTE(input); + // Operand(1) 'update' is slice with which to update input at operand(0). + const auto* update = root->operand(1); + Shape update_shape = update->shape(); + TF_RETURN_IF_ERROR( + LayoutUtil::CopyLayoutBetweenShapes(fusion->shape(), &update_shape)); + // Operand(2) the dynamic slice indices at which to write 'update'. + const auto* start_indices = root->operand(2); + + // Create element generators for 'update' and 'start_indices'. + llvm_ir::ElementGenerator element_generator = + fused_emitter.GetGenerator(update); + llvm_ir::ElementGenerator start_generator = + fused_emitter.GetGenerator(start_indices); + + // Create loop body emitter which emits code to do the following: + // *) Read dynamic slice start indices into 'start_index'. + // *) Map requested 'index' and slice 'start_index' to input/output shape + // as 'output_index'. + // *) Reads value from 'update' element generator. + // *) Writes value to input/output array at 'output_index'. + auto loop_body_emitter = + [=](const llvm_ir::IrArray::Index& index) -> Status { + // Emit IR to read dynamic start indices from hlo->operand(2). + const int64 rank = ShapeUtil::Rank(input->shape()); + llvm_ir::IrArray::Index start_index(rank); + for (int64 i = 0; i < rank; ++i) { + llvm_ir::IrArray::Index dim_index({ir_builder_.getInt64(i)}); + TF_ASSIGN_OR_RETURN(start_index[i], start_generator(dim_index)); + } + + // Calculate 'output_index' at which to write value from update. + llvm_ir::IrArray::Index output_index(rank); + for (int64 i = 0; i < rank; ++i) { + // Emit IR which computes: + // output_index = (start_index + index) % dim_size + llvm::Value* dim_size = llvm::ConstantInt::get( + index[i]->getType(), input->shape().dimensions(i)); + llvm::Value* start_index0 = ir_builder_.CreateZExtOrBitCast( + start_index[i], index[i]->getType()); + output_index[i] = ir_builder_.CreateURem( + ir_builder_.CreateAdd(start_index0, index[i]), dim_size); + } + + // Read value from 'update'. + TF_ASSIGN_OR_RETURN(llvm::Value * input_value, element_generator(index)); + // Write value to output array. + llvm_ir::IrArray(input_base_ptr, input->shape()) + .EmitWriteArrayElement(output_index, input_value, &ir_builder_); + return Status::OK(); + }; + + // Create loop which iterates over 'update' shape. 
+ LaunchDimensions launch_dimensions = CalculateLaunchDimensions( + update_shape, ir_emitter_context_->device_description()); + CHECK(Thunk::Kind::kKernel == LastThunk()->kind()); + UpdateLaunchDimensions(launch_dimensions, + static_cast(LastThunk()), + ir_emitter_context_->llvm_module()); + return ParallelLoopEmitter(loop_body_emitter, update_shape, + launch_dimensions, &ir_builder_) + .EmitLoop(); } if (ImplementedAsGemm(*fusion)) { thunk_sequence_->emplace_back(BuildGemmThunk(fusion)); @@ -1195,12 +1326,12 @@ Status IrEmitterUnnested::HandleTuple( // buffer -- their contents are stored in code. In that case, we fall back // to emitting kernels which have access to their buffer addresses in code. if (all_tuple_elements_have_buffer) { - std::vector tuple_element_buffers; + std::vector tuple_element_buffers; for (const HloInstruction* tuple_element : operands) { - tuple_element_buffers.push_back(GetAllocationIndex(*tuple_element)); + tuple_element_buffers.push_back(GetAllocationSlice(*tuple_element)); } thunk_sequence_->emplace_back(MakeUnique( - tuple_element_buffers, GetAllocationIndex(*tuple), tuple)); + tuple_element_buffers, GetAllocationSlice(*tuple), tuple)); return Status::OK(); } // If `inst` is a nested thunk that can be disassembled from the result tuple, @@ -1412,10 +1543,8 @@ Status IrEmitterUnnested::HandleSelectAndScatter( .EmitLoop(); } -Status IrEmitterUnnested::HandleWhile(HloInstruction* xla_while, - HloInstruction* init, - HloComputation* condition, - HloComputation* body) { +Status IrEmitterUnnested::HandleWhile(HloInstruction* xla_while) { + HloComputation* condition = xla_while->while_condition(); TF_RET_CHECK(ShapeUtil::IsScalar(condition->root_instruction()->shape()) && condition->root_instruction()->shape().element_type() == PRED) << "While condition computation must return bool"; @@ -1451,6 +1580,11 @@ Status IrEmitterUnnested::HandleSelect(HloInstruction* select, return IrEmitter::HandleSelect(select, pred, on_true, on_false); } +Status IrEmitterUnnested::HandleInfeed(HloInstruction* infeed) { + thunk_sequence_->emplace_back(BuildInfeedThunk(infeed)); + return Status::OK(); +} + llvm::Function* IrEmitterUnnested::EmitBasePointersForHloAndItsOperands( const HloInstruction& hlo, std::vector* io_hlos) { const BufferAssignment& buffer_assignment = @@ -1463,8 +1597,9 @@ llvm::Function* IrEmitterUnnested::EmitBasePointersForHloAndItsOperands( for (const HloInstruction* operand : hlo.operands()) { const HloInstruction* to_lookup = LatestNonGteAncestor(operand); if (buffer_assignment.HasTopLevelAllocation(to_lookup) && - buffer_assignment.GetUniqueTopLevelAllocation(to_lookup) + buffer_assignment.GetUniqueTopLevelSlice(to_lookup) .ConsumeValueOrDie() + .allocation() ->IsInputOrOutput()) { io_hlos->push_back(operand); } else { @@ -1474,8 +1609,9 @@ llvm::Function* IrEmitterUnnested::EmitBasePointersForHloAndItsOperands( CHECK_NE(HloOpcode::kGetTupleElement, hlo.opcode()); if (buffer_assignment.HasTopLevelAllocation(&hlo) && - buffer_assignment.GetUniqueTopLevelAllocation(&hlo) + buffer_assignment.GetUniqueTopLevelSlice(&hlo) .ConsumeValueOrDie() + .allocation() ->IsInputOrOutput()) { io_hlos->push_back(&hlo); } else { @@ -1496,9 +1632,10 @@ std::unique_ptr IrEmitterUnnested::BuildKernelThunk( EmitBasePointersForHloAndItsOperands(*inst, &io_hlos); // Compute the input buffer indices. 
- std::vector io_buffers; + std::vector io_buffers; + io_buffers.reserve(io_hlos.size()); for (const HloInstruction* io_hlo : io_hlos) { - io_buffers.push_back(GetAllocationIndex(*LatestNonGteAncestor(io_hlo))); + io_buffers.push_back(GetAllocationSlice(*LatestNonGteAncestor(io_hlo))); } // Create a KernelThunk that launches the kernel that implements "inst". @@ -1512,10 +1649,21 @@ std::unique_ptr IrEmitterUnnested::BuildCopyThunk( CHECK_EQ(HloOpcode::kConstant, operand->opcode()); return MakeUnique( /*source_address=*/LiteralUtil::InternalData(operand->literal()), - /*destination_buffer=*/GetAllocationIndex(*inst), - /*mem_size=*/llvm_ir::ByteSizeOf( - operand->shape(), - ir_emitter_context_->llvm_module()->getDataLayout()), + /*destination_buffer=*/GetAllocationSlice(*inst), + /*mem_size=*/ + llvm_ir::ByteSizeOf(operand->shape(), + ir_emitter_context_->llvm_module()->getDataLayout()), + inst); +} + +std::unique_ptr IrEmitterUnnested::BuildInfeedThunk( + const HloInstruction* inst) { + CHECK_EQ(HloOpcode::kInfeed, inst->opcode()); + return MakeUnique( + /*destination_buffer=*/GetAllocationSlice(*inst), + /*mem_size=*/ + llvm_ir::ByteSizeOf(inst->shape(), + ir_emitter_context_->llvm_module()->getDataLayout()), inst); } @@ -1525,9 +1673,9 @@ std::unique_ptr IrEmitterUnnested::BuildGemmThunk( const HloInstruction* lhs = inst->operand(0); const HloInstruction* rhs = inst->operand(1); return MakeUnique( - GetAllocationIndex(*lhs), // The buffer assigned to LHS. - GetAllocationIndex(*rhs), // The buffer assigned to RHS. - GetAllocationIndex(*inst), // The output buffer. + GetAllocationSlice(*lhs), // The buffer assigned to LHS. + GetAllocationSlice(*rhs), // The buffer assigned to RHS. + GetAllocationSlice(*inst), // The output buffer. lhs->shape(), // The shape of LHS. rhs->shape(), // The shape of RHS. inst->shape(), // The shape of the output. @@ -1549,9 +1697,9 @@ std::unique_ptr IrEmitterUnnested::BuildGemmThunk( inst->operand(rhs_parameter->parameter_number()); return MakeUnique( - GetAllocationIndex(*lhs), // The buffer assigned to LHS. - GetAllocationIndex(*rhs), // The buffer assigned to RHS. - GetAllocationIndex(*inst), // The output buffer. + GetAllocationSlice(*lhs), // The buffer assigned to LHS. + GetAllocationSlice(*rhs), // The buffer assigned to RHS. + GetAllocationSlice(*inst), // The output buffer. lhs->shape(), // The shape of LHS. rhs->shape(), // The shape of RHS. inst->shape(), // The shape of the output. @@ -1571,9 +1719,9 @@ std::unique_ptr IrEmitterUnnested::BuildConvolutionThunk( // Forward covolution. 
return MakeUnique( ConvolutionThunk::ConvolutionKind::kForward, - /*input_buffer=*/GetAllocationIndex(*lhs), - /*filter_buffer=*/GetAllocationIndex(*rhs), - /*output_buffer=*/GetAllocationIndex(*inst), + /*input_buffer=*/GetAllocationSlice(*lhs), + /*filter_buffer=*/GetAllocationSlice(*rhs), + /*output_buffer=*/GetAllocationSlice(*inst), /*input_shape=*/lhs->shape(), /*filter_shape=*/rhs->shape(), /*output_shape=*/inst->shape(), inst->window(), @@ -1587,9 +1735,9 @@ std::unique_ptr IrEmitterUnnested::BuildConvolutionThunk( case HloInstruction::FusionKind::kConvBackwardFilter: return MakeUnique( ConvolutionThunk::ConvolutionKind::kBackwardFilter, - /*input_buffer=*/GetAllocationIndex(*lhs), - /*filter_buffer=*/GetAllocationIndex(*inst), - /*output_buffer=*/GetAllocationIndex(*rhs), + /*input_buffer=*/GetAllocationSlice(*lhs), + /*filter_buffer=*/GetAllocationSlice(*inst), + /*output_buffer=*/GetAllocationSlice(*rhs), /*input_shape=*/lhs->shape(), /*filter_shape=*/inst->shape(), /*output_shape=*/rhs->shape(), inst->window(), @@ -1597,9 +1745,9 @@ std::unique_ptr IrEmitterUnnested::BuildConvolutionThunk( case HloInstruction::FusionKind::kConvBackwardInput: return MakeUnique( ConvolutionThunk::ConvolutionKind::kBackwardInput, - /*input_buffer=*/GetAllocationIndex(*inst), - /*filter_buffer=*/GetAllocationIndex(*rhs), - /*output_buffer=*/GetAllocationIndex(*lhs), + /*input_buffer=*/GetAllocationSlice(*inst), + /*filter_buffer=*/GetAllocationSlice(*rhs), + /*output_buffer=*/GetAllocationSlice(*lhs), /*input_shape=*/inst->shape(), /*filter_shape=*/rhs->shape(), /*output_shape=*/lhs->shape(), inst->window(), @@ -1651,26 +1799,23 @@ namespace { Status CheckWhileBuffersShareAllocation( const HloInstruction* xla_while, const BufferAssignment& buffer_assignment) { - return ShapeUtil::ForEachSubshape( + return ShapeUtil::ForEachSubshapeWithStatus( xla_while->shape(), [&buffer_assignment, &xla_while](const Shape& /*subshape*/, const ShapeIndex& index) -> Status { auto check = [&buffer_assignment](const HloInstruction* a, const HloInstruction* b, const ShapeIndex& index) -> Status { - BufferAllocation::Index index_a = - buffer_assignment.GetUniqueAllocation(a, index) - .ConsumeValueOrDie() - ->index(); - BufferAllocation::Index index_b = - buffer_assignment.GetUniqueAllocation(b, index) - .ConsumeValueOrDie() - ->index(); - if (index_a != index_b) { + const BufferAllocation::Slice slice_a = + buffer_assignment.GetUniqueSlice(a, index).ConsumeValueOrDie(); + const BufferAllocation::Slice slice_b = + buffer_assignment.GetUniqueSlice(b, index).ConsumeValueOrDie(); + if (slice_a != slice_b) { return InternalError( - "instruction %s does not share allocation with " - "instruction %s ", - a->ToString().c_str(), b->ToString().c_str()); + "instruction %s %s does not share allocation with " + "instruction %s %s", + a->ToString().c_str(), slice_a.ToString().c_str(), + b->ToString().c_str(), slice_b.ToString().c_str()); } return Status::OK(); }; @@ -1710,7 +1855,7 @@ std::unique_ptr IrEmitterUnnested::BuildWhileThunk( TF_CHECK_OK(body->root_instruction()->Accept(&ir_emitter_body)); return MakeUnique( - GetAllocationIndex(*condition->root_instruction()), // cond result + GetAllocationSlice(*condition->root_instruction()), // cond result ir_emitter_condition.ConsumeThunkSequence(), ir_emitter_body.ConsumeThunkSequence(), hlo); } diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc index 14760fe92cc..69399e36c4c 100644 --- 
a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc @@ -28,11 +28,9 @@ namespace se = ::perftools::gputools; namespace xla { namespace gpu { -using Index = BufferAllocation::Index; - -KernelThunk::KernelThunk(tensorflow::gtl::ArraySlice io_buffers, - const string& kernel_name, - const HloInstruction* hlo_instruction) +KernelThunk::KernelThunk( + tensorflow::gtl::ArraySlice io_buffers, + const string& kernel_name, const HloInstruction* hlo_instruction) : Thunk(Kind::kKernel, hlo_instruction), io_buffers_(io_buffers.begin(), io_buffers.end()), kernel_name_(kernel_name) {} @@ -62,20 +60,25 @@ tensorflow::Status KernelThunk::ExecuteOnStream( const BufferAllocations& buffer_allocations, se::Stream* stream) { // Load the kernel. se::StreamExecutor* executor = stream->parent(); - se::KernelBase kernel(executor); LaunchDimensions launch_dimensions; + const se::KernelBase* kernel = nullptr; { tensorflow::mutex_lock lock(mutex_); - if (!executor->GetKernel(*loader_spec_, &kernel)) { - return InternalError("Unable to load kernel %s", kernel_name_.c_str()); + auto it = kernel_cache_.find(executor); + if (kernel_cache_.end() == it) { + it = kernel_cache_.emplace(executor, se::KernelBase(executor)).first; + if (!executor->GetKernel(*loader_spec_, &it->second)) { + return InternalError("Unable to load kernel %s", kernel_name_.c_str()); + } } launch_dimensions = launch_dimensions_; + kernel = &it->second; } // Launch the kernel with potentially multiple blocks and threads. static constexpr int kKernelArgsLimit = 1024; auto kernel_args = MakeUnique>(); - for (BufferAllocation::Index io_buffer : io_buffers_) { + for (const BufferAllocation::Slice io_buffer : io_buffers_) { kernel_args->add_device_memory_argument( buffer_allocations.GetDeviceAddress(io_buffer)); } @@ -83,7 +86,7 @@ tensorflow::Status KernelThunk::ExecuteOnStream( buffer_allocations.GetTempBufferBase()); if (!stream->parent()->Launch( stream, se::ThreadDim(launch_dimensions.threads_per_block()), - se::BlockDim(launch_dimensions.block_count()), kernel, + se::BlockDim(launch_dimensions.block_count()), *kernel, *kernel_args)) { return InternalError("Unable to launch kernel %s", kernel_name_.c_str()); } diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h index 901825873ae..350b5aaf360 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h @@ -46,7 +46,7 @@ class KernelThunk : public Thunk { // Constructs a thunk for the given kernel. // // `hlo_instruction` is as in Thunk. Other arguments are as the class members. - KernelThunk(tensorflow::gtl::ArraySlice io_buffers, + KernelThunk(tensorflow::gtl::ArraySlice io_buffers, const string& kernel_name, const HloInstruction* hlo_instruction); KernelThunk(const KernelThunk&) = delete; KernelThunk& operator=(const KernelThunk&) = delete; @@ -64,7 +64,7 @@ class KernelThunk : public Thunk { private: // The indices of the input/output buffers. - const std::vector io_buffers_; + const std::vector io_buffers_; // Entry kernel name for the computation. 
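The kernel_cache_ change above loads each kernel at most once per StreamExecutor, with both the lookup and the insertion performed under the mutex. A standalone sketch of the pattern with toy Executor and Kernel types standing in for the StreamExecutor API:

```cpp
// Per-executor cache: look up under a mutex, insert on miss, and hand back
// a stable pointer to the cached entry on every subsequent call.
#include <cassert>
#include <mutex>
#include <string>
#include <unordered_map>

struct Executor {};
struct Kernel {
  std::string name;
};

class KernelCache {
 public:
  const Kernel* GetOrLoad(Executor* executor, const std::string& name) {
    std::lock_guard<std::mutex> lock(mu_);
    auto it = cache_.find(executor);
    if (it == cache_.end()) {
      // Miss: "load" the kernel once and remember it for this executor.
      it = cache_.emplace(executor, Kernel{name}).first;
    }
    return &it->second;
  }

 private:
  std::mutex mu_;
  std::unordered_map<Executor*, Kernel> cache_;
};

int main() {
  KernelCache cache;
  Executor exec;
  const Kernel* first = cache.GetOrLoad(&exec, "fusion_kernel");
  const Kernel* second = cache.GetOrLoad(&exec, "fusion_kernel");
  assert(first == second);  // same cached instance, loaded only once
  return 0;
}
```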
const string kernel_name_; @@ -78,6 +78,11 @@ class KernelThunk : public Thunk { mutable tensorflow::mutex mutex_; std::unique_ptr loader_spec_ GUARDED_BY(mutex_); + + // Loaded kernels for each `StreamExecutor` + std::unordered_map + kernel_cache_ GUARDED_BY(mutex_); }; } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/layout_assignment.cc b/tensorflow/compiler/xla/service/gpu/layout_assignment.cc index ff6cfd94484..66cc7b3e40d 100644 --- a/tensorflow/compiler/xla/service/gpu/layout_assignment.cc +++ b/tensorflow/compiler/xla/service/gpu/layout_assignment.cc @@ -79,26 +79,37 @@ Status GpuLayoutAssignment::AddBackendConstraints( // calls after we switch to cuDNN v5. const ConvolutionDimensionNumbers& dimension_numbers = instruction->convolution_dimension_numbers(); + std::vector input_layout; + for (int i = dimension_numbers.spatial_dimensions_size() - 1; i >= 0; + --i) { + input_layout.push_back(dimension_numbers.spatial_dimensions(i)); + } + input_layout.push_back(dimension_numbers.feature_dimension()); + input_layout.push_back(dimension_numbers.batch_dimension()); Shape input_shape(input->shape()); - *input_shape.mutable_layout() = - LayoutUtil::MakeLayout({dimension_numbers.spatial_dimensions(1), - dimension_numbers.spatial_dimensions(0), - dimension_numbers.feature_dimension(), - dimension_numbers.batch_dimension()}); + *input_shape.mutable_layout() = LayoutUtil::MakeLayout(input_layout); + std::vector filter_layout; + for (int i = dimension_numbers.kernel_spatial_dimensions_size() - 1; + i >= 0; --i) { + filter_layout.push_back(dimension_numbers.kernel_spatial_dimensions(i)); + } + filter_layout.push_back( + dimension_numbers.kernel_input_feature_dimension()); + filter_layout.push_back( + dimension_numbers.kernel_output_feature_dimension()); Shape filter_shape(filter->shape()); - *filter_shape.mutable_layout() = LayoutUtil::MakeLayout( - {dimension_numbers.kernel_spatial_dimensions(1), - dimension_numbers.kernel_spatial_dimensions(0), - dimension_numbers.kernel_input_feature_dimension(), - dimension_numbers.kernel_output_feature_dimension()}); + *filter_shape.mutable_layout() = LayoutUtil::MakeLayout(filter_layout); + std::vector output_layout; + for (int i = dimension_numbers.spatial_dimensions_size() - 1; i >= 0; + --i) { + output_layout.push_back(dimension_numbers.spatial_dimensions(i)); + } + output_layout.push_back(dimension_numbers.feature_dimension()); + output_layout.push_back(dimension_numbers.batch_dimension()); Shape output_shape(output->shape()); - *output_shape.mutable_layout() = - LayoutUtil::MakeLayout({dimension_numbers.spatial_dimensions(1), - dimension_numbers.spatial_dimensions(0), - dimension_numbers.feature_dimension(), - dimension_numbers.batch_dimension()}); + *output_shape.mutable_layout() = LayoutUtil::MakeLayout(output_layout); // Set layouts of the instructions' shapes. 
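The layout-assignment rewrite above generalizes the hard-coded 2D case to any number of spatial dimensions: the layout is built minor-to-major as the spatial dimensions in reverse, then the feature dimension, then the batch dimension. A sketch of that construction, with `DimensionNumbers` as a simplified stand-in for the proto accessors:

```c++
#include <cstdint>
#include <vector>

// Stand-in for the convolution dimension-number proto accessors.
struct DimensionNumbers {
  std::vector<int64_t> spatial_dimensions;
  int64_t feature_dimension;
  int64_t batch_dimension;
};

// Builds the minor-to-major layout used above: spatial dimensions reversed
// (innermost spatial dimension minor-most), then feature, then batch.
std::vector<int64_t> MakeMinorToMajorLayout(const DimensionNumbers& dnums) {
  std::vector<int64_t> layout;
  for (auto it = dnums.spatial_dimensions.rbegin();
       it != dnums.spatial_dimensions.rend(); ++it) {
    layout.push_back(*it);
  }
  layout.push_back(dnums.feature_dimension);
  layout.push_back(dnums.batch_dimension);
  return layout;
}
```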
if (instruction->opcode() == HloOpcode::kConvolution) { diff --git a/tensorflow/compiler/xla/service/gpu/layout_assignment_test.cc b/tensorflow/compiler/xla/service/gpu/layout_assignment_test.cc index 692ec8147d3..fa258b6e567 100644 --- a/tensorflow/compiler/xla/service/gpu/layout_assignment_test.cc +++ b/tensorflow/compiler/xla/service/gpu/layout_assignment_test.cc @@ -55,9 +55,9 @@ TEST_F(LayoutAssignmentTest, Elementwise) { HloInstruction::CreateParameter(1, ashape, "y")); auto add = builder.AddInstruction( HloInstruction::CreateBinary(ashape, HloOpcode::kAdd, x, y)); - HloModule module(TestName()); + auto module = CreateNewModule(); HloComputation* computation = - module.AddEntryComputation(builder.Build(add)); + module->AddEntryComputation(builder.Build(add)); ComputationLayout computation_layout( computation->ComputeProgramShape()); @@ -69,7 +69,7 @@ TEST_F(LayoutAssignmentTest, Elementwise) { ShapeLayout(result_shape_with_layout); GpuLayoutAssignment layout_assignment(&computation_layout); - EXPECT_TRUE(layout_assignment.Run(&module).ValueOrDie()); + EXPECT_TRUE(layout_assignment.Run(module.get()).ValueOrDie()); for (const HloInstruction* operand : add->operands()) { EXPECT_TRUE(LayoutUtil::Equal(add->shape().layout(), @@ -83,3 +83,7 @@ TEST_F(LayoutAssignmentTest, Elementwise) { } // namespace } // namespace gpu } // namespace xla + +int main(int argc, char** argv) { + return xla::ParseDebugOptionsFlagsAndRunTests(argc, argv); +} diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc index 12ea573a9c1..e03571a9672 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc @@ -25,6 +25,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/dump_ir_pass.h" #include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" +#include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/util.h" #include "external/llvm/include/llvm/ADT/STLExtras.h" @@ -53,6 +54,7 @@ limitations under the License. #include "external/llvm/include/llvm/Transforms/IPO/AlwaysInliner.h" #include "external/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h" +#include "external/llvm/include/llvm/Transforms/IPO/Internalize.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/lib/io/path.h" @@ -68,29 +70,64 @@ namespace { // Default inline threshold value to use in llvm. const int kDefaultInlineThreshold = 1100; -// Information about a GPU architecture for the backend. -struct GpuBackendInfo { - string libdevice_name; - string sm_name; -}; - -// Maps supported CUDA compute capability to a libdevice file to link for this -// capability. -std::map gpu_info_map = { - {"compute_20", {"libdevice.compute_20.10.bc", "sm_20"}}, - {"compute_30", {"libdevice.compute_30.10.bc", "sm_30"}}, - {"compute_35", {"libdevice.compute_35.10.bc", "sm_35"}}, - - // NVIDIA does not provide a separate libdevice for CC 3.7, but we can use - // the one for 3.5. - {"compute_37", {"libdevice.compute_35.10.bc", "sm_37"}}, -}; - -// Validate the --gpu_architecture command-line flag. 
-static void ValidateGPUArchitecture(const string& value) { - if (!gpu_info_map.count(value)) { - LOG(FATAL) << "value for --gpu_architecture must be compute_{20,30,35,37}"; +// Gets the libdevice filename for a particular compute capability. When +// presented with a GPU we don't recognize, we just return the libdevice from +// compute_20. +static string GetLibdeviceFilename(std::pair<int, int> compute_capability) { + // There are only four libdevice files: compute_{20,30,35,50}. Each GPU + // version gets mapped to one of these. Note in particular that sm_60 and + // sm_61 map to libdevice.compute_30. + static auto* m = new std::map<std::pair<int, int>, int>({{{2, 0}, 20}, + {{2, 1}, 20}, + {{3, 0}, 30}, + {{3, 2}, 30}, + {{3, 5}, 35}, + {{3, 7}, 35}, + {{5, 0}, 50}, + {{5, 2}, 50}, + {{5, 3}, 50}, + {{6, 0}, 30}, + {{6, 1}, 30}, + {{6, 2}, 30}}); + int libdevice_version = 20; + auto it = m->find(compute_capability); + if (it != m->end()) { + libdevice_version = it->second; + } else { + LOG(WARNING) << "Unknown compute capability (" << compute_capability.first + << ", " << compute_capability.second << "). " + << "Defaulting to libdevice for compute_" << libdevice_version; + } + return tensorflow::strings::StrCat("libdevice.compute_", libdevice_version, + ".10.bc"); +} + +// Gets the GPU name as it's known to LLVM for a given compute capability. If +// we see an unrecognized compute capability, we return "sm_20". +static string GetSmName(std::pair<int, int> compute_capability) { + static auto* m = new std::map<std::pair<int, int>, int>({{{2, 0}, 20}, + {{2, 1}, 21}, + {{3, 0}, 30}, + {{3, 2}, 32}, + {{3, 5}, 35}, + {{3, 7}, 37}, + {{5, 0}, 50}, + {{5, 2}, 52}, + {{5, 3}, 53}, + {{6, 0}, 60}, + {{6, 1}, 61}, + {{6, 2}, 62}}); + int sm_version = 20; + auto it = m->find(compute_capability); + if (it != m->end()) { + sm_version = it->second; + } else { + LOG(WARNING) << "Unknown compute capability (" << compute_capability.first + << ", " << compute_capability.second << "). " + << "Defaulting to telling LLVM that we're compiling for sm_" + << sm_version; + } + return tensorflow::strings::StrCat("sm_", sm_version); } // Convenience function for producing a name of a temporary compilation product @@ -135,8 +172,10 @@ std::unique_ptr<llvm::TargetMachine> GetTargetMachine( } TargetOptions target_options = InitTargetOptionsFromCodeGenFlags(); - // Set options from hlo_module_config (specifically, fast-math flags). - llvm_ir::SetTargetOptions(hlo_module_config, &target_options); + llvm_ir::SetTargetOptions( + /*fast_math_enabled=*/hlo_module_config.debug_options() + .xla_enable_fast_math(), + &target_options); // Enable FMA synthesis if desired. legacy_flags::GpuBackendLibFlags* flags = @@ -270,39 +309,41 @@ bool CouldNeedLibdevice(const llvm::Module& module) { } // Links libdevice into the given module if the module needs libdevice. 
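A condensed, self-contained sketch of the new lookup-with-fallback behavior: unknown compute capabilities now warn and fall back to compute_20 instead of aborting, as the deleted `--gpu_architecture` validation did. The table here is abbreviated; file names follow the `libdevice.compute_NN.10.bc` convention used above:

```c++
#include <map>
#include <string>
#include <utility>

// Maps a (major, minor) compute capability to a libdevice version; unknown
// GPUs fall back to compute_20 rather than aborting.
std::string LibdeviceFor(std::pair<int, int> cc) {
  static const std::map<std::pair<int, int>, int> kVersions = {
      {{2, 0}, 20}, {{3, 0}, 30}, {{3, 5}, 35}, {{5, 0}, 50}};
  auto it = kVersions.find(cc);
  const int version = it != kVersions.end() ? it->second : 20;  // fallback
  return "libdevice.compute_" + std::to_string(version) + ".10.bc";
}
// e.g. LibdeviceFor({6, 1}) == "libdevice.compute_20.10.bc" with this
// abbreviated table; the full table above maps {6, 1} to compute_30.
```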
-tensorflow::Status LinkLibdeviceIfNecessary(const string& libdevice_dir_path, - llvm::Module* module) { +tensorflow::Status LinkLibdeviceIfNecessary( + llvm::Module* module, std::pair compute_capability, + const string& libdevice_dir_path) { if (!CouldNeedLibdevice(*module)) { return tensorflow::Status::OK(); } llvm::Linker linker(*module); - legacy_flags::GpuBackendLibFlags* flags = - legacy_flags::GetGpuBackendLibFlags(); - ValidateGPUArchitecture(flags->gpu_architecture); - string libdevice_bc_filename = - gpu_info_map[flags->gpu_architecture].libdevice_name; - string libdevice_bc_fullpath = - tensorflow::io::JoinPath(libdevice_dir_path, libdevice_bc_filename); - TF_RETURN_IF_ERROR( - tensorflow::Env::Default()->FileExists(libdevice_bc_fullpath)); + string libdevice_path = tensorflow::io::JoinPath( + libdevice_dir_path, GetLibdeviceFilename(compute_capability)); + TF_RETURN_IF_ERROR(tensorflow::Env::Default()->FileExists(libdevice_path)); + VLOG(1) << "Linking with libdevice from: " << libdevice_path; std::unique_ptr libdevice_module = - LoadIRModule(libdevice_bc_fullpath, &module->getContext()); - VLOG(1) << "Linking with libdevice from: " << libdevice_bc_fullpath; - if (linker.linkInModule(std::move(libdevice_module), - llvm::Linker::Flags::InternalizeLinkedSymbols | - llvm::Linker::Flags::LinkOnlyNeeded)) { - LOG(FATAL) << "Error linking libdevice from " << libdevice_bc_fullpath; + LoadIRModule(libdevice_path, &module->getContext()); + if (linker.linkInModule( + std::move(libdevice_module), llvm::Linker::Flags::LinkOnlyNeeded, + [](Module& M, const StringSet<>& GVS) { + internalizeModule(M, [&M, &GVS](const GlobalValue& GV) { + return !GV.hasName() || (GVS.count(GV.getName()) == 0); + }); + })) { + return tensorflow::errors::Internal(tensorflow::strings::StrCat( + "Error linking libdevice from ", libdevice_path)); } return tensorflow::Status::OK(); } StatusOr CompileModuleToPtx(llvm::Module* module, + std::pair compute_capability, const HloModuleConfig& hlo_module_config, const string& libdevice_dir_path) { // Link the input module with libdevice, to pull in implementations of some // builtins. - TF_RETURN_IF_ERROR(LinkLibdeviceIfNecessary(libdevice_dir_path, module)); + TF_RETURN_IF_ERROR( + LinkLibdeviceIfNecessary(module, compute_capability, libdevice_dir_path)); legacy_flags::GpuBackendLibFlags* flags = legacy_flags::GetGpuBackendLibFlags(); @@ -351,17 +392,14 @@ StatusOr CompileModuleToPtx(llvm::Module* module, // Figure out the exact name of the processor as known to the NVPTX backend // from the gpu_architecture flag. - ValidateGPUArchitecture(flags->gpu_architecture); - string cpu_name = gpu_info_map[flags->gpu_architecture].sm_name; - - std::unique_ptr target_machine = - GetTargetMachine(target_triple, cpu_name, hlo_module_config); + std::unique_ptr target_machine = GetTargetMachine( + target_triple, GetSmName(compute_capability), hlo_module_config); module_passes.add(llvm::createTargetTransformInfoWrapperPass( target_machine->getTargetIRAnalysis())); // The LLVM IR verifier performs sanity checking on the IR. This helps // discover problems and report them in a meaningful manner, rather than let - // later passes report obscure assertions becasue of unfulfilled invariants. + // later passes report obscure assertions because of unfulfilled invariants. module_passes.add(llvm::createVerifierPass()); // Create the function-level pass manager. 
It needs data layout information @@ -370,9 +408,9 @@ StatusOr CompileModuleToPtx(llvm::Module* module, AddOptimizationPasses(flags->opt_level, /*size_level=*/0, target_machine.get(), &module_passes, &function_passes); - // Loop unrolling exposes more opportunites for SROA. Therefore, we run SROA + // Loop unrolling exposes more opportunities for SROA. Therefore, we run SROA // again after the standard optimization passes [http://b/13329423]. - // TODO(jingyue): SROA may further expose more optimization opportunites, such + // TODO(jingyue): SROA may further expose more optimization opportunities, such // as more precise alias analysis and more function inlining (SROA may change // the inlining cost of a function). For now, running SROA already emits good // enough code for the evaluated benchmarks. We may want to run more @@ -466,6 +504,7 @@ void GPUBackendInit() { } // namespace StatusOr CompileToPtx(llvm::Module* module, + std::pair compute_capability, const HloModuleConfig& hlo_module_config, const string& libdevice_dir_path) { static std::once_flag backend_init_flag; @@ -477,7 +516,8 @@ StatusOr CompileToPtx(llvm::Module* module, "Compile module " + llvm_ir::AsString(module->getName()), /*vlog_level=*/2); TF_ASSIGN_OR_RETURN( - ptx, CompileModuleToPtx(module, hlo_module_config, libdevice_dir_path)); + ptx, CompileModuleToPtx(module, compute_capability, hlo_module_config, + libdevice_dir_path)); } return ptx; } diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h index cf6f3197bb7..fd894072170 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h @@ -18,6 +18,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_LLVM_GPU_BACKEND_GPU_BACKEND_LIB_H_ #include +#include #include "external/llvm/include/llvm/IR/Module.h" #include "tensorflow/compiler/xla/service/hlo_module_config.h" @@ -28,14 +29,15 @@ limitations under the License. namespace xla { namespace gpu { -// The Compile.* interfaces each create their own llvm::LLVMContext objects for -// thread safety, but note that LLVM's multithreaded support is very -// preliminary; multithreaded use is not recommended at this time. -// // Compiles the argument module and returns it. libdevice_dir_path is the parent // directory of the libdevice bitcode libraries. The contents of the module may // be changed. +// +// The Compile.* interfaces each create their own llvm::LLVMContext objects for +// thread safety, but note that LLVM's multithreaded support is very +// preliminary; multithreaded use is not recommended at this time. StatusOr CompileToPtx(llvm::Module* module, + std::pair compute_capability, const HloModuleConfig& hlo_module_config, const string& libdevice_dir_path); diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.cc index c10346bbc23..72f6cfd2d60 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.cc +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.cc @@ -28,7 +28,8 @@ limitations under the License. 
namespace { static void DieWithSMDiagnosticError(llvm::SMDiagnostic* diagnostic) { - LOG(FATAL) << diagnostic->getLineNo() << ":" << diagnostic->getColumnNo() + LOG(FATAL) << diagnostic->getFilename().str() << ":" + << diagnostic->getLineNo() << ":" << diagnostic->getColumnNo() << ": " << diagnostic->getMessage().str(); } diff --git a/tensorflow/compiler/xla/service/gpu/pad_insertion.cc b/tensorflow/compiler/xla/service/gpu/pad_insertion.cc index c645e84aa4f..a12a9a71682 100644 --- a/tensorflow/compiler/xla/service/gpu/pad_insertion.cc +++ b/tensorflow/compiler/xla/service/gpu/pad_insertion.cc @@ -80,6 +80,7 @@ HloInstruction* MaybePaddedAndSlicedInput( std::vector start_indices(input->shape().dimensions_size(), 0); std::vector limit_indices(input->shape().dimensions().begin(), input->shape().dimensions().end()); + std::vector strides(input->shape().dimensions_size(), 1); for (size_t i = 0; i < conv_dnums.spatial_dimensions().size(); ++i) { int64 dim = conv_dnums.spatial_dimensions(i); // If dimension "dim" has negative padding, increase the start index or @@ -92,9 +93,9 @@ HloInstruction* MaybePaddedAndSlicedInput( input = computation->AddInstruction(HloInstruction::CreateSlice( ShapeInference::InferSliceShape(input->shape(), start_indices, - limit_indices) + limit_indices, strides) .ConsumeValueOrDie(), - input, start_indices, limit_indices)); + input, start_indices, limit_indices, strides)); } return input; @@ -354,6 +355,8 @@ bool PadInsertion::CanonicalizeBackwardInputConvolution( std::vector limit_indices( new_backward_conv->shape().dimensions().begin(), new_backward_conv->shape().dimensions().end()); + std::vector strides(new_backward_conv->shape().dimensions_size(), + 1LL); for (size_t i = 0; i < backward_conv->window().dimensions_size(); ++i) { int64 padding_low = backward_conv->window().dimensions(i).padding_low(); int64 padding_high = backward_conv->window().dimensions(i).padding_high(); @@ -373,13 +376,13 @@ bool PadInsertion::CanonicalizeBackwardInputConvolution( // Replace the old backward convolution with the slice. CHECK(ShapeUtil::Compatible( ShapeInference::InferSliceShape(new_backward_conv->shape(), start_indices, - limit_indices) + limit_indices, strides) .ConsumeValueOrDie(), backward_conv->shape())); TF_CHECK_OK(computation->ReplaceWithNewInstruction( backward_conv, HloInstruction::CreateSlice(backward_conv->shape(), new_backward_conv, - start_indices, limit_indices))); + start_indices, limit_indices, strides))); return true; } diff --git a/tensorflow/compiler/xla/service/gpu/partition_assignment.h b/tensorflow/compiler/xla/service/gpu/partition_assignment.h index 8ac4c599663..8f7fce884ac 100644 --- a/tensorflow/compiler/xla/service/gpu/partition_assignment.h +++ b/tensorflow/compiler/xla/service/gpu/partition_assignment.h @@ -33,7 +33,7 @@ namespace gpu { enum class PartitionStrategy { // Optimized for latency by allowing maximum number of registers per thread. kLatency, - // Optimized for throughtput. This may limit registers per thread and cause + // Optimized for throughput. This may limit registers per thread and cause // longer latency. 
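Back on the pad_insertion changes above: they thread an explicit unit-stride vector through `CreateSlice`/`InferSliceShape`, which now take strides. A strided slice dimension produces ceil((limit - start) / stride) elements, so unit strides leave the pass's output shapes unchanged. A tiny sketch (this `CeilOfRatio` is modeled on the XLA utility of the same name):

```c++
#include <cstdint>

// Ceiling division, modeled on the xla::CeilOfRatio utility.
int64_t CeilOfRatio(int64_t a, int64_t b) { return (a + b - 1) / b; }

// Number of elements a slice [start, limit) with the given stride produces.
int64_t SliceDimSize(int64_t start, int64_t limit, int64_t stride) {
  return CeilOfRatio(limit - start, stride);
}
// e.g. SliceDimSize(0, 8, 1) == 8, while SliceDimSize(0, 8, 3) == 3,
// picking elements {0, 3, 6}.
```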
kThroughput }; diff --git a/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc b/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc index 28d47d2b0f8..a5230b3e8e9 100644 --- a/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc +++ b/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc @@ -45,10 +45,10 @@ TEST_F(StreamAssignmentTest, SequentialMatMul) { HloInstruction* dot2 = builder.AddInstruction( HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, dot1, z)); - HloModule module(TestName()); - module.AddEntryComputation(builder.Build(dot2)); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build(dot2)); - std::unique_ptr assignment = AssignStreams(module); + std::unique_ptr assignment = AssignStreams(*module); EXPECT_EQ(assignment->StreamNumberForHlo(*dot1), assignment->StreamNumberForHlo(*dot2)); } @@ -66,10 +66,10 @@ TEST_F(StreamAssignmentTest, ConcurrentMatMul) { HloInstruction* add = builder.AddInstruction( HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kAdd, dot1, dot2)); - HloModule module(TestName()); - module.AddEntryComputation(builder.Build(add)); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build(add)); - std::unique_ptr assignment = AssignStreams(module); + std::unique_ptr assignment = AssignStreams(*module); EXPECT_NE(assignment->StreamNumberForHlo(*dot1), assignment->StreamNumberForHlo(*dot2)); } @@ -86,6 +86,7 @@ TEST_F(StreamAssignmentTest, LatticeMatMul) { // d40 -- layer 4 HloComputation::Builder builder("entry_computation"); std::vector params; + params.reserve(6); for (int i = 0; i < 6; ++i) { params.push_back(builder.AddInstruction(HloInstruction::CreateParameter( i, f32_2x2_, /*name=*/tensorflow::strings::Printf("param%d", i)))); @@ -109,10 +110,10 @@ TEST_F(StreamAssignmentTest, LatticeMatMul) { HloInstruction* d40 = builder.AddInstruction( HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, d30, d31)); - HloModule module(TestName()); - module.AddEntryComputation(builder.Build(d40)); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build(d40)); - std::unique_ptr assignment = AssignStreams(module); + std::unique_ptr assignment = AssignStreams(*module); // The two dots on layer 1 are concurrent. EXPECT_NE(assignment->StreamNumberForHlo(*d10), assignment->StreamNumberForHlo(*d11)); @@ -130,3 +131,7 @@ TEST_F(StreamAssignmentTest, LatticeMatMul) { } // namespace gpu } // namespace xla + +int main(int argc, char** argv) { + return xla::ParseDebugOptionsFlagsAndRunTests(argc, argv); +} diff --git a/tensorflow/compiler/xla/service/gpu/temp_buffer_offsets.cc b/tensorflow/compiler/xla/service/gpu/temp_buffer_offsets.cc deleted file mode 100644 index 3cf5dd021a1..00000000000 --- a/tensorflow/compiler/xla/service/gpu/temp_buffer_offsets.cc +++ /dev/null @@ -1,52 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/compiler/xla/service/gpu/temp_buffer_offsets.h" - -#include "tensorflow/compiler/xla/map_util.h" - -namespace xla { -namespace gpu { - -namespace { -int64 RoundUpToAlignment(int64 value) { - // Any address of a variable residing in global memory or returned by one of - // the memory allocation routines from the driver or runtime API is always - // aligned to at least 256 bytes. - // (http://docs.nvidia.com/cuda/cuda-c-programming-guide/#device-memory-accesses) - static constexpr int64 kCudaMallocAlignment = 256; - return (value + kCudaMallocAlignment - 1) / kCudaMallocAlignment * - kCudaMallocAlignment; -} -} // namespace - -TempBufferOffsets::TempBufferOffsets( - const BufferAssignment& buffer_assignment) { - total_size_of_temp_buffers_ = 0; - for (auto i = 0; i < buffer_assignment.Allocations().size(); ++i) { - const BufferAllocation& allocation = buffer_assignment.GetAllocation(i); - if (allocation.IsPreallocatedTempBuffer()) { - InsertOrDie(&buffer_index_to_offset_, i, total_size_of_temp_buffers_); - total_size_of_temp_buffers_ += RoundUpToAlignment(allocation.size()); - } - } -} - -int64 TempBufferOffsets::GetOffset(BufferAllocation::Index index) const { - return FindOrDie(buffer_index_to_offset_, index); -} - -} // namespace gpu -} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/temp_buffer_offsets.h b/tensorflow/compiler/xla/service/gpu/temp_buffer_offsets.h deleted file mode 100644 index 05aca99bf34..00000000000 --- a/tensorflow/compiler/xla/service/gpu/temp_buffer_offsets.h +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_TEMP_BUFFER_OFFSETS_H_ -#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_TEMP_BUFFER_OFFSETS_H_ - -#include - -#include "tensorflow/compiler/xla/service/buffer_assignment.h" -#include "tensorflow/core/platform/types.h" - -namespace xla { -namespace gpu { - -// GpuExecutable merges all temporary buffers into one memory block. This class -// stores the offset of each temporary buffer in that memory block. -class TempBufferOffsets { - public: - explicit TempBufferOffsets(const BufferAssignment& buffer_assignment); - - int64 GetOffset(BufferAllocation::Index index) const; - int64 TotalSizeInBytes() const { return total_size_of_temp_buffers_; } - - private: - std::map buffer_index_to_offset_; - - // The total size of all temporary buffers. This includes paddings that are - // necessary for alignment. 
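For context on the deleted helper: `TempBufferOffsets` packed all temp buffers into one block, rounding each offset up to the 256-byte alignment that CUDA guarantees for device allocations. A minimal sketch of that round-up invariant, taken directly from the removed code:

```c++
#include <cstdint>

// CUDA device allocations are aligned to at least 256 bytes, so offsets
// within the merged temp block were rounded up to that boundary.
constexpr int64_t kCudaMallocAlignment = 256;

int64_t RoundUpToAlignment(int64_t value) {
  return (value + kCudaMallocAlignment - 1) / kCudaMallocAlignment *
         kCudaMallocAlignment;
}
// e.g. RoundUpToAlignment(1) == 256, RoundUpToAlignment(256) == 256,
// RoundUpToAlignment(257) == 512.
```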
- int64 total_size_of_temp_buffers_; -}; - -} // namespace gpu -} // namespace xla - -#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_TEMP_BUFFER_OFFSETS_H_ diff --git a/tensorflow/compiler/xla/service/gpu/thunk.h b/tensorflow/compiler/xla/service/gpu/thunk.h index 3ced3484007..0ff27888ad7 100644 --- a/tensorflow/compiler/xla/service/gpu/thunk.h +++ b/tensorflow/compiler/xla/service/gpu/thunk.h @@ -44,6 +44,7 @@ class Thunk { kConvolution, kCopy, kGemm, + kInfeed, kKernel, kSequential, kTuple, diff --git a/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc b/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc index 8addcd87eaa..bdb062837c5 100644 --- a/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc +++ b/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc @@ -79,7 +79,7 @@ ThunkSchedule::ThunkSchedule( void ThunkSchedule::RemoveRedundantDependencyEdges() { std::unordered_map thunk_to_total_order; - for (auto i = 0; i < thunk_total_order_.size(); ++i) { + for (int i = 0; i < thunk_total_order_.size(); ++i) { InsertOrDie(&thunk_to_total_order, thunk_total_order_[i], i); } diff --git a/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc b/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc index 323775b3e84..bd65e72393a 100644 --- a/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc @@ -25,7 +25,7 @@ namespace gpu { tensorflow::Status TupleThunk::ExecuteOnStream( const BufferAllocations& buffer_allocations, se::Stream* stream) { std::vector tuple_element_buffer_addresses; - for (BufferAllocation::Index tuple_element_buffer : tuple_element_buffers_) { + for (BufferAllocation::Slice tuple_element_buffer : tuple_element_buffers_) { tuple_element_buffer_addresses.push_back( buffer_allocations.GetDeviceAddress(tuple_element_buffer).opaque()); } diff --git a/tensorflow/compiler/xla/service/gpu/tuple_thunk.h b/tensorflow/compiler/xla/service/gpu/tuple_thunk.h index ca0404286fb..3b1a4963285 100644 --- a/tensorflow/compiler/xla/service/gpu/tuple_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/tuple_thunk.h @@ -33,9 +33,9 @@ namespace gpu { // issue (b/31336476). class TupleThunk : public Thunk { public: - TupleThunk(tensorflow::gtl::ArraySlice + TupleThunk(tensorflow::gtl::ArraySlice tuple_element_buffers, - BufferAllocation::Index dest_buffer, + const BufferAllocation::Slice& dest_buffer, const HloInstruction* hlo_instruction) : Thunk(Kind::kTuple, hlo_instruction), tuple_element_buffers_(tuple_element_buffers.begin(), @@ -50,8 +50,8 @@ class TupleThunk : public Thunk { perftools::gputools::Stream* stream) override; private: - std::vector tuple_element_buffers_; - const BufferAllocation::Index dest_buffer_; + const std::vector tuple_element_buffers_; + const BufferAllocation::Slice dest_buffer_; }; } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.cc b/tensorflow/compiler/xla/service/gpu/while_thunk.cc index 36883e4920a..0d2412096ab 100644 --- a/tensorflow/compiler/xla/service/gpu/while_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/while_thunk.cc @@ -22,10 +22,11 @@ limitations under the License. 
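The recurring Index-to-Slice migration in these thunks swaps "which allocation" for "which sub-range of an allocation", so several logical buffers can share one allocation yet still compare unequal. A conceptual stand-in, not the real `BufferAllocation::Slice` API:

```c++
#include <cstdint>

// Hypothetical stand-in: a Slice pins down a byte range inside a specific
// allocation.
struct Slice {
  int64_t allocation_index;  // which allocation
  int64_t offset;            // byte offset within the allocation
  int64_t size;              // extent in bytes
};

inline bool operator==(const Slice& a, const Slice& b) {
  return a.allocation_index == b.allocation_index && a.offset == b.offset &&
         a.size == b.size;
}
```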
namespace xla { namespace gpu { -WhileThunk::WhileThunk(BufferAllocation::Index condition_result_buffer_index, - std::unique_ptr condition_thunk_sequence, - std::unique_ptr body_thunk_sequence, - const HloInstruction* hlo) +WhileThunk::WhileThunk( + const BufferAllocation::Slice& condition_result_buffer_index, + std::unique_ptr condition_thunk_sequence, + std::unique_ptr body_thunk_sequence, + const HloInstruction* hlo) : Thunk(Kind::kWhile, hlo), condition_result_buffer_index_(condition_result_buffer_index), condition_thunk_sequence_(MakeUnique( diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.h b/tensorflow/compiler/xla/service/gpu/while_thunk.h index 1658cdaf87f..95ed5497cea 100644 --- a/tensorflow/compiler/xla/service/gpu/while_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/while_thunk.h @@ -38,7 +38,7 @@ namespace gpu { class WhileThunk : public Thunk { public: // Constructs a WhileThunk to compute while instruction 'hlo'. - WhileThunk(BufferAllocation::Index condition_result_buffer_index, + WhileThunk(const BufferAllocation::Slice& condition_result_buffer_index, std::unique_ptr condition_thunk_sequence, std::unique_ptr body_thunk_sequence, const HloInstruction* hlo); @@ -51,7 +51,7 @@ class WhileThunk : public Thunk { perftools::gputools::Stream* stream) override; private: - BufferAllocation::Index condition_result_buffer_index_; + const BufferAllocation::Slice condition_result_buffer_index_; std::unique_ptr condition_thunk_sequence_; std::unique_ptr body_thunk_sequence_; }; diff --git a/tensorflow/compiler/xla/service/gpu/while_transformer.cc b/tensorflow/compiler/xla/service/gpu/while_transformer.cc index ec75e135814..06b01d311da 100644 --- a/tensorflow/compiler/xla/service/gpu/while_transformer.cc +++ b/tensorflow/compiler/xla/service/gpu/while_transformer.cc @@ -37,7 +37,7 @@ namespace { // patterns to match. // // Each ExprTree node is comprised of an HloOpcode, and a set of operands (each -// of type ExprTree). Operands can be added by specifing the index and HloOpcode +// of type ExprTree). Operands can be added by specifying the index and HloOpcode // of the operand. // // For example, the following computation: @@ -122,10 +122,12 @@ class ExprTree { Status Match(const HloInstruction* instruction, TaggedInstructionMap* tagged_instructions) const { if (opcode_ != instruction->opcode()) { - return InvalidArgument("Unexpected opcode: %s", - HloOpcodeString(instruction->opcode()).c_str()); + return InvalidArgument("got opcode %s, want %s", + HloOpcodeString(instruction->opcode()).c_str(), + HloOpcodeString(opcode_).c_str()); } + VLOG(2) << "Matched " << HloOpcodeString(opcode_) << ": " << tag_; if (!tag_.empty()) { tagged_instructions->insert({tag_, instruction}); } @@ -166,7 +168,7 @@ class MatcherBase { virtual ~MatcherBase() {} // Attempts to match each ExprTree in 'expr_trees_'. - // Returns OK on the first succesful match, error status otherwise. + // Returns OK on the first successful match, error status otherwise. 
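A minimal standalone sketch of the ExprTree matching idea described above: each node records an expected opcode, an optional tag, and sub-patterns keyed by operand index; `Match` walks the graph and fills a tag-to-instruction map. All types here are simplified stand-ins for the real `HloInstruction`/`ExprTree`:

```c++
#include <cstddef>
#include <map>
#include <string>
#include <vector>

enum class Opcode { kWhile, kTuple, kCopy, kConstant };

// Stand-in for HloInstruction: an opcode plus operands.
struct Node {
  Opcode opcode;
  std::vector<Node*> operands;
};

struct ExprTree {
  Opcode opcode;
  std::string tag;                 // empty means "don't record"
  std::vector<int> operand_index;  // which operand each child matches
  std::vector<ExprTree> children;  // sub-patterns, parallel to operand_index

  bool Match(const Node* n, std::map<std::string, const Node*>* tagged) const {
    if (n->opcode != opcode) return false;
    if (!tag.empty()) (*tagged)[tag] = n;
    for (size_t i = 0; i < children.size(); ++i) {
      const int idx = operand_index[i];
      if (idx >= static_cast<int>(n->operands.size()) ||
          !children[i].Match(n->operands[idx], tagged)) {
        return false;
      }
    }
    return true;
  }
};
```

The simplified init pattern in the diff below then reads as nested literals: kWhile wrapping kTuple wrapping kCopy wrapping a tagged kConstant.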
virtual tensorflow::Status Run() { Status status; for (const ExprTree& expr_tree : expr_trees_) { @@ -238,7 +240,7 @@ class MatcherBase { // class WhileConditionComputationMatcher : public MatcherBase { public: - WhileConditionComputationMatcher(const HloComputation* computation) + explicit WhileConditionComputationMatcher(const HloComputation* computation) : computation_(computation) { expr_trees_.emplace_back(BuildCondExprTree()); } @@ -275,6 +277,7 @@ class WhileConditionComputationMatcher : public MatcherBase { } Status MatchExprTree(const ExprTree& expr_tree) override { + VLOG(2) << "MATCHING while condition"; ExprTree::TaggedInstructionMap tagged_instructions; TF_RETURN_IF_ERROR(expr_tree.Match(computation_->root_instruction(), &tagged_instructions)); @@ -344,10 +347,6 @@ class WhileInitOperandMatcher : public MatcherBase { // // Const // | - // Tuple1 - // | - // GTE0 - // | // Copy // | // Tuple0 @@ -355,15 +354,15 @@ class WhileInitOperandMatcher : public MatcherBase { // While // ExprTree BuildInitExprTree() { - ExprTree gte0(HloOpcode::kGetTupleElement, "gte", - ExprTree(HloOpcode::kTuple, tuple_index_, - ExprTree(HloOpcode::kConstant, "loop_start"))); - return ExprTree(HloOpcode::kWhile, "while", - ExprTree(HloOpcode::kTuple, tuple_index_, - ExprTree(HloOpcode::kCopy, gte0))); + return ExprTree( + HloOpcode::kWhile, "while", + ExprTree(HloOpcode::kTuple, tuple_index_, + ExprTree(HloOpcode::kCopy, + ExprTree(HloOpcode::kConstant, "loop_start")))); } Status MatchExprTree(const ExprTree& expr_tree) override { + VLOG(2) << "MATCHING while init"; ExprTree::TaggedInstructionMap tagged_instructions; TF_RETURN_IF_ERROR(expr_tree.Match(while_hlo_, &tagged_instructions)); @@ -375,14 +374,6 @@ class WhileInitOperandMatcher : public MatcherBase { while_hlo->name().c_str()); } - // Get tagged GTE instruction and check 'tuple_index_'. - TF_ASSIGN_OR_RETURN(const HloInstruction* gte, - GetTaggedInstruction("gte", tagged_instructions)); - if (gte->tuple_index() != tuple_index_) { - return InvalidArgument("Unexpected tuple index instruction : %s", - gte->name().c_str()); - } - // Get tagged Constant instruction and parse 'loop_start_'. TF_ASSIGN_OR_RETURN( const HloInstruction* const_hlo, @@ -427,10 +418,6 @@ class WhileBodyComputationMatcher : public MatcherBase { // \ / \ / // Fusion -----------> Add // | - // Tuple1 - // | - // GTE0 - // | // Copy // | // Tuple0 @@ -450,15 +437,13 @@ class WhileBodyComputationMatcher : public MatcherBase { fusion.SetFusedRoot(fused_root); // Build top-level computation. - ExprTree tuple0( - HloOpcode::kTuple, tuple_index_, - ExprTree(HloOpcode::kCopy, - ExprTree(HloOpcode::kGetTupleElement, "gte", - ExprTree(HloOpcode::kTuple, tuple_index_, fusion)))); + ExprTree tuple0(HloOpcode::kTuple, tuple_index_, + ExprTree(HloOpcode::kCopy, fusion)); return tuple0; } Status MatchExprTree(const ExprTree& expr_tree) override { + VLOG(2) << "MATCHING while body"; ExprTree::TaggedInstructionMap tagged_instructions; TF_RETURN_IF_ERROR(expr_tree.Match(computation_->root_instruction(), &tagged_instructions)); diff --git a/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc b/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc index ddf9676e378..e82491fd6f9 100644 --- a/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc +++ b/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc @@ -17,16 +17,20 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/copy_insertion.h" #include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h" +#include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" namespace xla { namespace { +using ::testing::Eq; +using ::testing::HasSubstr; + class WhileTransformerTest : public HloTestBase { protected: WhileTransformerTest() - : module_(TestName()), + : module_(CreateNewModule()), induction_variable_shape_(ShapeUtil::MakeShape(S32, {})), data_shape_(ShapeUtil::MakeShape(F32, {8})), loop_state_shape_(ShapeUtil::MakeTupleShape( @@ -98,26 +102,26 @@ class WhileTransformerTest : public HloTestBase { HloInstruction::CreateTuple({data_init, induction_var_init})); auto while_hlo = builder.AddInstruction(HloInstruction::CreateWhile( loop_state_shape_, condition, body, loop_state_init)); - module_.AddEntryComputation(builder.Build()); + module_->AddEntryComputation(builder.Build()); return while_hlo; } void RunFusionPasses() { // Run standard fusion passes. EXPECT_TRUE(gpu::GpuInstructionFusion(/*may_duplicate=*/false) - .Run(&module_) + .Run(module_.get()) .ValueOrDie()); EXPECT_TRUE(gpu::GpuInstructionFusion(/*may_duplicate=*/true) - .Run(&module_) + .Run(module_.get()) .ValueOrDie()); } void RunCopyInsertionPass() { CopyInsertion copy_insertion; - EXPECT_IS_OK(copy_insertion.Run(&module_).status()); + EXPECT_IS_OK(copy_insertion.Run(module_.get()).status()); } - HloModule module_; + std::unique_ptr module_; Shape induction_variable_shape_; Shape data_shape_; Shape loop_state_shape_; @@ -127,74 +131,72 @@ class WhileTransformerTest : public HloTestBase { TEST_F(WhileTransformerTest, InductionVariableAtTupleElement0) { // Build computation with induction variable at tuple element 0. auto condition = - module_.AddEmbeddedComputation(BuildConditionComputation(0, 10)); - auto body = module_.AddEmbeddedComputation(BuildBodyComputation(0, 1, 1)); + module_->AddEmbeddedComputation(BuildConditionComputation(0, 10)); + auto body = module_->AddEmbeddedComputation(BuildBodyComputation(0, 1, 1)); auto while_hlo = BuildWhileInstruction(condition, body, 0, 0); // Run HLO Optimization passes. RunFusionPasses(); RunCopyInsertionPass(); // Run WhileTransformer. auto result = gpu::CanTransformWhileToFor(while_hlo); - EXPECT_TRUE(result.ok()); + ASSERT_TRUE(result.ok()); // Check results. - auto tuple = result.ConsumeValueOrDie(); - EXPECT_EQ(0, std::get<0>(tuple)); - EXPECT_EQ(10, std::get<1>(tuple)); - EXPECT_EQ(1, std::get<2>(tuple)); + EXPECT_THAT(result.ConsumeValueOrDie(), + Eq(std::tuple(0, 10, 1))); } TEST_F(WhileTransformerTest, InductionVariableAtTupleElement1) { // Build computation with induction variable at tuple element 1. auto condition = - module_.AddEmbeddedComputation(BuildConditionComputation(1, 10)); - auto body = module_.AddEmbeddedComputation(BuildBodyComputation(1, 0, 1)); + module_->AddEmbeddedComputation(BuildConditionComputation(1, 10)); + auto body = module_->AddEmbeddedComputation(BuildBodyComputation(1, 0, 1)); auto while_hlo = BuildWhileInstruction(condition, body, 1, 0); // Run HLO Optimization passes. RunFusionPasses(); RunCopyInsertionPass(); // Run WhileTransformer. auto result = gpu::CanTransformWhileToFor(while_hlo); - EXPECT_TRUE(result.ok()); + ASSERT_TRUE(result.ok()); // Check results. 
- auto tuple = result.ConsumeValueOrDie(); - EXPECT_EQ(0, std::get<0>(tuple)); - EXPECT_EQ(10, std::get<1>(tuple)); - EXPECT_EQ(1, std::get<2>(tuple)); + EXPECT_THAT(result.ConsumeValueOrDie(), + Eq(std::tuple(0, 10, 1))); } TEST_F(WhileTransformerTest, InvalidLoopLimit) { // Build computation with invalid loop limit. auto condition = - module_.AddEmbeddedComputation(BuildConditionComputation(0, 5)); - auto body = module_.AddEmbeddedComputation(BuildBodyComputation(0, 1, 1)); + module_->AddEmbeddedComputation(BuildConditionComputation(0, 5)); + auto body = module_->AddEmbeddedComputation(BuildBodyComputation(0, 1, 1)); auto while_hlo = BuildWhileInstruction(condition, body, 0, 10); // Run HLO Optimization passes. RunFusionPasses(); RunCopyInsertionPass(); // Run WhileTransformer. auto result = gpu::CanTransformWhileToFor(while_hlo); - EXPECT_FALSE(result.ok()); - EXPECT_MATCH( - result.status().error_message(), - testing::ContainsRegex("Loop start must be less than loop limit.")); + ASSERT_FALSE(result.ok()); + EXPECT_THAT(result.status().error_message(), + HasSubstr("Loop start must be less than loop limit.")); } TEST_F(WhileTransformerTest, InvalidLoopIncrement) { // Build computation with invalid loop increment. auto condition = - module_.AddEmbeddedComputation(BuildConditionComputation(0, 10)); - auto body = module_.AddEmbeddedComputation(BuildBodyComputation(0, 1, -1)); + module_->AddEmbeddedComputation(BuildConditionComputation(0, 10)); + auto body = module_->AddEmbeddedComputation(BuildBodyComputation(0, 1, -1)); auto while_hlo = BuildWhileInstruction(condition, body, 0, 0); // Run HLO Optimization passes. RunFusionPasses(); RunCopyInsertionPass(); // Run WhileTransformer. auto result = gpu::CanTransformWhileToFor(while_hlo); - EXPECT_FALSE(result.ok()); - EXPECT_MATCH( - result.status().error_message(), - testing::ContainsRegex("Loop increment must greater than zero.")); + ASSERT_FALSE(result.ok()); + EXPECT_THAT(result.status().error_message(), + HasSubstr("Loop increment must greater than zero.")); } } // namespace } // namespace xla + +int main(int argc, char** argv) { + return xla::ParseDebugOptionsFlagsAndRunTests(argc, argv); +} diff --git a/tensorflow/compiler/xla/service/gpu_transfer_manager.cc b/tensorflow/compiler/xla/service/gpu_transfer_manager.cc new file mode 100644 index 00000000000..4b8d190a463 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu_transfer_manager.cc @@ -0,0 +1,102 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu_transfer_manager.h" + +#include +#include +#include + +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/gpu/infeed_manager.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/stream_executor_no_cuda.h" + +namespace se = ::perftools::gputools; + +namespace xla { + +// TODO(b/30467474) Once GPU infeed implementation settles, consider +// folding back the cpu and gpu infeed implementations into a generic +// one if possible. +GpuTransferManager::GpuTransferManager() + : GenericTransferManager(se::cuda::kCudaPlatformId) {} + +Status GpuTransferManager::TransferLiteralToInfeed(se::StreamExecutor* executor, + const Literal& literal) { + const Shape& shape = literal.shape(); + VLOG(2) << "Transferring literal shape to infeed: " + << ShapeUtil::HumanString(shape); + + // TODO(b/30467474) handle tuples. + if (ShapeUtil::IsTuple(shape)) { + return Unimplemented("Infeed with a tuple shape is not supported: %s", + ShapeUtil::HumanString(literal.shape()).c_str()); + } + + int64 size = GetByteSizeRequirement(shape); + if (size > std::numeric_limits::max()) { + return Unimplemented("Infeed shape is too large: %s needs %lld bytes", + ShapeUtil::HumanString(literal.shape()).c_str(), size); + } + + if (size == 0) { + return Unimplemented("Infeed shape %s needs 0 bytes", + ShapeUtil::HumanString(literal.shape()).c_str()); + } + + gpu::InfeedManager* infeed_manager = gpu::GetOrCreateInfeedManager(); + se::Stream* stream = infeed_manager->GetStream(executor); + if (stream == nullptr) { + return InternalError("Failed to obtain a stream"); + } + + gpu::InfeedBuffer* buffer = new gpu::InfeedBuffer(executor, size); + stream->ThenMemcpy(buffer->device_memory(), + LiteralUtil::InternalData(literal), size); + + VLOG(2) << "Queued infeed data on stream " << stream; + + if (!stream->BlockHostUntilDone()) { + buffer->Done(); + return InternalError("Failed to complete data transfer on stream %p", + stream); + } + + infeed_manager->EnqueueBuffer(buffer); + + VLOG(2) << "Infeed data transferred"; + return Status::OK(); +} + +} // namespace xla + +static std::unique_ptr CreateGpuTransferManager() { + return xla::MakeUnique(); +} + +static bool InitModule() { + xla::TransferManager::RegisterTransferManager(se::cuda::kCudaPlatformId, + &CreateGpuTransferManager); + return true; +} +static bool module_initialized = InitModule(); diff --git a/tensorflow/compiler/xla/service/gpu_transfer_manager.h b/tensorflow/compiler/xla/service/gpu_transfer_manager.h new file mode 100644 index 00000000000..6dfe7ba0295 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu_transfer_manager.h @@ -0,0 +1,47 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_TRANSFER_MANAGER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_TRANSFER_MANAGER_H_ + +#include + +#include "tensorflow/compiler/xla/service/generic_transfer_manager.h" +#include "tensorflow/compiler/xla/service/transfer_manager.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/stream_executor_no_cuda.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { + +// An implementation of the XLA GenericTransferManager that +// handles GPU-specific infeed. +class GpuTransferManager : public GenericTransferManager { + public: + GpuTransferManager(); + ~GpuTransferManager() override {} + + Status TransferLiteralToInfeed(perftools::gputools::StreamExecutor* executor, + const Literal& literal) override; + + private: + TF_DISALLOW_COPY_AND_ASSIGN(GpuTransferManager); +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_TRANSFER_MANAGER_H_ diff --git a/tensorflow/compiler/xla/service/heap_simulator.cc b/tensorflow/compiler/xla/service/heap_simulator.cc new file mode 100644 index 00000000000..86f62accd3b --- /dev/null +++ b/tensorflow/compiler/xla/service/heap_simulator.cc @@ -0,0 +1,607 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/heap_simulator.h" + +#include +#include + +#include "tensorflow/compiler/xla/map_util.h" +#include "tensorflow/compiler/xla/service/liveness_util.h" +#include "tensorflow/compiler/xla/util.h" + +namespace xla { + +using tensorflow::gtl::FlatMap; +using tensorflow::gtl::FlatSet; + +namespace { + +// Returns the set of buffers that may be sources of all operands of the given +// instruction. The returned buffers are guaranteed to have no duplicates, and +// to be sorted in a deterministic order. 
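The helper that follows uses a dedupe-then-sort idiom: collect into a hash set to drop duplicates, then order by buffer id so iteration is deterministic across runs. A distilled sketch with a stand-in `Buffer` type:

```c++
#include <algorithm>
#include <cstdint>
#include <unordered_set>
#include <vector>

struct Buffer { int64_t id; };  // stand-in for LogicalBuffer

std::vector<const Buffer*> UniqueSorted(const std::vector<const Buffer*>& in) {
  // The hash set removes duplicates but has no stable order...
  std::unordered_set<const Buffer*> unique(in.begin(), in.end());
  // ...so sort by id to make the result deterministic.
  std::vector<const Buffer*> sorted(unique.begin(), unique.end());
  std::sort(sorted.begin(), sorted.end(),
            [](const Buffer* a, const Buffer* b) { return a->id < b->id; });
  return sorted;
}
```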
+std::vector UniqueOperandSourceBuffers( + const HloInstruction* instruction, + const TuplePointsToAnalysis& points_to_analysis) { + FlatSet buffers; + for (const HloInstruction* operand : instruction->operands()) { + FlatSet sources = + points_to_analysis.GetPointsToSet(operand).CreateFlattenedSet(); + buffers.insert(sources.begin(), sources.end()); + } + std::vector sorted(buffers.begin(), buffers.end()); + std::sort(sorted.begin(), sorted.end(), + [](const LogicalBuffer* a, const LogicalBuffer* b) { + return a->id() < b->id(); + }); + return sorted; +} + +} // namespace + +/*static*/ +StatusOr HeapSimulator::Run( + std::unique_ptr algorithm, const HloModule& module, + const SequentialHloOrdering::HloModuleSequence& module_sequence, + const TuplePointsToAnalysis& points_to_analysis, + const LogicalBuffer::SizeFunction& size_fn, + const FlatSet* buffers_to_assign) { + HeapSimulator heap(std::move(algorithm), size_fn, buffers_to_assign, + &module_sequence); + const HloComputation* entry_computation = module.entry_computation(); + const std::vector& instruction_sequence = + FindOrDie(module_sequence, entry_computation); + TF_RETURN_IF_ERROR(heap.RunComputation( + *entry_computation, instruction_sequence, points_to_analysis)); + return heap.Finish(); +} + +/*static*/ +StatusOr HeapSimulator::Run( + std::unique_ptr algorithm, const HloComputation& computation, + const std::vector& instruction_sequence, + const TuplePointsToAnalysis& points_to_analysis, + const LogicalBuffer::SizeFunction& size_fn, + const FlatSet* buffers_to_assign) { + HeapSimulator heap(std::move(algorithm), size_fn, buffers_to_assign, + /*module_sequence=*/nullptr); + TF_RETURN_IF_ERROR(heap.RunComputation(computation, instruction_sequence, + points_to_analysis)); + return heap.Finish(); +} + +// Runs a heap simulation for the given 'computation', assuming the given +// 'instruction_sequence'. +Status HeapSimulator::RunComputation( + const HloComputation& computation, + const std::vector& instruction_sequence, + const TuplePointsToAnalysis& points_to_analysis) { + // The goal here is to minimize memory usage, assuming the given sequential + // ordering of instructions. The strategy is to walk through the instruction + // sequence, calling Alloc and Free on the underlying heap algorithm. The + // heap algorithm takes care of packing and reducing fragmentation. + // + // 'live_buffers' tracks the liveness of each buffer that we assign, by + // associating it with a set of HloInstructions that need to be visited. When + // the set becomes empty, the buffer is no longer used, and can be freed. + FlatMap> live_buffers; + + const HloInstruction* root = computation.root_instruction(); + FlatSet output_source_buffers = + points_to_analysis.GetPointsToSet(root).CreateFlattenedSet(); + + for (const HloInstruction* instruction : instruction_sequence) { + const std::vector& buffers_defined_by_instruction = + points_to_analysis.GetBuffersDefinedByInstruction(instruction); + + // Initialize live_buffers for each buffer that we're going to assign. The + // set of instructions that need to be visited contains all users of all + // aliases. The alias itself is not necessary; if it has users, the users + // are necessarily scheduled after the alias. And if it has no users, it is + // either a dead value or an output, both of which are handled below. + // + // We ignore control dependencies here. 
The reasoning is that the control + // dependencies have already been accounted for in the ordering of the given + // 'instruction_sequence', and should not otherwise artificially extend the + // lifetime of buffers that aren't already connected by a data dependency. + std::vector dead_buffers_to_free; + for (const LogicalBuffer* buffer : buffers_defined_by_instruction) { + if (IgnoreBuffer(buffer)) { + continue; + } + for (const BufferAlias& alias : + points_to_analysis.GetBufferAliases(*buffer)) { + const std::vector& users = + alias.instruction()->users(); + if (!users.empty()) { + live_buffers[buffer].insert(users.begin(), users.end()); + } + } + + // Add a nullptr sentry to ensure entry parameters and output source + // buffers are not freed until the very end. + const bool entry_parameter = + &computation == computation.parent()->entry_computation() && + buffer->instruction()->opcode() == HloOpcode::kParameter; + const bool output = output_source_buffers.count(buffer) > 0; + if (entry_parameter || output) { + live_buffers[buffer].insert(nullptr); + } + + // If the buffer has no users and isn't an entry parameter or output, it + // must be a dead value. + if (live_buffers.count(buffer) == 0) { + dead_buffers_to_free.push_back(buffer); + } + } + + // Update live_buffers to indicate we've visited this instruction; this is + // the inverse of the initialization logic. We erase this instruction from + // all source buffers of all operands of this instruction. Buffers that + // have no instructions left to visit are moved from live_buffers to + // operand_buffers_to_free. + std::vector operand_buffers_to_free; + for (const LogicalBuffer* operand_buffer : + UniqueOperandSourceBuffers(instruction, points_to_analysis)) { + if (IgnoreBuffer(operand_buffer)) { + continue; + } + live_buffers[operand_buffer].erase(instruction); + if (live_buffers[operand_buffer].empty()) { + live_buffers.erase(operand_buffer); + operand_buffers_to_free.push_back(operand_buffer); + } + } + + // Allocate buffers defined by this instruction. This is the latest point + // that we can allocate; right before the buffer is first used. This must + // happen before dead or operand buffers are freed; the instruction reads + // the operand buffers to produce its output. + // + // INVARIANT: Either Alloc or ShareBuffer will be called for each buffer + // that we should assign. + for (const LogicalBuffer* buffer : buffers_defined_by_instruction) { + if (IgnoreBuffer(buffer)) { + continue; + } + + // Check whether the buffer can share with one of its operands; we can + // save memory by sharing the buffer, rather than allocating a new one. + // We can only share with the operand buffer if it is about to be freed; + // we must be the last user of the buffer. + bool shared = false; + for (const LogicalBuffer* operand_buffer : operand_buffers_to_free) { + if (buffer->instruction()->IsUserOf(operand_buffer->instruction()) && + buffer->instruction()->opcode() != HloOpcode::kCopy && + CanShareOperandBufferWithUser( + operand_buffer->instruction(), operand_buffer->index(), + buffer->instruction(), buffer->index(), points_to_analysis)) { + ShareBuffer(buffer, operand_buffer, instruction); + shared = true; + break; + } + } + + if (!shared) { + Alloc(buffer, instruction); + } + } + + // If the whole module is sequential, we can save memory by running the + // heap-simulation for sub-computations inline. E.g. the buffers for the + // condition and body of a kWhile instruction are only live for the duration + // of the instruction itself. 
+ // + // The order that the sub-computations are simulated does not affect + // correctness; since the whole module is sequential, we know that the + // sub-computations will never be run concurrently. + if (module_sequence_ != nullptr) { + if (instruction->opcode() == HloOpcode::kCall || + instruction->opcode() == HloOpcode::kWhile) { + for (const HloComputation* called_computation : + instruction->called_computations()) { + const std::vector& called_sequence = + FindOrDie(*module_sequence_, called_computation); + TF_RETURN_IF_ERROR(RunComputation( + *called_computation, called_sequence, points_to_analysis)); + } + } + + // Other sub-computations (e.g. Map, Reduce, ...) are skipped; they are + // assigned "thread-local" allocations, meaning their buffers are not + // allocated up-front at the beginning of the computation. + } + + // Free buffers that are no longer live. This is the earliest point that we + // can de-allocate; right after the last use of the buffer. + for (const LogicalBuffer* buffer : dead_buffers_to_free) { + Free(buffer, instruction); + } + for (const LogicalBuffer* buffer : operand_buffers_to_free) { + Free(buffer, instruction); + } + } + + // Any remaining live buffers must be entry parameters or output source + // buffers, which had a nullptr sentry added. Free them now. + for (const auto& buffer_pending : live_buffers) { + const LogicalBuffer* buffer = buffer_pending.first; + const FlatSet& pending = buffer_pending.second; + CHECK_EQ(pending.size(), 1) << *buffer; + CHECK(*pending.begin() == nullptr) << *buffer; + Free(buffer, root); + } + + return Status::OK(); +} + +HeapSimulator::HeapSimulator( + std::unique_ptr algorithm, + const LogicalBuffer::SizeFunction& size_fn, + const FlatSet* buffers_to_assign, + const SequentialHloOrdering::HloModuleSequence* module_sequence) + : no_fragmentation_stats_(MakeUnique()), + algorithm_(std::move(algorithm)), + size_fn_(size_fn), + buffers_to_assign_(buffers_to_assign), + module_sequence_(module_sequence) { + debug_trace_.set_whole_module_simulation(module_sequence_ != nullptr); +} + +HeapSimulator::~HeapSimulator() {} + +bool HeapSimulator::IgnoreBuffer(const LogicalBuffer* buffer) const { + // Buffers for constants are ignored, as with BufferAssigner. Also ignore + // buffers that we're not meant to assign. + // + // TODO(b/32248867): For consistency, constants should get allocations. + return buffer->instruction()->opcode() == HloOpcode::kConstant || + (buffers_to_assign_ != nullptr && + buffers_to_assign_->count(buffer) == 0); +} + +// Alloc always calls the underlying heap algorithm. +void HeapSimulator::Alloc(const LogicalBuffer* buffer, + const HloInstruction* instruction) { + CHECK(allocated_buffers_.count(buffer) == 0) + << "Alloc called on allocated buffer: " << *buffer; + CHECK(freed_buffers_.count(buffer) == 0) + << "Alloc called on freed buffer: " << *buffer; + + allocated_buffers_.insert(buffer); + const int64 size = size_fn_(*buffer); + algorithm_->Alloc(buffer, size); + no_fragmentation_stats_->Alloc(buffer, size); + + FillDebugTrace(HeapSimulatorTrace::Event::ALLOC, buffer, instruction, + nullptr); +} + +// Free calls the underlying algorithm for non-shared buffers, and for shared +// buffers whose group liveness has expired. Shared group liveness is tracked +// by maintaining a refcount; the Free call on the last buffer in the group +// causes Free to be called on the underlying algorithm. 
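A sketch of the shared-group refcounting just described: shared buffers point at one group, and only the `Free` of the last member reaches the underlying algorithm, which then frees the canonical buffer. Types below are simplified stand-ins for the HeapSimulator internals:

```c++
#include <map>
#include <memory>

struct Buffer {};  // stand-in for LogicalBuffer

struct SharedGroup {
  const Buffer* canonical = nullptr;
  int refcount = 0;
};

class RefcountedFree {
 public:
  // Records that `b` shares storage with group `g`.
  void Share(const Buffer* b, std::shared_ptr<SharedGroup> g) {
    ++g->refcount;
    groups_[b] = std::move(g);
  }

  // Returns the buffer to actually free, or nullptr while the group is
  // still live.
  const Buffer* Free(const Buffer* b) {
    auto it = groups_.find(b);
    if (it == groups_.end()) return b;             // not shared: free directly
    if (--it->second->refcount > 0) return nullptr;
    return it->second->canonical;  // last member frees the canonical buffer
  }

 private:
  std::map<const Buffer*, std::shared_ptr<SharedGroup>> groups_;
};
```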
+void HeapSimulator::Free(const LogicalBuffer* buffer,
+                         const HloInstruction* instruction) {
+  auto shared_it = shared_buffers_.find(buffer);
+  if (shared_it != shared_buffers_.end()) {
+    std::shared_ptr<SharedGroup> group = shared_it->second;
+    --group->refcount;
+    if (group->refcount > 0) {
+      return;
+    }
+    CHECK_EQ(group->refcount, 0)
+        << "Free caused negative refcount on shared buffer: " << *buffer;
+    buffer = group->canonical;
+  }
+
+  CHECK(allocated_buffers_.count(buffer) > 0)
+      << "Free called on non-allocated buffer: " << *buffer;
+  CHECK(freed_buffers_.count(buffer) == 0)
+      << "Free called on freed buffer: " << *buffer;
+
+  freed_buffers_.insert(buffer);
+  const int64 size = size_fn_(*buffer);
+  algorithm_->Free(buffer, size);
+  no_fragmentation_stats_->Free(buffer, size);
+
+  FillDebugTrace(HeapSimulatorTrace::Event::FREE, buffer, instruction,
+                 nullptr);
+}
+
+// ShareBuffer associates buffers with their SharedGroup in shared_buffers_.
+// The 'buffer' must be a non-allocated, non-freed buffer, just like in calls
+// to Alloc. The 'shared' buffer must be a previously allocated or shared
+// buffer. Both 'buffer' and 'shared' will be associated with the same
+// SharedGroup.
+void HeapSimulator::ShareBuffer(const LogicalBuffer* buffer,
+                                const LogicalBuffer* shared,
+                                const HloInstruction* instruction) {
+  CHECK_LE(size_fn_(*buffer), size_fn_(*shared))
+      << "ShareBuffer oversized buffer: " << *buffer << " shared: " << *shared;
+  CHECK(allocated_buffers_.count(buffer) == 0)
+      << "ShareBuffer called on allocated buffer: " << *buffer;
+  CHECK(freed_buffers_.count(buffer) == 0)
+      << "ShareBuffer called on freed buffer: " << *buffer;
+  CHECK(freed_buffers_.count(shared) == 0)
+      << "ShareBuffer called on freed shared buffer: " << *shared;
+
+  const LogicalBuffer* canonical = nullptr;
+  auto shared_it = shared_buffers_.find(shared);
+  if (shared_it != shared_buffers_.end()) {
+    // The 'shared' buffer already has a group; it might be the canonical, but
+    // also might not be. Just add 'buffer' to the existing group.
+    std::shared_ptr<SharedGroup> group = shared_it->second;
+    canonical = group->canonical;
+    ++group->refcount;
+    shared_buffers_.emplace(buffer, group);
+  } else {
+    // The 'shared' buffer doesn't have a group; it must be the canonical. Add
+    // both 'buffer' and 'shared' to a new group.
+    CHECK(allocated_buffers_.count(shared) > 0)
+        << "ShareBuffer called on non-allocated shared buffer: " << *shared;
+    auto group = std::make_shared<SharedGroup>();
+    canonical = shared;
+    group->canonical = canonical;
+    group->refcount = 2;
+    shared_buffers_.emplace(buffer, group);
+    shared_buffers_.emplace(shared, group);
+  }
+
+  FillDebugTrace(HeapSimulatorTrace::Event::SHARE_WITH, buffer, instruction,
+                 canonical);
+}
+
+HeapSimulator::Result HeapSimulator::Finish() {
+  Result result = algorithm_->Finish();
+
+  // Post-process the result to add chunks for shared buffers. An empty chunk
+  // map means that either no buffers were allocated, or the heap was only
+  // collecting statistics, e.g. NoFragmentationStatsHeap.
+  if (!result.chunk_map.empty()) {
+    for (const auto& share_pair : shared_buffers_) {
+      const LogicalBuffer* buffer = share_pair.first;
+      std::shared_ptr<SharedGroup> group = share_pair.second;
+      if (buffer != group->canonical) {
+        // The canonical must already exist in the chunk_map, since we called
+        // Alloc(canonical) on the underlying algorithm. Add non-canonical
+        // chunks with the same offset as the canonical.
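+        // (For instance, if the canonical buffer was assigned the chunk
+        // {offset=32, size=12} and this shared buffer is 8 bytes, this
+        // buffer is recorded with the chunk {offset=32, size=8}.)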
+        Chunk chunk = FindOrDie(result.chunk_map, group->canonical);
+        chunk.size = size_fn_(*buffer);
+        result.chunk_map.emplace(buffer, chunk);
+      }
+    }
+    // If we were told to assign specific buffers, make sure we've assigned
+    // exactly that many buffers.
+    if (buffers_to_assign_ != nullptr) {
+      CHECK_EQ(buffers_to_assign_->size(), result.chunk_map.size());
+    }
+  }
+
+  // Fragmentation is the difference between the actual and ideal sizes.
+  const Result no_frag_result = no_fragmentation_stats_->Finish();
+  result.fragmentation_size = result.heap_size - no_frag_result.heap_size;
+
+  // Copy the debug trace we collected to the final result.
+  result.debug_trace.Swap(&debug_trace_);
+
+  return result;
+}
+
+void HeapSimulator::FillDebugTrace(HeapSimulatorTrace::Event::Kind kind,
+                                   const LogicalBuffer* buffer,
+                                   const HloInstruction* instruction,
+                                   const LogicalBuffer* share_with_canonical) {
+  HeapSimulatorTrace::Event* event = debug_trace_.add_events();
+  event->set_kind(kind);
+  event->set_buffer_id(buffer->id());
+  event->set_computation_name(instruction->parent()->name());
+  event->set_instruction_name(instruction->name());
+  if (kind == HeapSimulatorTrace::Event::SHARE_WITH) {
+    CHECK(share_with_canonical != nullptr);
+    event->set_share_with_canonical_id(share_with_canonical->id());
+  } else {
+    CHECK(share_with_canonical == nullptr);
+  }
+}
+
+void NoFragmentationStatsHeap::Alloc(const LogicalBuffer* buffer, int64 size) {
+  current_heap_size_ += size;
+  if (current_heap_size_ > max_heap_size_) {
+    max_heap_size_ = current_heap_size_;
+  }
+}
+
+void NoFragmentationStatsHeap::Free(const LogicalBuffer* buffer, int64 size) {
+  current_heap_size_ -= size;
+}
+
+HeapSimulator::Result NoFragmentationStatsHeap::Finish() {
+  // The result.chunk_map is empty, since we only collect stats, and don't
+  // actually compute chunk assignments.
+  Result result;
+  result.heap_size = max_heap_size_;
+  return result;
+}
+
+void DecreasingSizeRunsHeap::Alloc(const LogicalBuffer* buffer, int64 size) {
+  SetMode(kAlloc);
+  run_.emplace_back(Op{buffer, size});
+}
+
+void DecreasingSizeRunsHeap::Free(const LogicalBuffer* buffer, int64 size) {
+  CHECK(mode_ != kInit) << "Free called on empty heap: " << *buffer;
+  SetMode(kFree);
+  run_.emplace_back(Op{buffer, size});
+}
+
+HeapSimulator::Result DecreasingSizeRunsHeap::Finish() {
+  CallAndDrainRun();
+  return algorithm_->Finish();
+}
+
+void DecreasingSizeRunsHeap::SetMode(Mode mode) {
+  if (mode_ != mode) {
+    CallAndDrainRun();
+    mode_ = mode;
+  }
+}
+
+void DecreasingSizeRunsHeap::CallAndDrainRun() {
+  if (mode_ == kInit) {
+    CHECK(run_.empty());
+    return;
+  }
+
+  // Call ops in the run sorted by decreasing size, breaking ties by buffer id.
+  std::sort(run_.begin(), run_.end(), [](const Op& a, const Op& b) {
+    if (a.size != b.size) {
+      return a.size > b.size;
+    }
+    return a.buffer->id() < b.buffer->id();
+  });
+  for (const Op& op : run_) {
+    if (mode_ == kAlloc) {
+      algorithm_->Alloc(op.buffer, op.size);
+    } else {
+      algorithm_->Free(op.buffer, op.size);
+    }
+  }
+  run_.clear();
+}
+
+void LazyBestFitHeap::Alloc(const LogicalBuffer* buffer, int64 size) {
+  // Degenerate case: 0-sized buffers are always allocated at offset 0.
+  if (size == 0) {
+    result_.chunk_map.emplace(buffer, Chunk{0, 0});
+  }
+
+  // First try to allocate from the best-fitting free chunk.
+  auto best_fit_it = free_.lower_bound(Chunk{0, size});
+  while (best_fit_it != free_.end()) {
+    // Account for alignment.
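+    // (Illustration: with alignment_ == 64, a free chunk at offset 10 with
+    // size 80 can hold a 20-byte buffer at offset 64, since
+    // RoundUpToNearest(10, 64) == 64 and 64 + 20 <= 10 + 80.)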
+    const Chunk best = *best_fit_it;
+    const int64 new_offset = RoundUpToNearest(best.offset, alignment_);
+    const int64 new_end = new_offset + size;
+    if (new_end > best.chunk_end()) {
+      // We don't fit after accounting for alignment.
+      ++best_fit_it;
+      continue;
+    }
+    // The buffer is allocated a chunk out of the best-fitting free chunk.
+    free_.erase(best_fit_it);
+    result_.chunk_map.emplace(buffer, Chunk{new_offset, size});
+    // Add remaining portions of the best-fitting free chunk back into free_.
+    AddFreeChunk(best.offset, new_offset - best.offset);
+    AddFreeChunk(new_end, best.chunk_end() - new_end);
+    return;
+  }
+
+  // The buffer doesn't completely fit into any existing free chunk. If the
+  // last free chunk is adjacent to the end of the heap, allocate the buffer
+  // re-using that space, increasing the heap size.
+  //
+  // Allocating the buffer now causes the heap to grow by less than the buffer
+  // size, whereas if we allocated lazily in Free, the heap would grow by
+  // exactly the buffer size. However this is still a greedy heuristic; we
+  // might have ended up with a tighter packing by being lazy here.
+  //
+  // In theory we could also check if we could re-use space from the first free
+  // chunk and grow the heap at the front, and choose whether to grow from the
+  // front or back based on the amount of re-use. But that's more complicated,
+  // and these are all heuristics anyway, so it isn't implemented.
+  for (auto it = free_.begin(); it != free_.end(); ++it) {
+    if (it->chunk_end() == result_.heap_size) {
+      // Account for alignment in the last free chunk.
+      const Chunk last = *it;
+      const int64 new_offset = RoundUpToNearest(last.offset, alignment_);
+      if (new_offset >= last.chunk_end()) {
+        // There's no point in using the last free chunk if alignment causes
+        // us to skip over it anyway.
+        break;
+      }
+      // The buffer is allocated a chunk that includes the last free chunk.
+      free_.erase(it);
+      result_.chunk_map.emplace(buffer, Chunk{new_offset, size});
+      // Add remaining portion of the last free chunk back into free_.
+      AddFreeChunk(last.offset, new_offset - last.offset);
+      // Grow the heap.
+      const int64 new_end = new_offset + size;
+      CHECK_GT(new_end, result_.heap_size);
+      CHECK_LT(new_end, result_.heap_size + size);
+      result_.heap_size = new_end;
+      return;
+    }
+  }
+
+  // Otherwise lazily allocate the buffer in Free.
+  result_.chunk_map.emplace(buffer, Chunk{kLazyAllocOffset, size});
+}
+
+void LazyBestFitHeap::Free(const LogicalBuffer* buffer, int64 size) {
+  auto alloc_it = result_.chunk_map.find(buffer);
+  CHECK(alloc_it != result_.chunk_map.end())
+      << "Free called on non-allocated buffer: " << *buffer;
+  Chunk* alloc = &alloc_it->second;
+  CHECK_EQ(alloc->size, size) << "Free with mismatched sizes: " << *buffer;
+  if (alloc->offset != kLazyAllocOffset) {
+    // The buffer was already allocated in Alloc, do a normal free.
+    AddFreeChunk(alloc->offset, alloc->size);
+  } else {
+    // This buffer is lazily allocated, so we *cannot* allocate out of
+    // existing free chunks, since that might cause interference between
+    // buffers. The buffer is allocated by growing the heap, accounting for
+    // alignment.
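+    // (The buffer has been live ever since its Alloc call, so an existing
+    // free chunk may have been occupied by some other buffer during part of
+    // that lifetime; growing the heap is the only placement known to be
+    // safe.)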
+    alloc->offset = RoundUpToNearest(result_.heap_size, alignment_);
+    const int64 new_end = alloc->chunk_end();
+    AddFreeChunk(result_.heap_size, new_end - result_.heap_size);
+    CHECK_GT(new_end, result_.heap_size);
+    CHECK_GE(new_end, result_.heap_size + alloc->size);
+    result_.heap_size = new_end;
+  }
+}
+
+void LazyBestFitHeap::AddFreeChunk(int64 offset, int64 size) {
+  if (size <= 0) {
+    return;
+  }
+
+  // Coalesce the chunk with adjacent free chunks on either side. We must
+  // remove the free chunks from free_, since it's ordered by size.
+  Chunk chunk{offset, size};
+  for (auto it = free_.begin(); it != free_.end();) {
+    if (it->chunk_end() == chunk.offset || it->offset == chunk.chunk_end()) {
+      chunk.offset = std::min(chunk.offset, it->offset);
+      chunk.size += it->size;
+      it = free_.erase(it);
+    } else {
+      ++it;
+    }
+  }
+
+  // This is the only place we add free chunks to free_. It maintains the
+  // invariant that all free chunks are disjoint and non-adjacent.
+  free_.emplace(chunk);
+}
+
+HeapSimulator::Result LazyBestFitHeap::Finish() {
+  if (!free_.empty()) {
+    // When Finish is called, all calls to Alloc must have had corresponding
+    // calls to Free, which will result in a single free chunk [0, heap_size).
+    CHECK_EQ(free_.size(), 1);
+    CHECK_EQ(free_.begin()->offset, 0);
+    CHECK_EQ(free_.begin()->size, result_.heap_size);
+  }
+  return result_;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/heap_simulator.h b/tensorflow/compiler/xla/service/heap_simulator.h
new file mode 100644
index 00000000000..a03ad2f37cf
--- /dev/null
+++ b/tensorflow/compiler/xla/service/heap_simulator.h
@@ -0,0 +1,284 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HEAP_SIMULATOR_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HEAP_SIMULATOR_H_
+
+#include <memory>
+#include <set>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/compiler/xla/service/hlo.pb.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_ordering.h"
+#include "tensorflow/compiler/xla/service/logical_buffer.h"
+#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
+
+namespace xla {
+
+// Forward declare classes defined below.
+class HeapAlgorithm;
+
+// HeapSimulator assigns buffer offsets by running a simulation of a regular
+// memory heap with Alloc and Free calls. It only works for completely
+// sequential instruction sequences. Unlike regular heaps, we have the
+// advantage that the sequence of Alloc and Free calls is known up-front; we
+// don't need to return the assignment of buffer offsets until the very end.
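+//
+// Typical usage, as a sketch (module_sequence, points_to_analysis and
+// size_fn are assumed to have been computed already):
+//
+//   TF_ASSIGN_OR_RETURN(
+//       HeapSimulator::Result result,
+//       HeapSimulator::Run(
+//           MakeUnique<DecreasingSizeRunsHeap>(
+//               MakeUnique<LazyBestFitHeap>(/*alignment=*/64)),
+//           module, module_sequence, points_to_analysis, size_fn));
+//   // result.chunk_map maps each LogicalBuffer to its {offset, size} chunk.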
+class HeapSimulator {
+ public:
+  // Chunk represents a contiguous piece of memory. Each LogicalBuffer will be
+  // associated with a chunk in the assignment result.
+  struct Chunk {
+    int64 offset;
+    int64 size;
+
+    int64 chunk_end() const { return offset + size; }
+  };
+
+  // Result represents the result of the heap simulation.
+  struct Result {
+    // The assignment of buffers to chunks.
+    tensorflow::gtl::FlatMap<const LogicalBuffer*, Chunk> chunk_map;
+
+    // The total size in bytes of the heap, containing all assigned chunks.
+    int64 heap_size = 0;
+
+    // The total size in bytes of heap fragmentation.
+    int64 fragmentation_size = 0;
+
+    // A trace of heap simulation events.
+    HeapSimulatorTrace debug_trace;
+  };
+
+  // Run the heap simulation with the given algorithm, assuming the given
+  // module_sequence, which must contain a topologically-consistent total
+  // ordering of all instructions within each computation. The result is
+  // invalid if instructions are not run in exactly this sequence.
+  //
+  // Running heap simulation on the whole module tends to save memory,
+  // compared to running on a per-computation basis, since we can re-use
+  // buffer space for called sub-computations.
+  //
+  // If 'buffers_to_assign' is provided, only those buffers are assigned
+  // offsets, otherwise all buffers defined by the instructions are assigned.
+  static StatusOr<Result> Run(
+      std::unique_ptr<HeapAlgorithm> algorithm, const HloModule& module,
+      const SequentialHloOrdering::HloModuleSequence& module_sequence,
+      const TuplePointsToAnalysis& points_to_analysis,
+      const LogicalBuffer::SizeFunction& size_fn,
+      const tensorflow::gtl::FlatSet<const LogicalBuffer*>*
+          buffers_to_assign = nullptr);
+
+  // Same as above, but runs on a single computation. The
+  // 'instruction_sequence' must contain a topologically-consistent total
+  // ordering of all instructions in the computation. The result is invalid
+  // if instructions are not run in exactly this sequence.
+  static StatusOr<Result> Run(
+      std::unique_ptr<HeapAlgorithm> algorithm,
+      const HloComputation& computation,
+      const std::vector<const HloInstruction*>& instruction_sequence,
+      const TuplePointsToAnalysis& points_to_analysis,
+      const LogicalBuffer::SizeFunction& size_fn,
+      const tensorflow::gtl::FlatSet<const LogicalBuffer*>*
+          buffers_to_assign = nullptr);
+
+ private:
+  // If 'module_sequence' is non-null, it is used to find kCall and kWhile
+  // sub-computations, and the heap simulation for those sub-computations will
+  // be run recursively. I.e. the simulation is run over the whole module.
+  HeapSimulator(
+      std::unique_ptr<HeapAlgorithm> algorithm,
+      const LogicalBuffer::SizeFunction& size_fn,
+      const tensorflow::gtl::FlatSet<const LogicalBuffer*>* buffers_to_assign,
+      const SequentialHloOrdering::HloModuleSequence* module_sequence);
+  ~HeapSimulator();
+
+  Status RunComputation(
+      const HloComputation& computation,
+      const std::vector<const HloInstruction*>& instruction_sequence,
+      const TuplePointsToAnalysis& points_to_analysis);
+
+  bool IgnoreBuffer(const LogicalBuffer* buffer) const;
+  void Alloc(const LogicalBuffer* buffer, const HloInstruction* instruction);
+  void Free(const LogicalBuffer* buffer, const HloInstruction* instruction);
+  void ShareBuffer(const LogicalBuffer* buffer, const LogicalBuffer* shared,
+                   const HloInstruction* instruction);
+  Result Finish();
+
+  void FillDebugTrace(HeapSimulatorTrace::Event::Kind kind,
+                      const LogicalBuffer* buffer,
+                      const HloInstruction* instruction,
+                      const LogicalBuffer* share_with_canonical);
+
+  const std::unique_ptr<HeapAlgorithm> no_fragmentation_stats_;
+  const std::unique_ptr<HeapAlgorithm> algorithm_;
+  const LogicalBuffer::SizeFunction size_fn_;
+  const tensorflow::gtl::FlatSet<const LogicalBuffer*>* buffers_to_assign_;
+  const SequentialHloOrdering::HloModuleSequence* module_sequence_;
+
+  // In addition to Alloc and Free, the heap simulator exposes a concept of
+  // buffer sharing. When ShareBuffer is called, instead of allocating new
+  // space for the buffer, it associates the buffer with a previously
+  // allocated (or shared) buffer. Each group of mutually-shared buffers
+  // points to a single SharedGroup instance, which is a shared control block.
+  //
+  // This forced buffer sharing is hidden from the underlying heap algorithm,
+  // which only sees a regular Alloc call on the canonical buffer. The
+  // corresponding Free call is delayed until the liveness of all shared
+  // buffers in the group has expired, which is tracked via the refcount. The
+  // results are post-processed in Finish to add chunks for shared buffers.
+  //
+  // The shared_buffers_ map associates each shared buffer (including the
+  // canonical) to its SharedGroup control block.
+  struct SharedGroup {
+    const LogicalBuffer* canonical = nullptr;
+    int64 refcount = 0;
+  };
+  tensorflow::gtl::FlatMap<const LogicalBuffer*, std::shared_ptr<SharedGroup>>
+      shared_buffers_;
+
+  // Hold some sets for error-checking the sequence of Alloc and Free calls.
+  tensorflow::gtl::FlatSet<const LogicalBuffer*> allocated_buffers_;
+  tensorflow::gtl::FlatSet<const LogicalBuffer*> freed_buffers_;
+
+  // Debugging information filled in while the heap simulator runs.
+  HeapSimulatorTrace debug_trace_;
+};
+
+// Abstract base class describing a heap simulation algorithm that assigns
+// offsets to buffers. A sequence of Alloc / Free calls will be made, with the
+// same semantics as a regular memory heap. Finish will be called at the end
+// to collect the simulation results.
+class HeapAlgorithm {
+ public:
+  using Chunk = HeapSimulator::Chunk;
+  using Result = HeapSimulator::Result;
+
+  virtual ~HeapAlgorithm() = default;
+
+  // Alloc allocates a buffer of 'size' bytes.
+  virtual void Alloc(const LogicalBuffer* buffer, int64 size) = 0;
+
+  // Free de-allocates a previously allocated buffer.
+  virtual void Free(const LogicalBuffer* buffer, int64 size) = 0;
+
+  // Finish collects the buffer offset assignment results. Finish may only be
+  // called once, after all Alloc and Free calls.
+  virtual Result Finish() = 0;
+};
+
+// NoFragmentationStatsHeap computes the heap size assuming no fragmentation;
+// this is the absolute minimum size for a given instruction sequence.
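+// For example, given the call sequence Alloc(A, 10), Alloc(B, 20),
+// Free(B, 20), Alloc(C, 30), the peak simultaneous usage, and hence the
+// reported heap_size, is A + C = 40 bytes.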
The +// result.chunk_map returned in Finish is always empty, since we only collect +// stats, and don't actually compute chunk assignments. +class NoFragmentationStatsHeap : public HeapAlgorithm { + public: + NoFragmentationStatsHeap() = default; + ~NoFragmentationStatsHeap() override = default; + + void Alloc(const LogicalBuffer* buffer, int64 size) override; + void Free(const LogicalBuffer* buffer, int64 size) override; + Result Finish() override; + + private: + int64 current_heap_size_ = 0; + int64 max_heap_size_ = 0; +}; + +// DecreasingSizeRunsHeap collects runs of Alloc and Free calls, sorts them by +// decreasing size, and delegates the actual calls to another heap algorithm. +// This greedy heuristic tends to reduce fragmentation for all algorithms. +class DecreasingSizeRunsHeap : public HeapAlgorithm { + public: + DecreasingSizeRunsHeap(std::unique_ptr algorithm) + : algorithm_(std::move(algorithm)) {} + ~DecreasingSizeRunsHeap() override {} + + void Alloc(const LogicalBuffer* buffer, int64 size) override; + void Free(const LogicalBuffer* buffer, int64 size) override; + Result Finish() override; + + private: + // A single Alloc or Free operation that we've buffered in run_. + struct Op { + const LogicalBuffer* buffer; + int64 size; + }; + + // Current collection mode; kInit means no ops have been collected yet. + enum Mode { kInit, kAlloc, kFree }; + + void SetMode(Mode mode); + void CallAndDrainRun(); + + const std::unique_ptr algorithm_; + std::vector run_; + Mode mode_ = kInit; +}; + +// LazyBestFitHeap is a variant of the traditional best-fit heap. This is a +// greedy heuristic, based on the idea that delaying offset assignment helps +// reduce fragmentation. Here's an example of a "bad" offset assignment, where +// a tiny buffer A prevents adjacent free chunks from being coalesced: +// BAD: | free |A| free | +// If we could have delayed the assignment of A, we might have ended up with: +// GOOD: | free |A| +// +// In general it's actually hard to say whether GOOD is better than BAD; the +// heuristic we use is we try to leave large contiguous chunks free, and we try +// to avoid growing the overall heap size unless necessary. +// +// Just like regular best-fit, in Alloc we look for the smallest free chunk that +// fits the requested size. Unlike regular best-fit, we postpone offset +// assignment for buffers that cannot re-use existing free chunks (and force us +// to grow the heap); these buffers are "lazily" assigned offsets in Free. +class LazyBestFitHeap : public HeapAlgorithm { + public: + LazyBestFitHeap(int64 alignment) : alignment_(alignment) {} + ~LazyBestFitHeap() override {} + + void Alloc(const LogicalBuffer* buffer, int64 size) override; + void Free(const LogicalBuffer* buffer, int64 size) override; + Result Finish() override; + + private: + // Sentry value used to indicate a chunk that wasn't assigned an offset in + // Alloc, and will instead be assigned an offset in Free. + enum { kLazyAllocOffset = -1 }; + + struct OrderChunkByIncreasingSize { + bool operator()(const Chunk& a, const Chunk& b) { + if (a.size != b.size) return a.size < b.size; + return a.offset < b.offset; + } + }; + + void AddFreeChunk(int64 offset, int64 size); + + const int64 alignment_; + Result result_; + + // Maintain the set of free chunks, ordered by increasing size. 
+  std::set<Chunk, OrderChunkByIncreasingSize> free_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HEAP_SIMULATOR_H_
diff --git a/tensorflow/compiler/xla/service/heap_simulator_test.cc b/tensorflow/compiler/xla/service/heap_simulator_test.cc
new file mode 100644
index 00000000000..60a0768a86b
--- /dev/null
+++ b/tensorflow/compiler/xla/service/heap_simulator_test.cc
@@ -0,0 +1,849 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/heap_simulator.h"
+
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_ordering.h"
+#include "tensorflow/compiler/xla/service/logical_buffer.h"
+#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+
+namespace xla {
+namespace {
+
+const char kAlloc[] = "Alloc";
+const char kFree[] = "Free";
+const char kFinish[] = "Finish";
+
+// CallSequence records a sequence of Alloc/Free/Finish calls.
+using CallSequence = std::vector<std::pair<string, const LogicalBuffer*>>;
+
+// HeapCallRecorder is a dummy heap algorithm that simply records its calls.
+class HeapCallRecorder : public HeapAlgorithm {
+ public:
+  explicit HeapCallRecorder(CallSequence* calls) : calls_(calls) {}
+  ~HeapCallRecorder() override {}
+
+  void Alloc(const LogicalBuffer* buffer, int64 size) override {
+    calls_->emplace_back(kAlloc, buffer);
+    // Instead of assigning a real offset, we set the cardinality of the Alloc
+    // call. This isn't a valid assignment, but allows us to easily test for
+    // buffer sharing.
+    const int64 offset = result_.chunk_map.size();
+    result_.chunk_map.emplace(buffer, Chunk{offset, size});
+  }
+  void Free(const LogicalBuffer* buffer, int64 size) override {
+    calls_->emplace_back(kFree, buffer);
+  }
+  Result Finish() override {
+    calls_->emplace_back(kFinish, nullptr);
+    return result_;
+  }
+
+ private:
+  CallSequence* calls_;
+  Result result_;
+};
+
+// HeapSimulatorTracker runs the heap simulator, recording the sequence of
+// calls made to the underlying heap algorithm. Tests compare the actual call
+// sequence against an expected sequence.
+class HeapSimulatorTracker {
+ public:
+  // Constructor for testing a single entry computation.
+  HeapSimulatorTracker(
+      const string& name, std::unique_ptr<HloComputation> computation,
+      const std::vector<const HloInstruction*>& instruction_sequence) {
+    module_ = MakeUnique<HloModule>(name);
+    module_->AddEntryComputation(std::move(computation));
+    points_to_analysis_ =
+        TuplePointsToAnalysis::Run(module_.get()).ConsumeValueOrDie();
+    // Since we're only tracking the sequence of Alloc/Free calls, the actual
+    // size of the buffers doesn't matter, so we always return 0. We rely on
+    // the secondary sorting criteria of DecreasingSizeRunsHeap to sort calls
+    // by buffer id, for determinism in the tests.
+    auto zero_size = [](const LogicalBuffer& buffer) { return 0; };
+    auto algorithm = MakeUnique<DecreasingSizeRunsHeap>(
+        MakeUnique<HeapCallRecorder>(&actual_calls_));
+    result_ = HeapSimulator::Run(
+                  std::move(algorithm), *module_->entry_computation(),
+                  instruction_sequence, *points_to_analysis_, zero_size)
+                  .ConsumeValueOrDie();
+  }
+
+  explicit HeapSimulatorTracker(const string& name) {
+    module_ = MakeUnique<HloModule>(name);
+  }
+
+  // Similar to the single entry computation constructor above, but runs the
+  // simulation over the entire module.
+  void RunWholeModule(
+      const std::vector<const HloInstruction*>& full_module_sequence) {
+    points_to_analysis_ =
+        TuplePointsToAnalysis::Run(module_.get()).ConsumeValueOrDie();
+
+    // Construct the module sequence grouped by computation.
+    SequentialHloOrdering::HloModuleSequence module_sequence;
+    tensorflow::gtl::FlatMap<const HloInstruction*, int> reverse_position;
+    for (int i = 0; i < full_module_sequence.size(); ++i) {
+      const HloInstruction* instruction = full_module_sequence[i];
+      module_sequence[instruction->parent()].push_back(instruction);
+      reverse_position[instruction] = full_module_sequence.size() - i;
+    }
+
+    // Hack the size_fn so that it returns a decreasing value as we step
+    // through the sequence. This lets us ensure the Alloc calls are in the
+    // sequence order. The Free calls are sorted by LogicalBuffer.id, which is
+    // at least deterministic.
+    auto size_fn = [&reverse_position](const LogicalBuffer& buffer) {
+      return reverse_position[buffer.instruction()];
+    };
+    auto algorithm = MakeUnique<DecreasingSizeRunsHeap>(
+        MakeUnique<HeapCallRecorder>(&actual_calls_));
+    result_ = HeapSimulator::Run(std::move(algorithm), *module_,
+                                 module_sequence, *points_to_analysis_,
+                                 size_fn)
+                  .ConsumeValueOrDie();
+  }
+
+  HloModule* module() { return module_.get(); }
+
+  // Returns the buffer defined at the given instruction and index.
+  const LogicalBuffer* BufferAt(const HloInstruction* instruction,
+                                const ShapeIndex& index) const {
+    return points_to_analysis_->GetBufferDefinedAt(instruction, index)
+        .ConsumeValueOrDie();
+  }
+
+  // Ensures the expected sequence of Alloc/Free/Finish calls was performed.
+  void ExpectCallSequence(const CallSequence& expected) const {
+    EXPECT_EQ(expected, actual_calls_);
+  }
+
+  // Ensures the buffers defined by the respective (instruction,index) pairs
+  // are shared, relying on the unique offsets assigned in
+  // HeapCallRecorder::Alloc.
+  void ExpectSharedBuffers(const HloInstruction* instruction_a,
+                           const ShapeIndex& index_a,
+                           const HloInstruction* instruction_b,
+                           const ShapeIndex& index_b) {
+    const LogicalBuffer* a = BufferAt(instruction_a, index_a);
+    const LogicalBuffer* b = BufferAt(instruction_b, index_b);
+    EXPECT_EQ(result_.chunk_map[a].offset, result_.chunk_map[b].offset)
+        << *a << ", " << *b;
+  }
+
+ private:
+  std::unique_ptr<HloModule> module_;
+  std::unique_ptr<TuplePointsToAnalysis> points_to_analysis_;
+  CallSequence actual_calls_;
+  HeapSimulator::Result result_;
+};
+
+class HeapSimulatorTest : public HloTestBase {
+ protected:
+  HeapSimulatorTest() {}
+  ~HeapSimulatorTest() override {}
+
+  // Shapes for use in the examples.
+  Shape f32scalar_ = ShapeUtil::MakeShape(xla::F32, {});
+  Shape f32vec4_ = ShapeUtil::MakeShape(F32, {4});
+};
+
+TEST_F(HeapSimulatorTest, ScalarConstant) {
+  auto builder = HloComputation::Builder(TestName());
+  auto const0 = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+
+  // Constants aren't assigned. See b/32248867
+  HeapSimulatorTracker tracker(TestName(), builder.Build(), {const0});
+  tracker.ExpectCallSequence({{kFinish, nullptr}});
+}
+
+TEST_F(HeapSimulatorTest, OneParam) {
+  auto builder = HloComputation::Builder(TestName());
+  auto param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, f32scalar_, "param0"));
+
+  // A single parameter which is also the output.
+  HeapSimulatorTracker tracker(TestName(), builder.Build(), {param0});
+  tracker.ExpectCallSequence({
+      {kAlloc, tracker.BufferAt(param0, {})},
+      {kFree, tracker.BufferAt(param0, {})},
+      {kFinish, nullptr},
+  });
+}
+
+TEST_F(HeapSimulatorTest, Multiply) {
+  auto builder = HloComputation::Builder(TestName());
+  auto paramA = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, f32scalar_, "paramA"));
+  auto paramX = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, f32vec4_, "paramX"));
+  auto mul = builder.AddInstruction(HloInstruction::CreateBinary(
+      f32vec4_, HloOpcode::kMultiply, paramA, paramX));
+
+  // We must keep all parameters and outputs.
+  HeapSimulatorTracker tracker(TestName(), builder.Build(),
+                               {paramA, paramX, mul});
+  tracker.ExpectCallSequence({
+      {kAlloc, tracker.BufferAt(paramA, {})},
+      {kAlloc, tracker.BufferAt(paramX, {})},
+      {kAlloc, tracker.BufferAt(mul, {})},
+      // All params and outputs are freed at the end.
+      {kFree, tracker.BufferAt(paramA, {})},
+      {kFree, tracker.BufferAt(paramX, {})},
+      {kFree, tracker.BufferAt(mul, {})},
+      {kFinish, nullptr},
+  });
+}
+
+TEST_F(HeapSimulatorTest, MultiplyAdd) {
+  auto builder = HloComputation::Builder(TestName());
+  auto paramA = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, f32scalar_, "paramA"));
+  auto paramX = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, f32vec4_, "paramX"));
+  auto paramY = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, f32vec4_, "paramY"));
+  auto mul = builder.AddInstruction(HloInstruction::CreateBinary(
+      f32vec4_, HloOpcode::kMultiply, paramA, paramX));
+  auto add = builder.AddInstruction(
+      HloInstruction::CreateBinary(f32vec4_, HloOpcode::kAdd, mul, paramY));
+
+  // The buffer for add is the output, and it's shared with the buffer for
+  // mul.
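+  // (add is elementwise and the last user of mul, so
+  // CanShareOperandBufferWithUser lets it re-use mul's space.)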
+  HeapSimulatorTracker tracker(TestName(), builder.Build(),
+                               {paramA, paramX, mul, paramY, add});
+  tracker.ExpectCallSequence({
+      {kAlloc, tracker.BufferAt(paramA, {})},
+      {kAlloc, tracker.BufferAt(paramX, {})},
+      {kAlloc, tracker.BufferAt(mul, {})},
+      {kAlloc, tracker.BufferAt(paramY, {})},
+      // All params and outputs are freed at the end.
+      {kFree, tracker.BufferAt(paramA, {})},
+      {kFree, tracker.BufferAt(paramX, {})},
+      {kFree, tracker.BufferAt(mul, {})},
+      {kFree, tracker.BufferAt(paramY, {})},
+      {kFinish, nullptr},
+  });
+  tracker.ExpectSharedBuffers(add, {}, mul, {});
+}
+
+TEST_F(HeapSimulatorTest, MultiplyDot) {
+  auto builder = HloComputation::Builder(TestName());
+  auto paramA = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, f32scalar_, "paramA"));
+  auto paramX = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, f32vec4_, "paramX"));
+  auto paramY = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, f32scalar_, "paramY"));
+  auto mul = builder.AddInstruction(HloInstruction::CreateBinary(
+      f32vec4_, HloOpcode::kMultiply, paramA, paramX));
+  auto dot = builder.AddInstruction(
+      HloInstruction::CreateBinary(f32vec4_, HloOpcode::kDot, mul, paramY));
+
+  // The buffer for dot is the output, and it cannot be shared with the buffer
+  // for mul, since dot isn't elementwise.
+  HeapSimulatorTracker tracker(TestName(), builder.Build(),
+                               {paramA, paramX, mul, paramY, dot});
+  tracker.ExpectCallSequence({
+      {kAlloc, tracker.BufferAt(paramA, {})},
+      {kAlloc, tracker.BufferAt(paramX, {})},
+      {kAlloc, tracker.BufferAt(mul, {})},
+      {kAlloc, tracker.BufferAt(paramY, {})},
+      {kAlloc, tracker.BufferAt(dot, {})},
+      // All params and outputs are freed at the end.
+      {kFree, tracker.BufferAt(paramA, {})},
+      {kFree, tracker.BufferAt(paramX, {})},
+      {kFree, tracker.BufferAt(mul, {})},
+      {kFree, tracker.BufferAt(paramY, {})},
+      {kFree, tracker.BufferAt(dot, {})},
+      {kFinish, nullptr},
+  });
+}
+
+TEST_F(HeapSimulatorTest, MultiplyDotAdd) {
+  auto builder = HloComputation::Builder(TestName());
+  auto paramA = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, f32scalar_, "paramA"));
+  auto paramX = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, f32vec4_, "paramX"));
+  auto paramY = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, f32scalar_, "paramY"));
+  auto mul = builder.AddInstruction(HloInstruction::CreateBinary(
+      f32vec4_, HloOpcode::kMultiply, paramA, paramX));
+  auto dot = builder.AddInstruction(
+      HloInstruction::CreateBinary(f32vec4_, HloOpcode::kDot, mul, paramY));
+  auto add = builder.AddInstruction(
+      HloInstruction::CreateBinary(f32vec4_, HloOpcode::kAdd, dot, paramA));
+
+  // The buffer for add is the output, and it's shared with the buffer for
+  // dot.
+  HeapSimulatorTracker tracker(TestName(), builder.Build(),
+                               {paramA, paramX, mul, paramY, dot, add});
+  tracker.ExpectCallSequence({
+      {kAlloc, tracker.BufferAt(paramA, {})},
+      {kAlloc, tracker.BufferAt(paramX, {})},
+      {kAlloc, tracker.BufferAt(mul, {})},
+      {kAlloc, tracker.BufferAt(paramY, {})},
+      {kAlloc, tracker.BufferAt(dot, {})},
+      // All params and outputs are freed at the end.
+      {kFree, tracker.BufferAt(paramA, {})},
+      {kFree, tracker.BufferAt(paramX, {})},
+      {kFree, tracker.BufferAt(mul, {})},
+      {kFree, tracker.BufferAt(paramY, {})},
+      {kFree, tracker.BufferAt(dot, {})},
+      {kFinish, nullptr},
+  });
+  tracker.ExpectSharedBuffers(add, {}, dot, {});
+}
+
+TEST_F(HeapSimulatorTest, MultiplyDotDot) {
+  auto builder = HloComputation::Builder(TestName());
+  auto paramA = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, f32scalar_, "paramA"));
+  auto paramX = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, f32vec4_, "paramX"));
+  auto paramY = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, f32scalar_, "paramY"));
+  auto mul = builder.AddInstruction(HloInstruction::CreateBinary(
+      f32vec4_, HloOpcode::kMultiply, paramA, paramX));
+  auto dot0 = builder.AddInstruction(
+      HloInstruction::CreateBinary(f32vec4_, HloOpcode::kDot, mul, paramY));
+  auto dot1 = builder.AddInstruction(
+      HloInstruction::CreateBinary(f32vec4_, HloOpcode::kDot, dot0, paramY));
+
+  // The buffer for dot1 is the output. No buffers can be shared. The buffer
+  // for mul is freed before the end, since it's no longer used after dot0
+  // finishes.
+  HeapSimulatorTracker tracker(TestName(), builder.Build(),
+                               {paramA, paramX, mul, paramY, dot0, dot1});
+  tracker.ExpectCallSequence({
+      {kAlloc, tracker.BufferAt(paramA, {})},
+      {kAlloc, tracker.BufferAt(paramX, {})},
+      {kAlloc, tracker.BufferAt(mul, {})},
+      {kAlloc, tracker.BufferAt(paramY, {})},
+      {kAlloc, tracker.BufferAt(dot0, {})},
+      {kFree, tracker.BufferAt(mul, {})},  // mul no longer used
+      {kAlloc, tracker.BufferAt(dot1, {})},
+      // All params and outputs are freed at the end.
+      {kFree, tracker.BufferAt(paramA, {})},
+      {kFree, tracker.BufferAt(paramX, {})},
+      {kFree, tracker.BufferAt(paramY, {})},
+      {kFree, tracker.BufferAt(dot0, {})},
+      {kFree, tracker.BufferAt(dot1, {})},
+      {kFinish, nullptr},
+  });
+}
+
+TEST_F(HeapSimulatorTest, MultiplyDotDotTuple) {
+  auto builder = HloComputation::Builder(TestName());
+  auto paramA = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, f32scalar_, "paramA"));
+  auto paramX = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, f32vec4_, "paramX"));
+  auto paramY = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, f32scalar_, "paramY"));
+  auto mul = builder.AddInstruction(HloInstruction::CreateBinary(
+      f32vec4_, HloOpcode::kMultiply, paramA, paramX));
+  auto dot0 = builder.AddInstruction(
+      HloInstruction::CreateBinary(f32vec4_, HloOpcode::kDot, mul, paramY));
+  auto dot1 = builder.AddInstruction(
+      HloInstruction::CreateBinary(f32vec4_, HloOpcode::kDot, dot0, paramY));
+  auto tuple =
+      builder.AddInstruction(HloInstruction::CreateTuple({dot0, dot1}));
+
+  // The buffers for dot0, dot1 and tuple are the output. No buffers can be
+  // shared. The buffer for mul is freed before the end, since it's no longer
+  // used after dot0 finishes.
+  HeapSimulatorTracker tracker(
+      TestName(), builder.Build(),
+      {paramA, paramX, mul, paramY, dot0, dot1, tuple});
+  tracker.ExpectCallSequence({
+      {kAlloc, tracker.BufferAt(paramA, {})},
+      {kAlloc, tracker.BufferAt(paramX, {})},
+      {kAlloc, tracker.BufferAt(mul, {})},
+      {kAlloc, tracker.BufferAt(paramY, {})},
+      {kAlloc, tracker.BufferAt(dot0, {})},
+      {kFree, tracker.BufferAt(mul, {})},  // mul no longer used
+      {kAlloc, tracker.BufferAt(dot1, {})},
+      {kAlloc, tracker.BufferAt(tuple, {})},
+      // All params and outputs are freed at the end.
+      {kFree, tracker.BufferAt(paramA, {})},
+      {kFree, tracker.BufferAt(paramX, {})},
+      {kFree, tracker.BufferAt(paramY, {})},
+      {kFree, tracker.BufferAt(dot0, {})},
+      {kFree, tracker.BufferAt(dot1, {})},
+      {kFree, tracker.BufferAt(tuple, {})},
+      {kFinish, nullptr},
+  });
+}
+
+TEST_F(HeapSimulatorTest, WholeModule) {
+  HeapSimulatorTracker tracker(TestName());
+
+  const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {});
+  const Shape tuple_shape =
+      ShapeUtil::MakeTupleShape({scalar_shape, scalar_shape});
+
+  auto cond_builder = HloComputation::Builder("WhileCond");
+  HloInstruction* cond_param = cond_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "cond_param"));
+  HloInstruction* cond_iter = cond_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape, cond_param, 0));
+  HloInstruction* cond_data = cond_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape, cond_param, 1));
+  HloInstruction* cond_lt = cond_builder.AddInstruction(
+      HloInstruction::CreateBinary(ShapeUtil::MakeShape(PRED, {}),
+                                   HloOpcode::kLt, cond_iter, cond_data));
+  HloComputation* cond_computation =
+      tracker.module()->AddEmbeddedComputation(cond_builder.Build());
+
+  auto body_builder = HloComputation::Builder("WhileBody");
+  HloInstruction* body_param = body_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "body_param"));
+  HloComputation* body_computation =
+      tracker.module()->AddEmbeddedComputation(body_builder.Build());
+
+  auto builder = HloComputation::Builder(TestName());
+  HloInstruction* param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "param"));
+  HloInstruction* while_op =
+      builder.AddInstruction(HloInstruction::CreateWhile(
+          tuple_shape, cond_computation, body_computation, param));
+  tracker.module()->AddEntryComputation(builder.Build());
+
+  tracker.RunWholeModule({param, while_op, body_param, cond_param, cond_iter,
+                          cond_data, cond_lt});
+  tracker.ExpectCallSequence({
+      // The entry computation param and while_op are allocated first.
+      {kAlloc, tracker.BufferAt(param, {})},
+      {kAlloc, tracker.BufferAt(param, {0})},
+      {kAlloc, tracker.BufferAt(param, {1})},
+      {kAlloc, tracker.BufferAt(while_op, {})},
+      {kAlloc, tracker.BufferAt(while_op, {0})},
+      {kAlloc, tracker.BufferAt(while_op, {1})},
+
+      // Now the while body param is allocated and freed.
+      {kAlloc, tracker.BufferAt(body_param, {})},
+      {kAlloc, tracker.BufferAt(body_param, {0})},
+      {kAlloc, tracker.BufferAt(body_param, {1})},
+      {kFree, tracker.BufferAt(body_param, {})},
+      {kFree, tracker.BufferAt(body_param, {0})},
+      {kFree, tracker.BufferAt(body_param, {1})},
+
+      // Now the while cond param is allocated. The GTE instructions just
+      // alias the param elements, so the param tuple can immediately be
+      // freed.
+      {kAlloc, tracker.BufferAt(cond_param, {})},
+      {kAlloc, tracker.BufferAt(cond_param, {0})},
+      {kAlloc, tracker.BufferAt(cond_param, {1})},
+      {kFree, tracker.BufferAt(cond_param, {})},
+
+      // Now the final cond less-than buffer is allocated.
+      {kAlloc, tracker.BufferAt(cond_lt, {})},
+
+      // The order of the remaining Free calls is based on the
+      // LogicalBuffer.id, which is deterministic, but not obvious.
+      {kFree, tracker.BufferAt(param, {})},
+      {kFree, tracker.BufferAt(param, {0})},
+      {kFree, tracker.BufferAt(param, {1})},
+
+      {kFree, tracker.BufferAt(while_op, {})},
+      {kFree, tracker.BufferAt(while_op, {0})},
+      {kFree, tracker.BufferAt(while_op, {1})},
+
+      {kFree, tracker.BufferAt(cond_param, {0})},
+      {kFree, tracker.BufferAt(cond_param, {1})},
+      {kFree, tracker.BufferAt(cond_lt, {})},
+
+      {kFinish, nullptr},
+  });
+}
+
+// Base class for heap algorithm tests.
+class HeapAlgorithmTestBase : public ::testing::Test {
+ protected:
+  HeapAlgorithmTestBase() {
+    buffer_a_ = DummyLogicalBuffer();
+    buffer_b_ = DummyLogicalBuffer();
+    buffer_c_ = DummyLogicalBuffer();
+    buffer_d_ = DummyLogicalBuffer();
+    buffer_e_ = DummyLogicalBuffer();
+    buffer_f_ = DummyLogicalBuffer();
+    buffer_g_ = DummyLogicalBuffer();
+    buffer_h_ = DummyLogicalBuffer();
+    buffer_i_ = DummyLogicalBuffer();
+  }
+  ~HeapAlgorithmTestBase() override {}
+
+  const LogicalBuffer* buffer_a_;
+  const LogicalBuffer* buffer_b_;
+  const LogicalBuffer* buffer_c_;
+  const LogicalBuffer* buffer_d_;
+  const LogicalBuffer* buffer_e_;
+  const LogicalBuffer* buffer_f_;
+  const LogicalBuffer* buffer_g_;
+  const LogicalBuffer* buffer_h_;
+  const LogicalBuffer* buffer_i_;
+
+ private:
+  // Create a dummy LogicalBuffer to pass to the heap algorithm. Since the
+  // algorithms only use the buffer as a handle, we don't need to fill in much
+  // other than the id and color.
+  const LogicalBuffer* DummyLogicalBuffer() {
+    const LogicalBuffer::Id id = buffers_.size();
+    buffers_.emplace_back(MakeUnique<LogicalBuffer>(nullptr, ShapeIndex{}, id,
+                                                    LogicalBuffer::Color(0)));
+    return buffers_.back().get();
+  }
+
+  std::vector<std::unique_ptr<LogicalBuffer>> buffers_;
+};
+
+class NoFragmentationStatsHeapTest : public HeapAlgorithmTestBase {};
+
+TEST_F(NoFragmentationStatsHeapTest, Empty) {
+  NoFragmentationStatsHeap heap;
+  EXPECT_EQ(0, heap.Finish().heap_size);
+}
+
+TEST_F(NoFragmentationStatsHeapTest, Simple) {
+  NoFragmentationStatsHeap heap;
+  heap.Alloc(buffer_a_, 10);
+  heap.Alloc(buffer_b_, 20);
+  heap.Alloc(buffer_c_, 30);
+  heap.Alloc(buffer_d_, 30);
+  heap.Free(buffer_a_, 10);
+  heap.Free(buffer_b_, 20);
+  heap.Free(buffer_c_, 30);
+  heap.Free(buffer_d_, 30);
+  EXPECT_EQ(90, heap.Finish().heap_size);
+}
+
+TEST_F(NoFragmentationStatsHeapTest, Mixed) {
+  NoFragmentationStatsHeap heap;
+  heap.Alloc(buffer_a_, 10);  // max: A
+
+  heap.Alloc(buffer_b_, 20);  // max: A+B
+  heap.Free(buffer_b_, 20);
+
+  heap.Alloc(buffer_c_, 30);  // max: A+C
+  heap.Free(buffer_c_, 30);
+
+  heap.Alloc(buffer_d_, 5);  // max: A+C
+  heap.Free(buffer_d_, 5);
+
+  heap.Free(buffer_a_, 10);
+  EXPECT_EQ(40, heap.Finish().heap_size);
+}
+
+class DecreasingSizeRunsHeapTest : public HeapAlgorithmTestBase {};
+
+TEST_F(DecreasingSizeRunsHeapTest, Empty) {
+  CallSequence call_sequence;
+  DecreasingSizeRunsHeap heap(MakeUnique<HeapCallRecorder>(&call_sequence));
+  heap.Finish();
+  EXPECT_EQ(call_sequence, CallSequence({
+                               {kFinish, nullptr},
+                           }));
+}
+
+TEST_F(DecreasingSizeRunsHeapTest, Simple) {
+  CallSequence call_sequence;
+  DecreasingSizeRunsHeap heap(MakeUnique<HeapCallRecorder>(&call_sequence));
+  heap.Alloc(buffer_a_, 10);
+  heap.Alloc(buffer_b_, 20);
+  heap.Alloc(buffer_c_, 30);
+  heap.Alloc(buffer_d_, 30);
+  heap.Free(buffer_a_, 10);
+  heap.Free(buffer_b_, 20);
+  heap.Free(buffer_c_, 30);
+  heap.Free(buffer_d_, 30);
+  heap.Finish();
+  // Runs of Allocs and Frees are sorted by decreasing size, with buffer id
+  // tiebreaker.
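+  // Here the sizes are a=10, b=20, c=30, d=30, so both the Alloc run and the
+  // Free run come out in the order c, d, b, a.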
+  EXPECT_EQ(call_sequence, CallSequence({
+                               {kAlloc, buffer_c_},
+                               {kAlloc, buffer_d_},
+                               {kAlloc, buffer_b_},
+                               {kAlloc, buffer_a_},
+                               {kFree, buffer_c_},
+                               {kFree, buffer_d_},
+                               {kFree, buffer_b_},
+                               {kFree, buffer_a_},
+                               {kFinish, nullptr},
+                           }));
+}
+
+TEST_F(DecreasingSizeRunsHeapTest, Mixed) {
+  CallSequence call_sequence;
+  DecreasingSizeRunsHeap heap(MakeUnique<HeapCallRecorder>(&call_sequence));
+  heap.Alloc(buffer_a_, 10);
+  heap.Alloc(buffer_b_, 20);
+  heap.Free(buffer_b_, 20);
+
+  heap.Alloc(buffer_c_, 30);
+  heap.Free(buffer_c_, 30);
+
+  heap.Alloc(buffer_d_, 5);
+  heap.Free(buffer_d_, 5);
+  heap.Free(buffer_a_, 10);
+  heap.Finish();
+  // Runs of Allocs and Frees are sorted by decreasing size.
+  EXPECT_EQ(call_sequence, CallSequence({
+                               {kAlloc, buffer_b_},
+                               {kAlloc, buffer_a_},
+                               {kFree, buffer_b_},
+
+                               {kAlloc, buffer_c_},
+                               {kFree, buffer_c_},
+
+                               {kAlloc, buffer_d_},
+                               {kFree, buffer_a_},
+                               {kFree, buffer_d_},
+                               {kFinish, nullptr},
+                           }));
+}
+
+class LazyBestFitHeapTest : public HeapAlgorithmTestBase {};
+
+TEST_F(LazyBestFitHeapTest, Empty) {
+  LazyBestFitHeap heap(/*alignment=*/1);
+  const HeapSimulator::Result result = heap.Finish();
+  EXPECT_EQ(0, result.heap_size);
+  EXPECT_EQ(0, result.chunk_map.size());
+}
+
+TEST_F(LazyBestFitHeapTest, Simple) {
+  LazyBestFitHeap heap(/*alignment=*/1);
+  heap.Alloc(buffer_a_, 10);
+  heap.Alloc(buffer_b_, 20);
+  heap.Alloc(buffer_c_, 30);
+  heap.Alloc(buffer_d_, 30);
+  heap.Free(buffer_a_, 10);
+  heap.Free(buffer_b_, 20);
+  heap.Free(buffer_c_, 30);
+  heap.Free(buffer_d_, 30);
+
+  const HeapSimulator::Result result = heap.Finish();
+  EXPECT_EQ(90, result.heap_size);
+  EXPECT_EQ(10, result.chunk_map.at(buffer_a_).size);
+  EXPECT_EQ(20, result.chunk_map.at(buffer_b_).size);
+  EXPECT_EQ(30, result.chunk_map.at(buffer_c_).size);
+  EXPECT_EQ(30, result.chunk_map.at(buffer_d_).size);
+
+  EXPECT_EQ(0, result.chunk_map.at(buffer_a_).offset);
+  EXPECT_EQ(10, result.chunk_map.at(buffer_b_).offset);
+  EXPECT_EQ(30, result.chunk_map.at(buffer_c_).offset);
+  EXPECT_EQ(60, result.chunk_map.at(buffer_d_).offset);
+}
+
+TEST_F(LazyBestFitHeapTest, Mixed) {
+  LazyBestFitHeap heap(/*alignment=*/1);
+  heap.Alloc(buffer_a_, 10);  // A lazy offset
+
+  heap.Alloc(buffer_b_, 20);  // B lazy offset
+  heap.Free(buffer_b_, 20);   // B range = [0, 20)   free = [0, 20)
+
+  heap.Alloc(buffer_c_, 30);  // C range = [0, 30)
+  heap.Free(buffer_c_, 30);   // free = [0, 30)
+
+  heap.Alloc(buffer_d_, 5);   // D range = [0, 5)    free = [5, 30)
+  heap.Free(buffer_d_, 5);    // free = [0, 30)
+
+  heap.Free(buffer_a_, 10);   // A range = [30, 40)  free = [0, 40)
+
+  const HeapSimulator::Result result = heap.Finish();
+  EXPECT_EQ(40, result.heap_size);
+  EXPECT_EQ(10, result.chunk_map.at(buffer_a_).size);
+  EXPECT_EQ(20, result.chunk_map.at(buffer_b_).size);
+  EXPECT_EQ(30, result.chunk_map.at(buffer_c_).size);
+  EXPECT_EQ(5, result.chunk_map.at(buffer_d_).size);
+
+  EXPECT_EQ(30, result.chunk_map.at(buffer_a_).offset);
+  EXPECT_EQ(0, result.chunk_map.at(buffer_b_).offset);
+  EXPECT_EQ(0, result.chunk_map.at(buffer_c_).offset);
+  EXPECT_EQ(0, result.chunk_map.at(buffer_d_).offset);
+}
+
+TEST_F(LazyBestFitHeapTest, BestFit) {
+  LazyBestFitHeap heap(/*alignment=*/1);
+
+  // First alloc/free buffer_a_, to force a big free chunk to appear.
+  heap.Alloc(buffer_a_, 200);  // A lazy offset
+  heap.Free(buffer_a_, 200);   // A range = [0, 200)  free = [0, 200)
+
+  // Now alloc a bunch of buffers that are allocated out of the free chunk.
+  heap.Alloc(buffer_b_, 30);  // B range = [0, 30)    free = [30, 200)
+  heap.Alloc(buffer_c_, 30);  // C range = [30, 60)   free = [60, 200)
+  heap.Alloc(buffer_d_, 20);  // D range = [60, 80)   free = [80, 200)
+  heap.Alloc(buffer_e_, 20);  // E range = [80, 100)  free = [100, 200)
+  heap.Alloc(buffer_f_, 10);  // F range = [100, 110) free = [110, 200)
+  heap.Alloc(buffer_g_, 10);  // G range = [110, 120) free = [120, 200)
+  heap.Alloc(buffer_h_, 80);  // H range = [120, 200)
+
+  // Free buffers to create free chunks of different sizes.
+  heap.Free(buffer_c_, 30);  // free = [30, 60)
+  heap.Free(buffer_e_, 20);  // free = [30, 60), [80, 100)
+  heap.Free(buffer_g_, 10);  // free = [30, 60), [80, 100), [110, 120)
+
+  // The best fit is picked out of the existing free chunks.
+  heap.Alloc(buffer_i_, 15);  // I range = [80, 95)
+
+  // The frees here ensure the buffer-coalescing logic is exercised.
+  heap.Free(buffer_b_, 30);
+  heap.Free(buffer_d_, 20);
+  heap.Free(buffer_f_, 10);
+  heap.Free(buffer_h_, 80);
+  heap.Free(buffer_i_, 15);
+
+  const HeapSimulator::Result result = heap.Finish();
+  EXPECT_EQ(200, result.heap_size);
+  EXPECT_EQ(200, result.chunk_map.at(buffer_a_).size);
+  EXPECT_EQ(30, result.chunk_map.at(buffer_b_).size);
+  EXPECT_EQ(30, result.chunk_map.at(buffer_c_).size);
+  EXPECT_EQ(20, result.chunk_map.at(buffer_d_).size);
+  EXPECT_EQ(20, result.chunk_map.at(buffer_e_).size);
+  EXPECT_EQ(10, result.chunk_map.at(buffer_f_).size);
+  EXPECT_EQ(10, result.chunk_map.at(buffer_g_).size);
+  EXPECT_EQ(80, result.chunk_map.at(buffer_h_).size);
+  EXPECT_EQ(15, result.chunk_map.at(buffer_i_).size);
+
+  EXPECT_EQ(0, result.chunk_map.at(buffer_a_).offset);
+  EXPECT_EQ(0, result.chunk_map.at(buffer_b_).offset);
+  EXPECT_EQ(30, result.chunk_map.at(buffer_c_).offset);
+  EXPECT_EQ(60, result.chunk_map.at(buffer_d_).offset);
+  EXPECT_EQ(80, result.chunk_map.at(buffer_e_).offset);
+  EXPECT_EQ(100, result.chunk_map.at(buffer_f_).offset);
+  EXPECT_EQ(110, result.chunk_map.at(buffer_g_).offset);
+  EXPECT_EQ(120, result.chunk_map.at(buffer_h_).offset);
+  EXPECT_EQ(80, result.chunk_map.at(buffer_i_).offset);
+}
+
+TEST_F(LazyBestFitHeapTest, Lazy) {
+  LazyBestFitHeap heap(/*alignment=*/1);
+
+  // First alloc some buffers, which are all lazily allocated offsets.
+  heap.Alloc(buffer_a_, 10);
+  heap.Alloc(buffer_b_, 5);
+  heap.Alloc(buffer_c_, 10);
+
+  // Now free some buffers, which forces offset assignment.
+  heap.Free(buffer_a_, 10);  // A range = [0, 10)   free = [0, 10)
+  heap.Free(buffer_c_, 10);  // C range = [10, 20)  free = [0, 20)
+
+  // If we hadn't lazily assigned offsets, the free chunk wouldn't be large
+  // enough to hold the entire allocation.
+  heap.Alloc(buffer_d_, 20);  // D range = [0, 20)
+
+  heap.Free(buffer_b_, 5);  // B range = [20, 25)
+  heap.Free(buffer_d_, 20);
+
+  const HeapSimulator::Result result = heap.Finish();
+  EXPECT_EQ(25, result.heap_size);
+  EXPECT_EQ(10, result.chunk_map.at(buffer_a_).size);
+  EXPECT_EQ(5, result.chunk_map.at(buffer_b_).size);
+  EXPECT_EQ(10, result.chunk_map.at(buffer_c_).size);
+  EXPECT_EQ(20, result.chunk_map.at(buffer_d_).size);
+
+  EXPECT_EQ(0, result.chunk_map.at(buffer_a_).offset);
+  EXPECT_EQ(20, result.chunk_map.at(buffer_b_).offset);
+  EXPECT_EQ(10, result.chunk_map.at(buffer_c_).offset);
+  EXPECT_EQ(0, result.chunk_map.at(buffer_d_).offset);
+}
+
+TEST_F(LazyBestFitHeapTest, ReuseLastFreeChunk) {
+  LazyBestFitHeap heap(/*alignment=*/1);
+
+  // First alloc/free buffer_a_, to force a big free chunk to appear.
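+  // (Freeing A lazily assigns it the range [0, 60) and leaves the whole heap
+  // free again, so the allocations below exercise chunk splitting rather
+  // than heap growth.)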
+  heap.Alloc(buffer_a_, 60);  // A lazy offset
+  heap.Free(buffer_a_, 60);   // A range = [0, 60)  free = [0, 60)
+
+  // Now alloc a bunch of buffers that are allocated out of the free chunk.
+  heap.Alloc(buffer_b_, 10);  // B range = [0, 10)   free = [10, 60)
+  heap.Alloc(buffer_c_, 20);  // C range = [10, 30)  free = [30, 60)
+  heap.Alloc(buffer_d_, 30);  // D range = [30, 60)
+
+  // Free buffers to create free chunks of different sizes.
+  heap.Free(buffer_b_, 10);  // free = [0, 10)
+  heap.Free(buffer_d_, 30);  // free = [0, 10), [30, 60)
+
+  // No free chunks are large enough, but the last free chunk is adjacent to
+  // the end of the heap, so we re-use that chunk.
+  heap.Alloc(buffer_e_, 40);  // E range = [30, 70)
+
+  heap.Free(buffer_c_, 20);
+  heap.Free(buffer_e_, 40);
+
+  const HeapSimulator::Result result = heap.Finish();
+  EXPECT_EQ(70, result.heap_size);
+  EXPECT_EQ(60, result.chunk_map.at(buffer_a_).size);
+  EXPECT_EQ(10, result.chunk_map.at(buffer_b_).size);
+  EXPECT_EQ(20, result.chunk_map.at(buffer_c_).size);
+  EXPECT_EQ(30, result.chunk_map.at(buffer_d_).size);
+  EXPECT_EQ(40, result.chunk_map.at(buffer_e_).size);
+
+  EXPECT_EQ(0, result.chunk_map.at(buffer_a_).offset);
+  EXPECT_EQ(0, result.chunk_map.at(buffer_b_).offset);
+  EXPECT_EQ(10, result.chunk_map.at(buffer_c_).offset);
+  EXPECT_EQ(30, result.chunk_map.at(buffer_d_).offset);
+  EXPECT_EQ(30, result.chunk_map.at(buffer_e_).offset);
+}
+
+TEST_F(LazyBestFitHeapTest, Alignment) {
+  LazyBestFitHeap heap(/*alignment=*/64);
+
+  // First alloc some buffers, which are all lazily allocated offsets.
+  heap.Alloc(buffer_a_, 10);
+  heap.Alloc(buffer_b_, 5);
+  heap.Alloc(buffer_c_, 10);
+
+  // Now free some buffers, which forces offset assignment with alignment.
+  heap.Free(buffer_a_, 10);  // A range = [0, 10)    free = [0, 10)
+  heap.Free(buffer_c_, 10);  // C range = [64, 74)   free = [0, 74)
+
+  // If we hadn't lazily assigned offsets, and accounted for alignment, the
+  // free chunk wouldn't be large enough to hold the entire allocation.
+  heap.Alloc(buffer_d_, 74);  // D range = [0, 74)    free = [)
+
+  heap.Free(buffer_b_, 5);    // B range = [128, 133) free = [74, 133)
+  heap.Alloc(buffer_e_, 23);  // E range = [128, 151) free = [74, 128)
+
+  heap.Free(buffer_d_, 74);  // free = [0, 128)
+  heap.Free(buffer_e_, 23);  // free = [0, 151)
+
+  const HeapSimulator::Result result = heap.Finish();
+  EXPECT_EQ(151, result.heap_size);
+  EXPECT_EQ(10, result.chunk_map.at(buffer_a_).size);
+  EXPECT_EQ(5, result.chunk_map.at(buffer_b_).size);
+  EXPECT_EQ(10, result.chunk_map.at(buffer_c_).size);
+  EXPECT_EQ(74, result.chunk_map.at(buffer_d_).size);
+  EXPECT_EQ(23, result.chunk_map.at(buffer_e_).size);
+
+  EXPECT_EQ(0, result.chunk_map.at(buffer_a_).offset);
+  EXPECT_EQ(128, result.chunk_map.at(buffer_b_).offset);
+  EXPECT_EQ(64, result.chunk_map.at(buffer_c_).offset);
+  EXPECT_EQ(0, result.chunk_map.at(buffer_d_).offset);
+  EXPECT_EQ(128, result.chunk_map.at(buffer_e_).offset);
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto
new file mode 100644
index 00000000000..af853385d63
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo.proto
@@ -0,0 +1,189 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// DO NOT USE THESE PROTO MESSAGES FOR ANYTHING OTHER THAN DEBUGGING.
+//
+// Don't use these protos in the real compilation or execution codepaths. The
+// data format is meant for debugging only, and may change without notice.
+//
+// Many of the protos below are simple 1-to-1 serializations of the
+// corresponding C++ classes.
+//
+// FIELD NAMES ARE IMPORTANT
+//
+// Unlike most protos, you can't safely change the names of fields, even if
+// you keep the numeric ids the same. This is because we sometimes serialize
+// these protos as JSON, which includes the field names in the serialization.
+
+syntax = "proto3";
+
+package xla;
+import "tensorflow/compiler/xla/xla_data.proto";
+
+option cc_enable_arenas = true;
+
+// Serialization of HloInstruction.
+message HloInstructionProto {
+  string name = 1;
+  string opcode = 2;
+  xla.Shape shape = 3;
+  repeated string operand_names = 4;
+  repeated string control_predecessor_names = 5;
+  repeated string called_computation_names = 6;
+
+  xla.OpMetadata metadata = 7;
+
+  // Literal, only present for kConstant.
+  xla.LiteralProto literal = 8;
+
+  // Parameter info, only present for kParameter.
+  int64 parameter_number = 9;
+  string parameter_name = 10;
+
+  // Fusion state, only present for kFusion.
+  string fusion_kind = 11;
+  HloComputationProto fused_instructions_computation = 12;
+
+  // Index for kGetTupleElement.
+  int64 tuple_index = 13;
+}
+
+// Serialization of HloComputation.
+message HloComputationProto {
+  string name = 1;
+
+  // The array of instructions is always in a valid dependency order, where
+  // operands appear before their users.
+  repeated HloInstructionProto instructions = 2;
+}
+
+// Serialization of HloModule.
+message HloModuleProto {
+  string name = 1;
+  string entry_computation_name = 2;
+
+  // The array of computations is always in a valid dependency order, where
+  // callees appear before their callers.
+  repeated HloComputationProto computations = 3;
+}
+
+// Serialization of HloOrdering.
+message HloOrderingProto {
+  // NOTE: currently only sequential orderings are serialized.
+  message SequentialComputation {
+    string computation_name = 1;
+    repeated string instruction_names = 2;
+  }
+  repeated SequentialComputation sequential_computations = 1;
+}
+
+// Serialization of LogicalBuffer.
+message LogicalBufferProto {
+  // Location represents an instruction and its shape index, which uniquely
+  // identifies a point where a buffer is needed.
+  message Location {
+    // NOTE: module_name isn't necessary, since all LogicalBuffers are
+    // associated with a single HloModule.
+    string computation_name = 1;
+    string instruction_name = 2;
+    repeated int64 shape_index = 3;
+  }
+
+  int64 id = 1;
+  int64 size = 2;
+
+  // The location where the buffer is defined.
+  Location defined_at = 3;
+
+  int64 color = 4;
+}
+
+// Serialization of BufferAllocation.
+message BufferAllocationProto {
+  // Assigned represents a single LogicalBuffer that is assigned to this
+  // BufferAllocation.
+ message Assigned { + int64 logical_buffer_id = 1; + int64 offset = 2; + int64 size = 3; + } + + int64 index = 1; + int64 size = 2; + bool is_thread_local = 3; + bool is_reusable = 4; + bool is_entry_computation_parameter = 5; + int64 parameter_number = 6; + bool maybe_live_out = 7; + int64 color = 8; + repeated Assigned assigned = 9; +} + +// A trace of a HeapSimulator run. +message HeapSimulatorTrace { + // The trace includes a list of events, where each event describes one action + // performed by the heap simulator. + message Event { + enum Kind { + ALLOC = 0; // A memory region was allocated for the buffer. + FREE = 1; // A memory region was freed for the buffer. + + // A buffer was shared with another (canonical) buffer. This is similar to + // ALLOC, except that instead of allocating a new region of memory, the + // memory region of the canonical buffer is directly re-used. Multiple + // buffers may share with the same canonical buffer. The lifetime of the + // canonical buffer is extended to the union of all lifetimes. + SHARE_WITH = 2; + } + Kind kind = 1; + + // The id of the LogicalBuffer that the event applies to. + int64 buffer_id = 2; + + // The HloInstruction that the simulation was processing that caused this + // event to occur, identified by its computation and instruction name. E.g. + // buffers defined by instruction A are allocated when processing A. + string computation_name = 3; + string instruction_name = 4; + + // The id of the canonical LogicalBuffer that the buffer shares with. Only + // set for SHARE_WITH events. + int64 share_with_canonical_id = 5; + } + repeated Event events = 1; + bool whole_module_simulation = 2; +} + +// Serialization of BufferAssignment. +message BufferAssignmentProto { + // Alias represents a source LogicalBuffer, and the buffer location that + // aliases it. + message BufferAlias { + int64 source_buffer_id = 1; + LogicalBufferProto.Location location = 2; + } + + repeated LogicalBufferProto logical_buffers = 1; + repeated BufferAlias buffer_aliases = 2; + repeated BufferAllocationProto buffer_allocations = 3; + repeated HeapSimulatorTrace heap_simulator_traces = 4; +} + +// Grouping message that contains all of the information above. +message HloProto { + HloModuleProto hlo_module = 1; + HloOrderingProto hlo_ordering = 2; + BufferAssignmentProto buffer_assignment = 3; +} diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis.cc b/tensorflow/compiler/xla/service/hlo_alias_analysis.cc new file mode 100644 index 00000000000..3b37f4a4b89 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_alias_analysis.cc @@ -0,0 +1,396 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_alias_analysis.h" + +#include +#include +#include +#include + +#include "tensorflow/compiler/xla/map_util.h" +#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/lib/strings/stringprintf.h" +#include "tensorflow/core/platform/logging.h" + +namespace xla { + +using ::tensorflow::str_util::Join; +using ::tensorflow::strings::StrAppend; +using ::tensorflow::strings::StrCat; + +void HloBuffer::AddValue(const HloValue& value) { + // If the value is already contained in this buffer, just return. + if (std::find(value_ids_.begin(), value_ids_.end(), value.id()) != + value_ids_.end()) { + return; + } + + value_ids_.push_back(value.id()); + + // Add all of the locations of the HloValue to this buffer. + for (const HloLocation& location : value.locations()) { + if (std::find(locations_.begin(), locations_.end(), location) == + locations_.end()) { + locations_.push_back(location); + } + } +} + +bool HloBuffer::operator==(const HloBuffer& other) const { + bool equal = id() == other.id(); + if (equal) { + // DCHECK because these comparisons are expensive (linear time). + DCHECK(value_ids() == other.value_ids()); + DCHECK(locations() == other.locations()); + } + return equal; +} + +string HloBuffer::ToString() const { + return StrCat("HloBuffer ", id_, ", values: ", Join(value_ids_, ", ")); +} + +std::ostream& operator<<(std::ostream& out, const HloBuffer& buffer) { + out << buffer.ToString(); + return out; +} + +void HloBufferSet::AddBuffer(HloBuffer::Id buffer_id) { + if (std::find(buffer_ids_.begin(), buffer_ids_.end(), buffer_id) == + buffer_ids_.end()) { + buffer_ids_.push_back(buffer_id); + } +} + +void HloBufferSet::RemoveBufferOrDie(HloBuffer::Id buffer_id) { + auto it = std::find(buffer_ids_.begin(), buffer_ids_.end(), buffer_id); + CHECK(it != buffer_ids_.end()); + buffer_ids_.erase(it); +} + +string HloBufferSet::ToString() const { + return StrCat("HloBufferSet, buffers: ", Join(buffer_ids_, ", ")); +} + +std::ostream& operator<<(std::ostream& out, const HloBufferSet& buffer_set) { + out << buffer_set.ToString(); + return out; +} + +bool InstructionBufferSet::IsAmbiguous() const { + bool is_ambiguous = false; + ForEachElement( + [&is_ambiguous](const ShapeIndex& index, const HloBufferSet& buffer_set) { + is_ambiguous |= buffer_set.buffer_ids().size() > 1; + }); + return is_ambiguous; +} + +bool InstructionBufferSet::IsDistinct() const { + bool is_distinct = true; + tensorflow::gtl::FlatSet seen_ids; + ForEachElement([&is_distinct, &seen_ids](const ShapeIndex& index, + const HloBufferSet& buffer_set) { + for (HloBuffer::Id buffer_id : buffer_set.buffer_ids()) { + auto pair = seen_ids.insert(buffer_id); + if (!pair.second) { + is_distinct = false; + } + } + }); + return is_distinct; +} + +string InstructionBufferSet::ToString() const { + string out = + StrCat("InstructionBufferSet(", ShapeUtil::HumanString(shape()), ")\n"); + ForEachElement([this, &out](const ShapeIndex& index, + const HloBufferSet& value_set) { + StrAppend(&out, " ", index.ToString(), " : ", value_set.ToString(), 
"\n"); + }); + return out; +} + +std::ostream& operator<<(std::ostream& out, + const InstructionBufferSet& buffer_set) { + out << buffer_set.ToString(); + return out; +} + +HloAliasAnalysis::HloAliasAnalysis(HloModule* module) : module_(module) {} + +void HloAliasAnalysis::InitializeBufferSets() { + std::unordered_map value_to_buffer; + + // Initially define a buffer for every HloValue in the module. + for (const HloValue* value : dataflow_analysis_->values()) { + HloBuffer& buffer = NewHloBuffer(); + buffer.AddValue(*value); + value_to_buffer[value->id()] = buffer.id(); + } + + // Construct the Instruction buffer set to contain the HloBuffers for each + // HloValue in the InstructionValueSet. + for (auto& computation : module_->computations()) { + for (auto& instruction : computation->instructions()) { + buffer_sets_.emplace(std::piecewise_construct, + std::forward_as_tuple(instruction.get()), + std::forward_as_tuple(instruction->shape())); + dataflow_analysis_->GetInstructionValueSet(instruction.get()) + .ForEachElement( + [this, &instruction, &value_to_buffer]( + const ShapeIndex& index, const HloValueSet& value_set) { + for (HloValue::Id value_id : value_set.value_ids()) { + HloBuffer::Id buffer_id = value_to_buffer.at(value_id); + GetBufferSet(instruction.get(), index).AddBuffer(buffer_id); + } + }); + } + } +} + +void HloAliasAnalysis::CombineBuffers( + tensorflow::gtl::ArraySlice buffer_ids) { + VLOG(4) << "Combining buffers: " << Join(buffer_ids, ", "); + + if (buffer_ids.size() < 2) { + return; + } + + // Merging buffers invalidates the buffer vector. + buffers_vector_.clear(); + + // Add all values from all buffers to the first buffer in the list. + HloBuffer& unified_buffer = GetBuffer(buffer_ids[0]); + for (int i = 1; i < buffer_ids.size(); ++i) { + const HloBuffer::Id buffer_id = buffer_ids[i]; + const HloBuffer& buffer = GetBuffer(buffer_id); + + VLOG(4) << "Eliminating buffer: " << buffer_id; + + // Add all values held by the buffer-to-eliminate to the unified buffer. + for (HloValue::Id value_id : buffer.value_ids()) { + unified_buffer.AddValue(dataflow_analysis_->GetValue(value_id)); + } + + // Iterate through all locations where the buffer-to-eliminate exists and + // replace it with the unified buffer. + for (const HloLocation& location : buffer.locations()) { + VLOG(4) << "Replacing in " << location; + GetBufferSet(location.instruction, location.index) + .RemoveBufferOrDie(buffer_id); + GetBufferSet(location.instruction, location.index) + .AddBuffer(unified_buffer.id()); + } + + buffers_.erase(buffer_id); + } + + TF_DCHECK_OK(Verify()); +} + +Status HloAliasAnalysis::Verify() const { + // Verify every HloBuffer in buffers_ exists somewhere in an HloBufferSet and + // verify that every HloBuffer in the HloBufferSets exists somewhere in + // buffers_. 
+ tensorflow::gtl::FlatSet buffers_in_sets; + for (auto& pair : buffer_sets_) { + const InstructionBufferSet& instruction_buffer_set = pair.second; + TF_RETURN_IF_ERROR(instruction_buffer_set.ForEachElementWithStatus( + [this, &buffers_in_sets](const ShapeIndex& index, + const HloBufferSet& buffer_set) -> Status { + for (HloBuffer::Id buffer_id : buffer_set.buffer_ids()) { + TF_RET_CHECK(ContainsKey(buffers_, buffer_id)); + buffers_in_sets.insert(buffer_id); + } + return Status::OK(); + })); + } + for (auto& pair : buffers_) { + const HloBuffer::Id buffer_id = pair.first; + const HloBuffer& buffer = pair.second; + TF_RET_CHECK(buffer_id == buffer.id()); + TF_RET_CHECK(ContainsKey(buffers_in_sets, buffer_id)); + } + return Status::OK(); +} + +void HloAliasAnalysis::FlattenInstructionBufferSets( + tensorflow::gtl::ArraySlice instructions) { + VLOG(4) << "Flattening buffer sets of instructions: " + << Join(instructions, ", ", + [this](string* out, const HloInstruction* instruction) { + StrAppend(out, instruction->FullyQualifiedName()); + }); + if (instructions.size() < 2) { + return; + } + ShapeUtil::ForEachSubshape( + instructions[0]->shape(), + [this, instructions](const Shape& /*subshape*/, const ShapeIndex& index) { + // Gather all HloBuffers contained in all the buffer sets of the + // given instructions at the current index. + std::vector to_unify; + for (const HloInstruction* instruction : instructions) { + const HloBufferSet& buffer_set = GetBufferSet(instruction, index); + to_unify.insert(to_unify.end(), buffer_set.buffer_ids().begin(), + buffer_set.buffer_ids().end()); + } + // Sort and uniquify buffers to combine. + std::sort(to_unify.begin(), to_unify.end()); + to_unify.erase(std::unique(to_unify.begin(), to_unify.end()), + to_unify.end()); + + CombineBuffers(to_unify); + }); +} + +HloBuffer& HloAliasAnalysis::NewHloBuffer() { + HloBuffer::Id buffer_id = next_buffer_id_++; + auto it_added = buffers_.emplace(std::piecewise_construct, + std::forward_as_tuple(buffer_id), + std::forward_as_tuple(buffer_id)); + CHECK(it_added.second); + + return it_added.first->second; +} + +string HloAliasAnalysis::ToString() const { + string out = StrCat("HloAliasAnalysis, module ", module_->name(), "\n"); + StrAppend(&out, " Instruction buffer sets:\n"); + for (const std::unique_ptr& computation : + module_->computations()) { + for (const std::unique_ptr& instruction : + computation->instructions()) { + StrAppend(&out, " ", instruction->FullyQualifiedName(), ":\n"); + auto buffer_str = [this](const HloBuffer& buffer) { + return StrCat( + "Buffer ", buffer.id(), ", values: ", + Join(buffer.value_ids(), ", ", + [this](string* out, HloValue::Id value_id) { + StrAppend( + out, + dataflow_analysis_->GetValue(value_id).ToShortString()); + })); + }; + if (ShapeUtil::IsTuple(instruction->shape())) { + GetInstructionBufferSet(instruction.get()) + .ForEachElement([this, &out, &buffer_str]( + const ShapeIndex& index, + const HloBufferSet& buffer_set) { + StrAppend(&out, " tuple index ", index.ToString(), ":\n"); + for (HloBuffer::Id buffer_id : buffer_set.buffer_ids()) { + StrAppend(&out, " ", buffer_str(GetBuffer(buffer_id)), + "\n"); + } + }); + } else { + const HloBufferSet top_level_buffer_set = + GetBufferSet(instruction.get()); + for (HloBuffer::Id buffer_id : top_level_buffer_set.buffer_ids()) { + StrAppend(&out, " ", buffer_str(GetBuffer(buffer_id)), "\n"); + } + } + } + } + return out; +} + +const InstructionBufferSet& HloAliasAnalysis::GetInstructionBufferSet( + const HloInstruction* instruction) 
    const {
+  return buffer_sets_.at(instruction);
+}
+
+InstructionBufferSet& HloAliasAnalysis::GetInstructionBufferSet(
+    const HloInstruction* instruction) {
+  return buffer_sets_.at(instruction);
+}
+
+const HloBufferSet& HloAliasAnalysis::GetBufferSet(
+    const HloInstruction* instruction, const ShapeIndex& index) const {
+  return buffer_sets_.at(instruction).element(index);
+}
+
+HloBufferSet& HloAliasAnalysis::GetBufferSet(const HloInstruction* instruction,
+                                             const ShapeIndex& index) {
+  return *buffer_sets_.at(instruction).mutable_element(index);
+}
+
+const std::vector<const HloBuffer*>& HloAliasAnalysis::buffers() const {
+  if (buffers_vector_.empty()) {
+    // Lazily construct vector of buffers.
+    buffers_vector_.reserve(buffers_.size());
+    for (auto& pair : buffers_) {
+      buffers_vector_.push_back(&pair.second);
+    }
+    std::sort(buffers_vector_.begin(), buffers_vector_.end(),
+              [](const HloBuffer* a, const HloBuffer* b) {
+                return a->id() < b->id();
+              });
+  } else {
+    CHECK_EQ(buffers_vector_.size(), buffers_.size());
+    for (const HloBuffer* buffer : buffers_vector_) {
+      DCHECK(ContainsKey(buffers_, buffer->id()));
+      DCHECK(&GetBuffer(buffer->id()) == buffer);
+    }
+  }
+  return buffers_vector_;
+}
+
+/* static */
+StatusOr<std::unique_ptr<HloAliasAnalysis>> HloAliasAnalysis::Run(
+    HloModule* module) {
+  VLOG(1) << "HloAliasAnalysis::Run on module " << module->name();
+  XLA_VLOG_LINES(2, module->ToString());
+
+  auto alias_analysis = WrapUnique(new HloAliasAnalysis(module));
+  TF_ASSIGN_OR_RETURN(
+      alias_analysis->dataflow_analysis_,
+      HloDataflowAnalysis::Run(module, /*ssa_form=*/true,
+                               /*bitcast_defines_value=*/false));
+
+  alias_analysis->InitializeBufferSets();
+  VLOG(3) << "Initial state:\n" << alias_analysis->ToString();
+
+  // The while instruction updates its state in place, so the inputs to the
+  // while alias the while instruction, the parameters of the subcomputations,
+  // and the root of the body subcomputation.
+  for (auto& computation : module->computations()) {
+    for (auto& instruction : computation->instructions()) {
+      if (instruction->opcode() == HloOpcode::kWhile) {
+        VLOG(4) << "Flattening buffer sets at kWhile instruction: "
+                << instruction->name();
+        alias_analysis->FlattenInstructionBufferSets(
+            {instruction->operand(0),
+             instruction->while_body()->parameter_instruction(0),
+             instruction->while_body()->root_instruction(),
+             instruction->while_condition()->parameter_instruction(0),
+             instruction.get()});
+      }
+    }
+  }
+  VLOG(1) << alias_analysis->ToString();
+  return std::move(alias_analysis);
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis.h b/tensorflow/compiler/xla/service/hlo_alias_analysis.h
new file mode 100644
index 00000000000..0fa35827b5e
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_alias_analysis.h
@@ -0,0 +1,301 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_ALIAS_ANALYSIS_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_ALIAS_ANALYSIS_H_
+
+#include <memory>
+#include <ostream>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/compiler/xla/service/call_graph.h"
+#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
+#include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/logical_buffer.h"
+#include "tensorflow/compiler/xla/shape_tree.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+
+// A container which can hold one or more HloValues. An HLO buffer abstractly
+// represents the allocation which HLO instructions write into and read
+// from. Generally there is a one-to-one correspondence between HloBuffers and
+// HloValues, where each HloValue in the module is held in a unique HloBuffer.
+// An exception is the while instruction, which updates the loop state
+// in-place. In this case, we have a single HloBuffer for each HloLocation in
+// the loop state, but multiple HloValues. For example:
+//
+//   %init = ...
+//   %while = While(%init, body, condition)
+//
+//   body:
+//     %body_param = Param(0)
+//     ...
+//     %body_root = ...
+//
+//   condition:
+//     %cond_param = Param(0)
+//     ...
+//
+// For simplicity, assume that %while is array-shaped. In this case, we have a
+// single HloBuffer which holds the following HloValues: HloValue{%init},
+// HloValue{%while}, HloValue{%body_param}, HloValue{%body_root}, and
+// HloValue{%cond_param}.
+//
+// HloBuffers may appear at different HloLocations in the module, mirroring the
+// same property of HloValues. For example:
+//
+//   %sub = Sub(...)
+//   %add = Add(...)
+//   %tuple = Tuple(%add, %sub)
+//   %gte = GetTupleElement(%tuple, 0)
+//
+// In this case, the HloBuffer containing %add appears at the following
+// locations: HloLocation{%add, {}}, HloLocation{%tuple, {0}}, and
+// HloLocation{%gte, {}}.
+//
+// Different HloLocations which share the same HloBuffer indicate mandatory
+// aliasing in the HLO module. These locations must share the same memory
+// allocation for correctness (the backends rely on this property). This
+// differs from incidental aliasing introduced by memory reuse in
+// BufferAssignment, where different instructions may happen to get the same
+// allocation.
+class HloBuffer {
+ public:
+  using Id = int64;
+
+  HloBuffer(int64 id) : id_(id) {}
+
+  // Return the unique identifier for this HloBuffer.
+  int64 id() const { return id_; }
+
+  // Add a value to the set of values held by this buffer. Also adds the
+  // HloLocations of the value to the locations vector of the buffer. If the
+  // buffer already contains this value, then this method is a nop.
+  void AddValue(const HloValue& value);
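+  //
+  // For example, adding a value whose locations are HloLocation{%add, {}} and
+  // HloLocation{%tuple, {0}} extends locations() with both entries; locations
+  // this buffer already holds are skipped.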
+
+  // Return the IDs of all values contained in this buffer.
+  const std::vector<HloValue::Id>& value_ids() const { return value_ids_; }
+
+  // Return the locations (output of which instruction and at what index) where
+  // the buffer is used. This is exactly the union of the locations of the
+  // HloValues contained by the buffer.
+  const std::vector<HloLocation>& locations() const { return locations_; }
+
+  string ToString() const;
+
+  bool operator==(const HloBuffer& other) const;
+  bool operator!=(const HloBuffer& other) const { return !(*this == other); }
+
+ private:
+  // Unique identifier for this HloBuffer.
+  const Id id_;
+
+  // The set of values contained in this buffer.
+  std::vector<HloValue::Id> value_ids_;
+
+  // The set of locations where this buffer is used.
+  std::vector<HloLocation> locations_;
+};
+
+std::ostream& operator<<(std::ostream& out, const HloBuffer& buffer);
+
+// A class representing the set of possible HloBuffers at a particular
+// HloLocation (shape index in the output of an instruction) in the XLA
+// graph. In most cases, the buffer set will have a single HloBuffer indicating
+// that the HloBuffer which appears at that particular location is known
+// unambiguously at compile-time. However, tuple-shaped Select instructions can
+// introduce ambiguity as the tuple elements of the operands are passed by
+// reference into the output of the Select. For example:
+//
+//   %pred = ...
+//   %tuple0 = Tuple(%a, %b)
+//   %tuple1 = Tuple(%x, %y)
+//   %select = Select(%pred, %tuple0, %tuple1)
+//
+// In this case the HloBufferSet at HloLocation{%select, {0}} contains the
+// HloBuffer holding %a and the HloBuffer holding %x.
+class HloBufferSet {
+ public:
+  HloBufferSet() = default;
+
+  // Add the given buffer to this buffer set. If the buffer already exists in
+  // the set, then this is a NOP.
+  void AddBuffer(HloBuffer::Id buffer_id);
+
+  // Removes the given buffer from this buffer set. CHECK fails if the buffer
+  // is not contained in this set.
+  void RemoveBufferOrDie(HloBuffer::Id buffer_id);
+
+  // Returns the unique buffer in this set. CHECK fails if the set does not
+  // contain exactly one buffer.
+  HloBuffer::Id GetUniqueBufferId() const {
+    CHECK_EQ(buffer_ids().size(), 1);
+    return buffer_ids()[0];
+  }
+
+  // Returns the IDs of the HloBuffers contained in this buffer set.
+  const std::vector<HloBuffer::Id>& buffer_ids() const { return buffer_ids_; }
+
+  string ToString() const;
+
+ private:
+  // The IDs of the HloBuffers contained in this buffer set.
+  std::vector<HloBuffer::Id> buffer_ids_;
+};
+
+std::ostream& operator<<(std::ostream& out, const HloBufferSet& buffer_set);
+
+// A class collecting the HloBuffers in the output of an HLO instruction. For
+// array-shaped instructions, an InstructionBufferSet trivially holds a single
+// HloBufferSet. Tuple-shaped InstructionBufferSets hold multiple
+// HloBufferSets.
+class InstructionBufferSet : public ShapeTree<HloBufferSet> {
+ public:
+  InstructionBufferSet(const Shape& shape) : ShapeTree<HloBufferSet>(shape) {}
+
+  // Returns true if any HloBufferSet contained in this InstructionBufferSet
+  // is not a singleton.
+  bool IsAmbiguous() const;
+
+  // Returns true if no HloBuffer appears in more than one HloBufferSet
+  // contained in this InstructionBufferSet (i.e., the buffer sets are
+  // disjoint).
+  bool IsDistinct() const;
+
+  string ToString() const;
+};
+
+std::ostream& operator<<(std::ostream& out,
+                         const InstructionBufferSet& buffer_set);
+
+class HloAliasAnalysis {
+ public:
+  static StatusOr<std::unique_ptr<HloAliasAnalysis>> Run(HloModule* module);
+
+  string ToString() const;
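+
+  // Illustrative use of this class (the instruction and module names here are
+  // hypothetical; only Run, GetUniqueBufferAt and locations are real):
+  //
+  //   TF_ASSIGN_OR_RETURN(std::unique_ptr<HloAliasAnalysis> analysis,
+  //                       HloAliasAnalysis::Run(module));
+  //   const HloBuffer& buffer = analysis->GetUniqueBufferAt(instr);
+  //   for (const HloLocation& location : buffer.locations()) {
+  //     // All of these locations must be backed by one allocation.
+  //   }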
+
+  // Return the InstructionBufferSet for the given instruction.
+  const InstructionBufferSet& GetInstructionBufferSet(
+      const HloInstruction* instruction) const;
+  InstructionBufferSet& GetInstructionBufferSet(
+      const HloInstruction* instruction);
+
+  // Return the HloBufferSet for the given location.
+  const HloBufferSet& GetBufferSet(const HloInstruction* instruction,
+                                   const ShapeIndex& index = {}) const;
+  HloBufferSet& GetBufferSet(const HloInstruction* instruction,
+                             const ShapeIndex& index = {});
+
+  // Return the HloBuffer with the given ID.
+  const HloBuffer& GetBuffer(HloBuffer::Id buffer_id) const {
+    return buffers_.at(buffer_id);
+  }
+  HloBuffer& GetBuffer(HloBuffer::Id buffer_id) {
+    return buffers_.at(buffer_id);
+  }
+
+  // Returns the unique buffer at the given location. CHECK fails if the buffer
+  // set at that location does not contain exactly one buffer.
+  const HloBuffer& GetUniqueBufferAt(const HloInstruction* instruction,
+                                     const ShapeIndex& index = {}) const {
+    return GetBuffer(GetBufferSet(instruction, index).GetUniqueBufferId());
+  }
+  HloBuffer& GetUniqueBufferAt(const HloInstruction* instruction,
+                               const ShapeIndex& index = {}) {
+    return GetBuffer(GetBufferSet(instruction, index).GetUniqueBufferId());
+  }
+
+  // Return a vector of all HloBuffers stably sorted by HloBuffer::Id. This
+  // vector is lazily computed. Mutating operations on HloAliasAnalysis may
+  // invalidate the underlying vector, requiring recomputation.
+  const std::vector<const HloBuffer*>& buffers() const;
+
+  // Returns the underlying dataflow analysis used by this alias analysis.
+  const HloDataflowAnalysis& dataflow_analysis() const {
+    return *dataflow_analysis_;
+  }
+
+ protected:
+  HloAliasAnalysis(HloModule* module);
+
+  // Creates a new HloBuffer and returns a reference to it.
+  HloBuffer& NewHloBuffer();
+
+  // Construct the initial set of buffer sets where an HloBuffer is created for
+  // each HloValue in the module.
+  void InitializeBufferSets();
+
+  // Combine the InstructionBufferSets for the given instructions. The
+  // HloBuffers in the HloBufferSets at each ShapeIndex are combined via
+  // CombineBuffers into a single HloBuffer. This single HloBuffer then becomes
+  // the only member of these HloBufferSets (i.e., they become singletons). The
+  // HloBuffers which are removed from the buffer sets are deleted from the
+  // analysis. This flattening may change InstructionBufferSets of other
+  // instructions not in 'instructions' because the HloBuffers of the
+  // InstructionBufferSets of 'instructions' can be used elsewhere in the
+  // module.
+  //
+  // This method is used to enforce the mandatory aliasing of while
+  // instructions, where the init operand, body parameter, condition parameter,
+  // body root instruction, and the while itself must have exactly the same
+  // HloBuffer at each ShapeIndex.
+  //
+  // Precondition: The shapes of the given instructions must be compatible.
+  void FlattenInstructionBufferSets(
+      tensorflow::gtl::ArraySlice<const HloInstruction*> instructions);
+
+  // Combines the given HloBuffers into a single buffer. One of the given
+  // HloBuffers is chosen as the unified buffer, and all other references to
+  // the remaining buffers are replaced by this unified buffer. All HloValues
+  // contained in the replaced buffers are moved to the unified buffer, and the
+  // replaced buffers are deleted from the analysis.
+  void CombineBuffers(tensorflow::gtl::ArraySlice<HloBuffer::Id> buffer_ids);
+
+  // Verifies internal state of the analysis.
+  Status Verify() const;
+
+  HloModule* module_;
+
+  // The underlying dataflow analysis used by this alias analysis.
+  std::unique_ptr<HloDataflowAnalysis> dataflow_analysis_;
+
+  // The map of all HloBuffers in the module.
+  std::unordered_map<HloBuffer::Id, HloBuffer> buffers_;
+
+  // A map from instruction to its InstructionBufferSet.
+  std::unordered_map<const HloInstruction*, InstructionBufferSet> buffer_sets_;
+
+  // A lazily constructed vector containing all HloBuffers sorted by
+  // HloBuffer::Id.
+  mutable std::vector<const HloBuffer*> buffers_vector_;
+
+  // The Id to use for the next HloBuffer.
+  int64 next_buffer_id_ = 0;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_ALIAS_ANALYSIS_H_
diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc
new file mode 100644
index 00000000000..24c467d411b
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc
@@ -0,0 +1,760 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_alias_analysis.h"
+
+#include <memory>
+#include <string>
+
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/instruction_fusion.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace {
+
+using ::testing::UnorderedElementsAre;
+
+class HloAliasAnalysisTest : public HloTestBase {
+ protected:
+  HloAliasAnalysisTest() : module_(TestName()) {}
+
+  // Run alias analysis on the member module. For convenience returns a
+  // reference to the generated analysis stored in analysis_.
+  const HloAliasAnalysis& RunAnalysis() {
+    analysis_ = HloAliasAnalysis::Run(&module_).ConsumeValueOrDie();
+    return *analysis_;
+  }
+
+  // Return a vector of the buffers in the buffer set at the given location.
+  std::vector<HloBuffer> GetBuffersAt(const HloInstruction* instruction,
+                                      const ShapeIndex& index = {}) const {
+    std::vector<HloBuffer> buffers;
+    for (HloBuffer::Id buffer_id :
+         analysis_->GetBufferSet(instruction, index).buffer_ids()) {
+      buffers.push_back(analysis_->GetBuffer(buffer_id));
+    }
+    return buffers;
+  }
+
+  // Return a vector containing all of the HloValues in the given buffer.
+  std::vector<HloValue> GetValuesInBuffer(const HloBuffer& buffer) {
+    std::vector<HloValue> values;
+    for (HloValue::Id value_id : buffer.value_ids()) {
+      values.push_back(analysis_->dataflow_analysis().GetValue(value_id));
+    }
+    return values;
+  }
+
+  // Return the HloValue defined at the given location.
+ const HloValue& GetValueDefinedAt(const HloInstruction* instruction, + const ShapeIndex& index = {}) const { + return analysis_->dataflow_analysis().GetValueDefinedAt(instruction, index); + } + + const HloValue& GetUniqueValueInBuffer(const HloBuffer& buffer) const { + CHECK_EQ(buffer.value_ids().size(), 1); + return analysis_->dataflow_analysis().GetValue(buffer.value_ids()[0]); + } + + HloModule module_; + std::unique_ptr analysis_; + + const Shape scalar_shape_ = ShapeUtil::MakeShape(F32, {}); +}; + +TEST_F(HloAliasAnalysisTest, BinaryOperation) { + // Test the analysis on a single binary operation (Add). + auto builder = HloComputation::Builder(TestName()); + auto constant1 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); + auto constant2 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); + auto add = builder.AddInstruction(HloInstruction::CreateBinary( + scalar_shape_, HloOpcode::kAdd, constant1, constant2)); + module_.AddEntryComputation(builder.Build()); + + const HloAliasAnalysis& analysis = RunAnalysis(); + + EXPECT_EQ(analysis.buffers().size(), 3); + + // All of the buffer sets should trivially contain a single buffer containing + // a single value. + for (const HloInstruction* instruction : {constant1, constant2, add}) { + EXPECT_EQ(GetUniqueValueInBuffer(analysis.GetUniqueBufferAt(instruction)), + GetValueDefinedAt(instruction)); + } + + EXPECT_FALSE(analysis.GetInstructionBufferSet(add).IsAmbiguous()); + EXPECT_TRUE(analysis.GetInstructionBufferSet(add).IsDistinct()); +} + +TEST_F(HloAliasAnalysisTest, TupleAndGtes) { + // Verify the analysis for a Tuple and GetTupleElement instructions. + auto builder = HloComputation::Builder(TestName()); + auto param0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, scalar_shape_, "param0")); + auto param1 = builder.AddInstruction( + HloInstruction::CreateParameter(1, scalar_shape_, "param1")); + auto tuple = + builder.AddInstruction(HloInstruction::CreateTuple({param0, param1})); + auto gte0 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, tuple, 0)); + auto gte1 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, tuple, 1)); + builder.AddInstruction( + HloInstruction::CreateBinary(scalar_shape_, HloOpcode::kAdd, gte0, gte1)); + module_.AddEntryComputation(builder.Build()); + + const HloAliasAnalysis& analysis = RunAnalysis(); + + EXPECT_EQ(analysis.buffers().size(), 4); + + // Verify the expected aliasing of the tuple elements. + EXPECT_EQ( + GetUniqueValueInBuffer(analysis.GetUniqueBufferAt(tuple, /*index=*/{})), + GetValueDefinedAt(tuple, /*index=*/{})); + EXPECT_EQ( + GetUniqueValueInBuffer(analysis.GetUniqueBufferAt(tuple, /*index=*/{0})), + GetValueDefinedAt(param0)); + EXPECT_EQ( + GetUniqueValueInBuffer(analysis.GetUniqueBufferAt(tuple, /*index=*/{1})), + GetValueDefinedAt(param1)); + + // The tuple operand, tuple element, and result of the GTE instruction should + // all be the same buffer. + EXPECT_EQ(analysis.GetUniqueBufferAt(param0), + analysis.GetUniqueBufferAt(tuple, /*index=*/{0})); + EXPECT_EQ(analysis.GetUniqueBufferAt(param0), + analysis.GetUniqueBufferAt(gte0)); + + // Verify the locations of an aliased buffer. 
+  EXPECT_THAT(
+      analysis.GetUniqueBufferAt(param0).locations(),
+      UnorderedElementsAre(HloLocation{param0, {}}, HloLocation{tuple, {0}},
+                           HloLocation{gte0, {}}));
+
+  EXPECT_FALSE(analysis.GetInstructionBufferSet(tuple).IsAmbiguous());
+  EXPECT_TRUE(analysis.GetInstructionBufferSet(tuple).IsDistinct());
+}
+
+TEST_F(HloAliasAnalysisTest, NondistinctTuple) {
+  // Test an expression with a non-distinct buffer set.
+  auto builder = HloComputation::Builder(TestName());
+  auto param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape_, "param0"));
+  auto param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, scalar_shape_, "param1"));
+  // param0 is included twice in the tuple.
+  auto tuple = builder.AddInstruction(
+      HloInstruction::CreateTuple({param0, param1, param0}));
+  module_.AddEntryComputation(builder.Build());
+
+  const HloAliasAnalysis& analysis = RunAnalysis();
+
+  EXPECT_THAT(
+      analysis.GetUniqueBufferAt(param0).locations(),
+      UnorderedElementsAre(HloLocation{param0, {}}, HloLocation{tuple, {0}},
+                           HloLocation{tuple, {2}}));
+
+  EXPECT_FALSE(analysis.GetInstructionBufferSet(tuple).IsAmbiguous());
+  EXPECT_FALSE(analysis.GetInstructionBufferSet(tuple).IsDistinct());
+}
+
+TEST_F(HloAliasAnalysisTest, SingleCall) {
+  // Test a single call of a subcomputation. The subcomputation adds its two
+  // array-shaped parameters.
+  auto subbuilder = HloComputation::Builder("Subcomputation");
+  auto subparam0 = subbuilder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape_, "param0"));
+  auto subparam1 = subbuilder.AddInstruction(
+      HloInstruction::CreateParameter(1, scalar_shape_, "param1"));
+  auto add = subbuilder.AddInstruction(HloInstruction::CreateBinary(
+      scalar_shape_, HloOpcode::kAdd, subparam0, subparam1));
+  HloComputation* called_computation =
+      module_.AddEmbeddedComputation(subbuilder.Build());
+
+  auto builder = HloComputation::Builder(TestName());
+  auto constant1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+  auto constant2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+  auto call = builder.AddInstruction(HloInstruction::CreateCall(
+      scalar_shape_, {constant1, constant2}, called_computation));
+  module_.AddEntryComputation(builder.Build());
+
+  const HloAliasAnalysis& analysis = RunAnalysis();
+
+  // Verify aliasing of the kCall operands and the subcomputation parameters.
+  EXPECT_THAT(analysis.GetUniqueBufferAt(constant1).locations(),
+              UnorderedElementsAre(HloLocation{constant1, {}},
+                                   HloLocation{subparam0, {}}));
+  EXPECT_THAT(analysis.GetUniqueBufferAt(constant2).locations(),
+              UnorderedElementsAre(HloLocation{constant2, {}},
+                                   HloLocation{subparam1, {}}));
+
+  // The subcomputation root and the kCall itself should alias.
+  EXPECT_THAT(
+      analysis.GetUniqueBufferAt(add).locations(),
+      UnorderedElementsAre(HloLocation{add, {}}, HloLocation{call, {}}));
+}
+
+TEST_F(HloAliasAnalysisTest, ComputationCalledTwice) {
+  // Test a subcomputation which is called twice with different argument
+  // values.
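+  //
+  // HLO (sketch):
+  //
+  // subcomputation((F32[] %param0, F32[] %param1)):
+  //   return Add(%param0, %param1)
+  //
+  // entry:
+  //   %call1 = Call({%constant1, %constant2}, subcomputation)
+  //   return Call({%call1, %constant2}, subcomputation)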
+  auto subbuilder = HloComputation::Builder("Subcomputation");
+  auto subparam0 = subbuilder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape_, "param0"));
+  auto subparam1 = subbuilder.AddInstruction(
+      HloInstruction::CreateParameter(1, scalar_shape_, "param1"));
+  auto add = subbuilder.AddInstruction(HloInstruction::CreateBinary(
+      scalar_shape_, HloOpcode::kAdd, subparam0, subparam1));
+  HloComputation* called_computation =
+      module_.AddEmbeddedComputation(subbuilder.Build());
+
+  auto builder = HloComputation::Builder(TestName());
+  auto constant1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+  auto constant2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+  auto call1 = builder.AddInstruction(HloInstruction::CreateCall(
+      scalar_shape_, {constant1, constant2}, called_computation));
+  auto call2 = builder.AddInstruction(HloInstruction::CreateCall(
+      scalar_shape_, {call1, constant2}, called_computation));
+  module_.AddEntryComputation(builder.Build());
+
+  const HloAliasAnalysis& analysis = RunAnalysis();
+
+  EXPECT_THAT(analysis.GetUniqueBufferAt(constant1).locations(),
+              UnorderedElementsAre(HloLocation{constant1, {}},
+                                   HloLocation{subparam0, {}}));
+  EXPECT_THAT(analysis.GetUniqueBufferAt(constant2).locations(),
+              UnorderedElementsAre(HloLocation{constant2, {}},
+                                   HloLocation{subparam1, {}}));
+
+  // The 'add' (root of the subcomputation) aliases the two call instructions
+  // and the first parameter of the subcomputation, because 'call1' is passed
+  // as an argument to the subcomputation in 'call2'.
+  EXPECT_THAT(
+      analysis.GetUniqueBufferAt(add).locations(),
+      UnorderedElementsAre(HloLocation{add, {}}, HloLocation{call1, {}},
+                           HloLocation{subparam0, {}}, HloLocation{call2, {}}));
+
+  EXPECT_THAT(GetBuffersAt(subparam0),
+              UnorderedElementsAre(analysis.GetUniqueBufferAt(constant1),
+                                   analysis.GetUniqueBufferAt(add)));
+  EXPECT_THAT(GetBuffersAt(subparam1),
+              UnorderedElementsAre(analysis.GetUniqueBufferAt(constant2)));
+
+  EXPECT_TRUE(analysis.GetInstructionBufferSet(subparam0).IsAmbiguous());
+  EXPECT_FALSE(analysis.GetInstructionBufferSet(subparam1).IsAmbiguous());
+  EXPECT_TRUE(analysis.GetInstructionBufferSet(subparam0).IsDistinct());
+  EXPECT_TRUE(analysis.GetInstructionBufferSet(subparam1).IsDistinct());
+}
+
+TEST_F(HloAliasAnalysisTest, SingleWhile) {
+  // Test a simple single while instruction. The while body includes a
+  // pass-through value. HLO:
+  //
+  // body((F32[], F32[]) %tuple_param):
+  //   %add = Add(%tuple_param{0}, %tuple_param{1})
+  //   return Tuple(%tuple_param{0}, %add)
+  //
+  // condition((F32[], F32[]) %tuple_param):
+  //   return Constant(false)
+  //
+  // entry:
+  //   %constant1 = Constant(1.0)
+  //   %constant2 = Constant(2.0)
+  //   %tuple = Tuple(%constant1, %constant2)
+  //   return While(%tuple, body, condition)
+  //
+  const Shape tuple_shape =
+      ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
+
+  // Element 0 passes transparently through the body.
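+  //
+  // Alias analysis should group %constant1, %tuple{0}, %while{0},
+  // %body_param{0}, %body_element_0 and %cond_param{0} into one HloBuffer, and
+  // the corresponding element-1 values (including %add) into another, as the
+  // EXPECTs below verify.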
+ auto body_builder = HloComputation::Builder("body"); + auto body_param = body_builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape, "param")); + auto body_element_0 = body_builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, body_param, 0)); + auto body_element_1 = body_builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, body_param, 1)); + auto add = body_builder.AddInstruction(HloInstruction::CreateBinary( + scalar_shape_, HloOpcode::kAdd, body_element_0, body_element_1)); + auto body_tuple = body_builder.AddInstruction( + HloInstruction::CreateTuple({body_element_0, add})); + HloComputation* body = module_.AddEmbeddedComputation(body_builder.Build()); + + // Condition computation trivially returns a constant "false". + auto cond_builder = HloComputation::Builder("condition"); + auto cond_param = cond_builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape, "param")); + cond_builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); + HloComputation* condition = + module_.AddEmbeddedComputation(cond_builder.Build()); + + auto builder = HloComputation::Builder(TestName()); + auto constant1 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); + auto constant2 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); + auto tuple = builder.AddInstruction( + HloInstruction::CreateTuple({constant1, constant2})); + auto xla_while = builder.AddInstruction( + HloInstruction::CreateWhile(tuple_shape, condition, body, tuple)); + module_.AddEntryComputation(builder.Build()); + + const HloAliasAnalysis& analysis = RunAnalysis(); + + // Verify the locations of the aliased while buffers. + EXPECT_THAT(analysis.GetUniqueBufferAt(xla_while, /*index=*/{}).locations(), + UnorderedElementsAre( + HloLocation{tuple, {}}, HloLocation{xla_while, {}}, + HloLocation{body_param, {}}, HloLocation{body_tuple, {}}, + HloLocation{cond_param, {}})); + EXPECT_THAT(analysis.GetUniqueBufferAt(xla_while, /*index=*/{0}).locations(), + UnorderedElementsAre( + HloLocation{constant1, {}}, HloLocation{tuple, {0}}, + HloLocation{xla_while, {0}}, HloLocation{body_param, {0}}, + HloLocation{body_element_0, {}}, HloLocation{body_tuple, {0}}, + HloLocation{cond_param, {0}})); + EXPECT_THAT(analysis.GetUniqueBufferAt(xla_while, /*index=*/{1}).locations(), + UnorderedElementsAre( + HloLocation{constant2, {}}, HloLocation{tuple, {1}}, + HloLocation{xla_while, {1}}, HloLocation{body_param, {1}}, + HloLocation{body_element_1, {}}, HloLocation{add, {}}, + HloLocation{body_tuple, {1}}, HloLocation{cond_param, {1}})); + + EXPECT_THAT( + GetValuesInBuffer(analysis.GetUniqueBufferAt(xla_while, /*index=*/{0})), + UnorderedElementsAre(GetValueDefinedAt(constant1))); + EXPECT_THAT( + GetValuesInBuffer(analysis.GetUniqueBufferAt(xla_while, /*index=*/{1})), + UnorderedElementsAre(GetValueDefinedAt(constant2), + GetValueDefinedAt(xla_while, /*index=*/{1}), + GetValueDefinedAt(body_param, {1}), + GetValueDefinedAt(cond_param, {1}), + GetValueDefinedAt(add))); +} + +TEST_F(HloAliasAnalysisTest, SequentialWhiles) { + // Test sequential while instructions. The while body includes a + // pass-through value. 
HLO: + // + // body((F32[], F32[]) %tuple_param): + // %add = Add(%tuple_param{0}, %tuple_param{1}) + // return Tuple(%tuple_param{0}, %add) + // + // condition((F32[], F32[]) %tuple_param): + // return Constant(false) + // + // entry: + // %constant1 = Constant(1.0) + // %constant2 = Constant(2.0) + // %tuple = Tuple(%constant1, %constant2) + // %while0 = While(%tuple, body, condition) + // %while1 = While(%while0, body, condition) + // return While(%while1, body, condition) + // + const Shape tuple_shape = + ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_}); + + // Element 0 passes transparently through the body. + auto body_builder = HloComputation::Builder("body"); + auto body_param = body_builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape, "param")); + auto body_element_0 = body_builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, body_param, 0)); + auto body_element_1 = body_builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, body_param, 1)); + auto add = body_builder.AddInstruction(HloInstruction::CreateBinary( + scalar_shape_, HloOpcode::kAdd, body_element_0, body_element_1)); + body_builder.AddInstruction( + HloInstruction::CreateTuple({body_element_0, add})); + HloComputation* body = module_.AddEmbeddedComputation(body_builder.Build()); + + auto cond_builder = HloComputation::Builder("condition"); + cond_builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape, "param")); + cond_builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); + HloComputation* condition = + module_.AddEmbeddedComputation(cond_builder.Build()); + + auto builder = HloComputation::Builder(TestName()); + auto constant1 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); + auto constant2 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); + auto tuple = builder.AddInstruction( + HloInstruction::CreateTuple({constant1, constant2})); + auto xla_while0 = builder.AddInstruction( + HloInstruction::CreateWhile(tuple_shape, condition, body, tuple)); + auto xla_while1 = builder.AddInstruction( + HloInstruction::CreateWhile(tuple_shape, condition, body, xla_while0)); + auto xla_while2 = builder.AddInstruction( + HloInstruction::CreateWhile(tuple_shape, condition, body, xla_while1)); + module_.AddEntryComputation(builder.Build()); + + const HloAliasAnalysis& analysis = RunAnalysis(); + + EXPECT_EQ(analysis.GetUniqueBufferAt(tuple, /*index=*/{}), + analysis.GetUniqueBufferAt(xla_while2, /*index=*/{})); + EXPECT_EQ(analysis.GetUniqueBufferAt(constant1), + analysis.GetUniqueBufferAt(xla_while2, /*index=*/{0})); + EXPECT_EQ(analysis.GetUniqueBufferAt(constant2), + analysis.GetUniqueBufferAt(xla_while2, /*index=*/{1})); +} + +TEST_F(HloAliasAnalysisTest, NestedWhiles) { + // Test nested while instructions. The inner body passes through element 0 of + // its parameter, and the outer body passes through element 1. 
HLO: + // + // inner_body((F32[], F32[]) %tuple_param): + // %add = Add(%tuple_param{0}, %tuple_param{1}) + // return Tuple(%tuple_param{0}, %add) + // + // outer_body((F32[], F32[]) %tuple_param): + // %negate = Negate(%tuple_param{0}) + // %tuple = Tuple(%negate, %tuple_param{1}) + // return While(%tuple, inner_body, condition) + // + // entry: + // %constant1 = Constant(1.0) + // %constant2 = Constant(2.0) + // %tuple = Tuple(%constant1, %constant2) + // return While(%tuple, outer_body, condition) + // + const Shape tuple_shape = + ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_}); + + auto cond_builder = HloComputation::Builder("condition"); + cond_builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape, "param")); + cond_builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); + HloComputation* condition = + module_.AddEmbeddedComputation(cond_builder.Build()); + + // Element 0 passes transparently through the body. + auto inner_builder = HloComputation::Builder("inner_body"); + auto inner_param = inner_builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape, "param")); + auto inner_element_0 = inner_builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, inner_param, 0)); + auto inner_element_1 = inner_builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, inner_param, 1)); + auto add = inner_builder.AddInstruction(HloInstruction::CreateBinary( + scalar_shape_, HloOpcode::kAdd, inner_element_0, inner_element_1)); + inner_builder.AddInstruction( + HloInstruction::CreateTuple({inner_element_0, add})); + HloComputation* inner_body = + module_.AddEmbeddedComputation(inner_builder.Build()); + + // Element 1 passes transparently through the body. 
+  auto outer_builder = HloComputation::Builder("outer_body");
+  auto outer_param = outer_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "param"));
+  auto outer_element_0 = outer_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, outer_param, 0));
+  auto negate = outer_builder.AddInstruction(HloInstruction::CreateUnary(
+      scalar_shape_, HloOpcode::kNegate, outer_element_0));
+  auto outer_element_1 = outer_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, outer_param, 1));
+  auto outer_tuple = outer_builder.AddInstruction(
+      HloInstruction::CreateTuple({negate, outer_element_1}));
+  auto nested_while = outer_builder.AddInstruction(HloInstruction::CreateWhile(
+      tuple_shape, condition, inner_body, outer_tuple));
+  HloComputation* outer_body =
+      module_.AddEmbeddedComputation(outer_builder.Build());
+
+  auto builder = HloComputation::Builder(TestName());
+  auto constant1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+  auto constant2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+  auto tuple = builder.AddInstruction(
+      HloInstruction::CreateTuple({constant1, constant2}));
+  auto entry_while = builder.AddInstruction(
+      HloInstruction::CreateWhile(tuple_shape, condition, outer_body, tuple));
+  module_.AddEntryComputation(builder.Build());
+
+  const HloAliasAnalysis& analysis = RunAnalysis();
+
+  EXPECT_EQ(analysis.GetUniqueBufferAt(constant1),
+            analysis.GetUniqueBufferAt(entry_while, /*index=*/{0}));
+  EXPECT_EQ(analysis.GetUniqueBufferAt(constant1),
+            analysis.GetUniqueBufferAt(nested_while, /*index=*/{0}));
+  EXPECT_EQ(analysis.GetUniqueBufferAt(constant1),
+            analysis.GetUniqueBufferAt(inner_element_0));
+
+  EXPECT_EQ(analysis.GetUniqueBufferAt(constant2),
+            analysis.GetUniqueBufferAt(entry_while, /*index=*/{1}));
+  EXPECT_EQ(analysis.GetUniqueBufferAt(constant2),
+            analysis.GetUniqueBufferAt(nested_while, /*index=*/{1}));
+  EXPECT_EQ(analysis.GetUniqueBufferAt(constant2),
+            analysis.GetUniqueBufferAt(inner_element_1));
+}
+
+TEST_F(HloAliasAnalysisTest, SwizzlingWhile) {
+  // Test a while instruction with a body which permutes its tuple parameter
+  // elements. HLO:
+  //
+  // body((F32[], F32[], F32[]) %tuple_param):
+  //   return Tuple(%tuple_param{1}, %tuple_param{2}, %tuple_param{0})
+  //
+  // condition((F32[], F32[], F32[]) %tuple_param):
+  //   return Constant(false)
+  //
+  // entry:
+  //   %constant1 = Constant(1.0)
+  //   %constant2 = Constant(2.0)
+  //   %constant3 = Constant(3.0)
+  //   %tuple = Tuple(%constant1, %constant2, %constant3)
+  //   return While(%tuple, body, condition)
+  //
+  const Shape tuple_shape =
+      ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_, scalar_shape_});
+
+  auto body_builder = HloComputation::Builder("body");
+  auto body_param = body_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "param"));
+  auto body_element_0 = body_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, body_param, 0));
+  auto body_element_1 = body_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, body_param, 1));
+  auto body_element_2 = body_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, body_param, 2));
+  body_builder.AddInstruction(HloInstruction::CreateTuple(
+      {body_element_1, body_element_2, body_element_0}));
+  HloComputation* body = module_.AddEmbeddedComputation(body_builder.Build());
+
+  auto cond_builder = HloComputation::Builder("condition");
+  cond_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "param"));
+  auto cond_constant = cond_builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+  HloComputation* condition =
+      module_.AddEmbeddedComputation(cond_builder.Build());
+
+  auto builder = HloComputation::Builder(TestName());
+  auto constant1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+  auto constant2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+  auto constant3 = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(3.0)));
+  auto tuple = builder.AddInstruction(
+      HloInstruction::CreateTuple({constant1, constant2, constant3}));
+  auto xla_while = builder.AddInstruction(
+      HloInstruction::CreateWhile(tuple_shape, condition, body, tuple));
+  module_.AddEntryComputation(builder.Build());
+
+  const HloAliasAnalysis& analysis = RunAnalysis();
+
+  // The swizzling while makes most locations in the module alias, leaving
+  // only three HloBuffers.
+  EXPECT_THAT(
+      analysis.buffers(),
+      UnorderedElementsAre(&analysis.GetUniqueBufferAt(constant1),
+                           &analysis.GetUniqueBufferAt(tuple, /*index=*/{}),
+                           &analysis.GetUniqueBufferAt(cond_constant)));
+
+  // The tuple elements of the while and the three constant inputs should all
+  // be merged into the same buffer.
+  EXPECT_EQ(analysis.GetUniqueBufferAt(xla_while, /*index=*/{0}),
+            analysis.GetUniqueBufferAt(xla_while, /*index=*/{1}));
+  EXPECT_EQ(analysis.GetUniqueBufferAt(xla_while, /*index=*/{0}),
+            analysis.GetUniqueBufferAt(xla_while, /*index=*/{2}));
+  EXPECT_EQ(analysis.GetUniqueBufferAt(xla_while, /*index=*/{0}),
+            analysis.GetUniqueBufferAt(constant1));
+  EXPECT_EQ(analysis.GetUniqueBufferAt(constant1),
+            analysis.GetUniqueBufferAt(constant2));
+  EXPECT_EQ(analysis.GetUniqueBufferAt(constant1),
+            analysis.GetUniqueBufferAt(constant3));
+}
+
+TEST_F(HloAliasAnalysisTest, TupleSelect) {
+  // Test a kSelect of a tuple value. Non-top-level elements flow through the
+  // instruction.
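+  //
+  // HLO (sketch):
+  //
+  // entry:
+  //   %tuple1 = Tuple(%constant1) ... %tuple4 = Tuple(%constant4)
+  //   %select12 = Select(%pred, %tuple1, %tuple2)
+  //   %select34 = Select(%pred, %tuple3, %tuple4)
+  //   return Select(%pred, %select12, %select34)
+  //
+  // The buffer set at index {0} of each select is the union of the {0} buffer
+  // sets of its operands.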
+ auto builder = HloComputation::Builder(TestName()); + auto pred = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); + auto constant1 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); + auto constant2 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); + auto constant3 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(3.0))); + auto constant4 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(4.0))); + auto tuple1 = + builder.AddInstruction(HloInstruction::CreateTuple({constant1})); + auto tuple2 = + builder.AddInstruction(HloInstruction::CreateTuple({constant2})); + auto tuple3 = + builder.AddInstruction(HloInstruction::CreateTuple({constant3})); + auto tuple4 = + builder.AddInstruction(HloInstruction::CreateTuple({constant4})); + const Shape tuple_shape = tuple1->shape(); + auto select11 = builder.AddInstruction(HloInstruction::CreateTernary( + tuple_shape, HloOpcode::kSelect, pred, tuple1, tuple1)); + auto select12 = builder.AddInstruction(HloInstruction::CreateTernary( + tuple_shape, HloOpcode::kSelect, pred, tuple1, tuple2)); + auto select34 = builder.AddInstruction(HloInstruction::CreateTernary( + tuple_shape, HloOpcode::kSelect, pred, tuple3, tuple4)); + auto select1234 = builder.AddInstruction(HloInstruction::CreateTernary( + tuple_shape, HloOpcode::kSelect, pred, select12, select34)); + + module_.AddEntryComputation(builder.Build()); + + const HloAliasAnalysis& analysis = RunAnalysis(); + + // Verify the buffer sets of each select. + EXPECT_THAT(analysis.GetBufferSet(select11, /*index=*/{0}).buffer_ids(), + UnorderedElementsAre(analysis.GetUniqueBufferAt(constant1).id())); + EXPECT_THAT(analysis.GetBufferSet(select12, /*index=*/{0}).buffer_ids(), + UnorderedElementsAre(analysis.GetUniqueBufferAt(constant1).id(), + analysis.GetUniqueBufferAt(constant2).id())); + EXPECT_THAT(analysis.GetBufferSet(select34, /*index=*/{0}).buffer_ids(), + UnorderedElementsAre(analysis.GetUniqueBufferAt(constant3).id(), + analysis.GetUniqueBufferAt(constant4).id())); + EXPECT_THAT(analysis.GetBufferSet(select1234, /*index=*/{0}).buffer_ids(), + UnorderedElementsAre(analysis.GetUniqueBufferAt(constant1).id(), + analysis.GetUniqueBufferAt(constant2).id(), + analysis.GetUniqueBufferAt(constant3).id(), + analysis.GetUniqueBufferAt(constant4).id())); + + EXPECT_FALSE(analysis.GetInstructionBufferSet(select11).IsAmbiguous()); + EXPECT_TRUE(analysis.GetInstructionBufferSet(select12).IsAmbiguous()); + EXPECT_TRUE(analysis.GetInstructionBufferSet(select34).IsAmbiguous()); + EXPECT_TRUE(analysis.GetInstructionBufferSet(select1234).IsAmbiguous()); + + EXPECT_TRUE(analysis.GetInstructionBufferSet(select11).IsDistinct()); + EXPECT_TRUE(analysis.GetInstructionBufferSet(select12).IsDistinct()); + EXPECT_TRUE(analysis.GetInstructionBufferSet(select34).IsDistinct()); + EXPECT_TRUE(analysis.GetInstructionBufferSet(select1234).IsDistinct()); +} + +TEST_F(HloAliasAnalysisTest, TupleSelectToWhile) { + // Test a tuple-shaped kSelect feeding a kWhile instruction. 
HLO: + // + // body((F32[], F32[]) %tuple_param): + // %negate = Negate(%tuple_param{0}) + // return Tuple(%negate) + // + // condition((F32[], F32[]) %tuple_param): + // return Constant(false) + // + // entry: + // %constant1 = Constant(1.0) + // %constant2 = Constant(2.0) + // %tuple1 = Tuple(%constant1) + // %tuple2 = Tuple(%constant2) + // %select = Select(%tuple1, %tuple2) + // return While(%select, body, condition) + // + auto builder = HloComputation::Builder(TestName()); + + const Shape tuple_shape = ShapeUtil::MakeTupleShape({scalar_shape_}); + + // Element 0 passes transparently through the body. + auto body_builder = HloComputation::Builder("body"); + auto body_param = body_builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape, "param")); + auto body_element = body_builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, body_param, 0)); + auto negate = body_builder.AddInstruction(HloInstruction::CreateUnary( + scalar_shape_, HloOpcode::kNegate, body_element)); + body_builder.AddInstruction(HloInstruction::CreateTuple({negate})); + HloComputation* body = module_.AddEmbeddedComputation(body_builder.Build()); + + auto cond_builder = HloComputation::Builder("condition"); + auto cond_param = cond_builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape, "param")); + cond_builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); + HloComputation* condition = + module_.AddEmbeddedComputation(cond_builder.Build()); + + auto pred = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); + auto constant1 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); + auto constant2 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); + auto tuple1 = + builder.AddInstruction(HloInstruction::CreateTuple({constant1})); + auto tuple2 = + builder.AddInstruction(HloInstruction::CreateTuple({constant2})); + auto select = builder.AddInstruction(HloInstruction::CreateTernary( + tuple_shape, HloOpcode::kSelect, pred, tuple1, tuple2)); + auto xla_while = builder.AddInstruction( + HloInstruction::CreateWhile(tuple_shape, condition, body, select)); + + module_.AddEntryComputation(builder.Build()); + + const HloAliasAnalysis& analysis = RunAnalysis(); + + // The while should flatten the ambiguous select buffer set so that the buffer + // set contents (constant1 and constant2) becomes a single buffer. + EXPECT_EQ(analysis.GetUniqueBufferAt(constant1), + analysis.GetUniqueBufferAt(constant2)); + EXPECT_EQ(analysis.GetUniqueBufferAt(constant1), + analysis.GetUniqueBufferAt(xla_while, /*index=*/{0})); + + EXPECT_THAT(GetValuesInBuffer(analysis.GetUniqueBufferAt(constant1)), + UnorderedElementsAre(GetValueDefinedAt(constant1), + GetValueDefinedAt(constant2), + GetValueDefinedAt(xla_while, /*index=*/{0}), + GetValueDefinedAt(body_param, /*index=*/{0}), + GetValueDefinedAt(cond_param, /*index=*/{0}), + GetValueDefinedAt(negate))); + EXPECT_FALSE(analysis.GetInstructionBufferSet(select).IsAmbiguous()); + EXPECT_FALSE(analysis.GetInstructionBufferSet(xla_while).IsAmbiguous()); + + EXPECT_TRUE(analysis.GetInstructionBufferSet(select).IsDistinct()); + EXPECT_TRUE(analysis.GetInstructionBufferSet(xla_while).IsDistinct()); +} + +TEST_F(HloAliasAnalysisTest, Bitcast) { + // Bitcasting a value should not produce a new buffer. 
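+  // This relies on HloAliasAnalysis::Run constructing its dataflow analysis
+  // with bitcast_defines_value=false, so the bitcast below reuses the value
+  // (and hence the buffer) of its operand.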
+ auto builder = HloComputation::Builder(TestName()); + auto constant = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); + auto bitcast = builder.AddInstruction(HloInstruction::CreateUnary( + scalar_shape_, HloOpcode::kBitcast, constant)); + + module_.AddEntryComputation(builder.Build()); + + const HloAliasAnalysis& analysis = RunAnalysis(); + + EXPECT_EQ(analysis.buffers().size(), 1); + + EXPECT_EQ(analysis.GetUniqueBufferAt(constant), + analysis.GetUniqueBufferAt(bitcast)); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc index c55f489494e..ff76cc7bf67 100644 --- a/tensorflow/compiler/xla/service/hlo_computation.cc +++ b/tensorflow/compiler/xla/service/hlo_computation.cc @@ -35,10 +35,14 @@ limitations under the License. #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/gtl/flatset.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" namespace xla { +using ::tensorflow::strings::StrCat; + std::unique_ptr HloComputation::Builder::Build( HloInstruction* root_instruction) { int parameter_count = 0; @@ -52,16 +56,17 @@ std::unique_ptr HloComputation::Builder::Build( root_instruction ? root_instruction : last_added_instruction_; CHECK_NE(nullptr, root); - return WrapUnique( - new HloComputation(name_, parameter_count, &instructions_, root)); + return WrapUnique(new HloComputation(name_, parameter_count, &instructions_, + root, is_fusion_computation_)); } HloComputation::HloComputation( const string& name, int parameter_count, std::vector>* instructions, - HloInstruction* root_instruction) + HloInstruction* root_instruction, bool is_fusion_computation) : name_(name), root_instruction_(root_instruction), + is_fusion_computation_(is_fusion_computation), instruction_name_uniquer_(/*separator=*/".") { param_instructions_.resize(parameter_count, nullptr); bool root_found = false; @@ -90,19 +95,85 @@ HloInstruction* HloComputation::AddInstruction( HloInstruction* HloComputation::AddInstructionInternal( std::unique_ptr instruction) { // Generate a unique name for the instruction. 
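
The replacement below routes naming through the instruction itself, but the underlying mechanism is still `NameUniquer`. A hedged sketch of the behavior being relied on here, inferred only from its use in this file (the exact collision-suffix format is an assumption):

```c++
// NameUniquer hands back the base name on first request and a
// separator-disambiguated variant on collisions.
NameUniquer uniquer(/*separator=*/".");
std::string first = uniquer.GetUniqueName("add");   // "add"
std::string second = uniquer.GetUniqueName("add");  // e.g. "add.1"
```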
-  instruction->set_name(
-      instruction_name_uniquer_.GetUniqueName(instruction->name()));
-  instruction->set_parent(this);
+  instruction->UniquifyName(&instruction_name_uniquer_);
+  Reparent(instruction.get());
   HloInstruction* pinst = instruction.get();
   instruction_iterators_[pinst] =
       instructions_.insert(instructions_.end(), std::move(instruction));
   return pinst;
 }
 
-/* static */ bool HloComputation::IsRemovable(const HloOpcode& opcode) {
-  return !(opcode == HloOpcode::kParameter || opcode == HloOpcode::kRecv ||
-           opcode == HloOpcode::kSend || opcode == HloOpcode::kTrace ||
-           opcode == HloOpcode::kOutfeed);
+HloInstruction* HloComputation::AddParameter(
+    std::unique_ptr<HloInstruction> instruction) {
+  CHECK(instruction->opcode() == HloOpcode::kParameter);
+  CHECK(is_fusion_computation_);
+  CHECK(root_instruction_->fusion_instruction() != nullptr);
+  instruction->SetParentFusion(root_instruction_->fusion_instruction());
+  CHECK(root_instruction_->fusion_instruction()->operand_count() ==
+        param_instructions_.size());
+  instruction->set_parent(this);
+  param_instructions_.push_back(instruction.get());
+  AddInstructionInternal(std::move(instruction));
+  return instructions_.back().get();
+}
+
+Status HloComputation::RemoveParameter(int64 param_no) {
+  CHECK_GE(param_no, 0);
+  CHECK_LT(param_no, param_instructions_.size());
+  CHECK(is_fusion_computation_);
+  CHECK(root_instruction_->fusion_instruction() != nullptr);
+  HloInstruction* param_instruction = param_instructions_[param_no];
+  auto param_instruction_iterator = param_instructions_.begin() + param_no;
+  param_instructions_.erase(param_instruction_iterator);
+  // Throw the removed fused parameter instruction away.
+  TF_RETURN_IF_ERROR(RemoveInstruction(param_instruction));
+
+  while (param_no < param_instructions_.size()) {
+    param_instruction = param_instructions_[param_no];
+    string param_name = param_instruction->parameter_name();
+    // Fusion parameters are named foo.param_1, bar.param_2, etc. We are
+    // renumbering the parameters, so replace the final number in the name
+    // with the updated value.
+    const string param_underscore = ".param_";
+    size_t index = param_name.rfind(param_underscore);
+    if (index != string::npos) {
+      string after_param = param_name.substr(index + param_underscore.size());
+      int64 numeric_suffix;
+      if (tensorflow::strings::safe_strto64(after_param, &numeric_suffix)) {
+        param_name =
+            StrCat(param_name.substr(0, index), param_underscore, param_no);
+      }
+    }
+
+    HloInstruction* new_instr =
+        AddInstructionInternal(HloInstruction::CreateParameter(
+            param_no, param_instruction->shape(), param_name));
+    TF_RETURN_IF_ERROR(param_instruction->ReplaceAllUsesWith(new_instr));
+    new_instr->SetParentFusion(root_instruction_->fusion_instruction());
+    param_instructions_[param_no] = new_instr;
+    TF_RETURN_IF_ERROR(RemoveInstruction(param_instruction));
+    param_no++;
+  }
+
+  return Status::OK();
+}
+
+void HloComputation::Reparent(HloInstruction* instruction) {
+  instruction->set_parent(this);
+}
+
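The string surgery in `RemoveParameter` is compact enough to miss. Here is a self-contained sketch of just the renaming convention (an illustrative stand-alone function, not part of the patch):

```c++
#include <cstdio>
#include <string>

// Fusion parameters are named "<base>.param_<N>"; on renumbering, only the
// trailing number changes. Names that do not follow the convention are
// returned unchanged.
std::string RenumberParamName(const std::string& param_name, int param_no) {
  const std::string param_underscore = ".param_";
  const size_t index = param_name.rfind(param_underscore);
  if (index == std::string::npos) {
    return param_name;
  }
  return param_name.substr(0, index) + param_underscore +
         std::to_string(param_no);
}

int main() {
  // Prints "fusion.param_1": the old suffix 3 is replaced by the new index.
  std::printf("%s\n", RenumberParamName("fusion.param_3", 1).c_str());
  return 0;
}
```

+bool HloComputation::IsRemovable(const HloInstruction* instruction) {
+  // If the instruction has control predecessors or successors then we cannot
+  // remove the instruction without violating ordering constraints (added, for
+  // example, to avert interference due to buffer aliasing).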
+  if (!instruction->control_predecessors().empty() ||
+      !instruction->control_successors().empty()) {
+    return false;
+  }
+  const HloOpcode opcode = instruction->opcode();
+  return !((opcode == HloOpcode::kParameter && !is_fusion_computation_) ||
+           opcode == HloOpcode::kRecv || opcode == HloOpcode::kSend ||
+           opcode == HloOpcode::kTrace || opcode == HloOpcode::kOutfeed);
 }
 
 Status HloComputation::RemoveInstructionAndUnusedOperands(
@@ -110,51 +181,49 @@ Status HloComputation::RemoveInstructionAndUnusedOperands(
   TF_RET_CHECK(root_instruction() != instruction);
   TF_RET_CHECK(instruction->user_count() == 0);
-  TF_RET_CHECK(HloComputation::IsRemovable(instruction->opcode()));
-  std::queue<HloInstruction*> remove;
-  remove.push(instruction);
-  while (!remove.empty()) {
-    HloInstruction* item = remove.front();
-    remove.pop();
-    if (item->user_count() != 0 || item == root_instruction_ ||
-        !HloComputation::IsRemovable(item->opcode())) {
+  TF_RET_CHECK(IsRemovable(instruction));
+  std::unordered_set<HloInstruction*> removed;
+  std::queue<HloInstruction*> worklist;
+  worklist.push(instruction);
+  while (!worklist.empty()) {
+    HloInstruction* item = worklist.front();
+    worklist.pop();
+
+    if (removed.count(item) != 0 || item->user_count() != 0 ||
+        item == root_instruction() || !IsRemovable(item)) {
       continue;
     }
     for (int i = 0; i < item->operand_count(); ++i) {
-      remove.push(item->mutable_operand(i));
+      worklist.push(item->mutable_operand(i));
     }
-    // If an instruction has the same operand more than once, we must not remove
-    // it again.
     TF_RETURN_IF_ERROR(RemoveInstruction(item));
+    removed.insert(item);
   }
   return Status::OK();
 }
 
-StatusOr<bool> HloComputation::RemoveInstructionIfFound(
-    HloInstruction* instruction) {
-  TF_RET_CHECK(IsRemovable(instruction->opcode()));
-  TF_RET_CHECK(root_instruction() != instruction)
-      << "cannot remove root instruction";
-  TF_RET_CHECK(instruction->user_count() == 0)
-      << "instruction with users cannot be removed";
-
-  if (instruction_iterators_.count(instruction) == 0) {
-    return false;
-  }
+Status HloComputation::RemoveInstruction(HloInstruction* instruction) {
   VLOG(2) << "Removing instruction " << instruction->name()
           << " from computation " << name();
+  TF_RET_CHECK(IsRemovable(instruction));
+  TF_RET_CHECK(root_instruction() != instruction)
+      << "cannot remove root instruction " << instruction->name();
+  TF_RET_CHECK(instruction->user_count() == 0)
+      << "instruction " << instruction->name()
+      << " has users and cannot be removed";
+  TF_RET_CHECK(instruction->control_predecessors().empty())
+      << "instruction " << instruction->name()
+      << " has control predecessors and cannot be removed";
+  TF_RET_CHECK(instruction->control_successors().empty())
+      << "instruction " << instruction->name()
+      << " has control successors and cannot be removed";
+
+  TF_RET_CHECK(instruction_iterators_.count(instruction) != 0);
   auto inst_it = instruction_iterators_.at(instruction);
   (*inst_it)->set_parent(nullptr);
   instruction->DetachFromOperands();
   instructions_.erase(inst_it);
-  return true;
-}
-
-Status HloComputation::RemoveInstruction(HloInstruction* instruction) {
-  TF_ASSIGN_OR_RETURN(bool removed, RemoveInstructionIfFound(instruction));
-  TF_RET_CHECK(removed) << instruction->ToString()
-                        << " is not a member of computation " << name();
   return Status::OK();
 }
 
@@ -234,14 +303,14 @@ void ComputeComputationPostOrder(
   }
 
   for (auto& instruction : computation->instructions()) {
-    for (auto& called_computation : instruction->MakeCalledComputationsSet()) {
+    for (HloComputation* called_computation :
+         instruction->called_computations()) {
      ComputeComputationPostOrder(called_computation, visited, post_order);
     }
   }
 
   visited->insert(computation);
   post_order->push_back(computation);
-  return;
 }
 
 }  // namespace
 
@@ -286,22 +355,41 @@ std::list<HloComputation*> HloComputation::MakeEmbeddedComputationsList()
   return post_order;
 }
 
-string HloComputation::ToString() const {
+string HloComputation::ToString(int nested_level) const {
   std::ostringstream s;
+  for (int i = 0; i < nested_level; i++) {
+    s << "  ";
+  }
   s << name() << " " << ShapeUtil::HumanString(ComputeProgramShape())
     << " { \n";
   for (const HloInstruction* instruction : MakeInstructionPostOrder()) {
+    for (int i = 0; i < nested_level; i++) {
+      s << "  ";
+    }
     s << "  " << instruction->ToString() << "\n";
     if (instruction->opcode() == HloOpcode::kFusion) {
-      for (const auto& fused_instruction : instruction->fused_instructions()) {
-        s << "    " << fused_instruction->ToString() << "\n";
-      }
+      s << instruction->fused_instructions_computation()->ToString(
+               nested_level + 1)
+        << "\n";
     }
   }
+  for (int i = 0; i < nested_level; i++) {
+    s << "  ";
+  }
   s << "}";
   return s.str();
 }
 
+HloComputationProto HloComputation::ToProto() const {
+  HloComputationProto proto;
+  proto.set_name(name_);
+  for (const HloInstruction* instruction : MakeInstructionPostOrder()) {
+    HloInstructionProto instruction_proto = instruction->ToProto();
+    proto.add_instructions()->Swap(&instruction_proto);
+  }
+  return proto;
+}
+
 void HloComputation::FuseInstructionsInto(
     tensorflow::gtl::ArraySlice<HloInstruction*> instructions_to_fuse,
     HloInstruction* fusion_instruction) {
@@ -390,15 +478,6 @@ StatusOr<HloInstruction*> HloComputation::DeepCopyInstruction(
   }
 }
 
-Status HloComputation::AddControlDependency(HloInstruction* predecessor,
-                                            HloInstruction* successor) {
-  TF_RET_CHECK(instruction_iterators_.count(predecessor) > 0);
-  TF_RET_CHECK(instruction_iterators_.count(successor) > 0);
-  successor->AddControlPredecessor(predecessor);
-  predecessor->AddControlSuccessor(successor);
-  return Status::OK();
-}
-
 ProgramShape HloComputation::ComputeProgramShape() const {
   ProgramShape program_shape;
 
@@ -419,7 +498,9 @@ bool HloComputation::operator==(const HloComputation& other) const {
     // If <a,b> are visited but not identical, the recursion should have
     // been aborted. So, if <a,b> are visited at this point, they must be
     // identical.
-    if (visited.count(std::make_pair(a, b)) > 0) return true;
+    if (visited.count(std::make_pair(a, b)) > 0) {
+      return true;
+    }
     visited.emplace(a, b);
     return a->Identical(
         *b, eq, [](const HloComputation* a, const HloComputation* b) {
@@ -442,6 +523,15 @@ Status HloComputation::ReplaceInstruction(HloInstruction* old_instruction,
                                   new_instruction->shape()));
   VLOG(10) << "transformed " << old_instruction->ToString() << " to "
            << new_instruction->ToString();
+  // Try to add metadata for HLO instructions that are created to replace
+  // existing HLO instructions (e.g. during optimizations). The assumption is
+  // that the old instruction and the new instruction would perform the same
+  // function, and that they would be correlated to the same TF op. This might
+  // not always be correct since HLO optimizations can cross TF op boundaries.
+  // Still, this seems better than nothing.
+  if (new_instruction->metadata().op_name().empty()) {
+    new_instruction->set_metadata(old_instruction->metadata());
+  }
   TF_RETURN_IF_ERROR(
       ReplaceUsesOfInstruction(old_instruction, new_instruction));
   return RemoveInstructionAndUnusedOperands(old_instruction);
@@ -510,21 +600,46 @@ HloComputation::ComputeTransitiveOperands() const {
   return result;
 }
 
-Status HloComputation::Accept(DfsHloVisitor* visitor) const {
-  // Visit all dead roots.
+std::vector<HloInstruction*> HloComputation::CollectUnreachableRoots() const {
+  std::vector<HloInstruction*> unreachable_roots;
   for (auto& instruction : instructions()) {
     if (instruction->user_count() == 0 &&
         instruction->control_successors().empty() &&
         instruction.get() != root_instruction()) {
-      // Call FinishVisit only at the end.
-      TF_RETURN_IF_ERROR(
-          instruction->Accept(visitor, /*call_finish_visit=*/false));
+      unreachable_roots.push_back(instruction.get());
     }
   }
-  // Visit root instruction last.
+  return unreachable_roots;
+}
+
+Status HloComputation::Accept(DfsHloVisitor* visitor) const {
+  // Visit unreachable roots. Beware that the visitor might delete the
+  // currently visited root, which would invalidate iterators if the
+  // unreachable roots weren't computed ahead of time.
+  for (HloInstruction* root : CollectUnreachableRoots()) {
+    // Call FinishVisit only at the end.
+    TF_RETURN_IF_ERROR(root->Accept(visitor, /*call_finish_visit=*/false));
+  }
+  // Visit the computation root instruction last.
   return root_instruction()->Accept(visitor, /*call_finish_visit=*/true);
 }
 
+Status HloComputation::AcceptWithOperandOrder(
+    DfsHloVisitor* visitor,
+    const HloInstruction::CompareFunction& operand_order) const {
+  // Visit unreachable roots. Beware that the visitor might delete the
+  // currently visited root, which would invalidate iterators if the
+  // unreachable roots weren't computed ahead of time.
+  for (HloInstruction* root : CollectUnreachableRoots()) {
+    TF_RETURN_IF_ERROR(
+        root->AcceptWithOperandOrder(visitor, operand_order,
+                                     /*call_finish_visit=*/false));
+  }
+  // Visit the computation root instruction last.
+  return root_instruction()->AcceptWithOperandOrder(
+      visitor, operand_order, /*call_finish_visit=*/true);
+}
+
 Status HloComputation::AcceptOrdered(
     DfsHloVisitor* visitor,
     const std::vector<const HloInstruction*>& order) const {
@@ -555,4 +670,44 @@ Status HloComputation::Accept(
   return this->Accept(&visitor);
 }
 
+std::unique_ptr<HloComputation> HloComputation::Clone(const string& suffix) {
+  VLOG(1) << "Cloning " << name() << " --> " << suffix << "\n";
+  auto postorder = MakeInstructionPostOrder();
+  std::unordered_map<HloInstruction*, HloInstruction*> clone_map;
+  std::vector<std::unique_ptr<HloInstruction>> instructions;
+  std::unique_ptr<HloInstruction> new_instr = nullptr;
+  for (auto instr : postorder) {
+    std::vector<HloInstruction*> new_operands;
+    for (auto operand : instr->operands()) {
+      HloInstruction* new_operand = FindOrDie(clone_map, operand);
+      CHECK(new_operand != nullptr);
+      new_operands.push_back(new_operand);
+    }
+
+    new_instr = instr->CloneWithNewOperands(instr->shape(), new_operands);
+    InsertOrDie(&clone_map, instr, new_instr.get());
+    instructions.push_back(std::move(new_instr));
+  }
+  Builder builder(name() + suffix);
+  for (auto& instr : instructions) {
+    builder.AddInstruction(std::move(instr));
+  }
+  auto result = builder.Build(
+      /*root_instruction=*/FindOrDie(clone_map, root_instruction()));
+
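Control edges are copied in a second pass because, while walking in post order, an edge can point at a node that has not been cloned yet; only once the map is fully populated is every endpoint available. A minimal generic sketch of the same two-pass shape (illustrative only, not patch code):

```c++
#include <cstddef>
#include <unordered_map>
#include <vector>

struct Node {
  std::vector<Node*> successors;
};

// Pass one: create a clone per node and record the mapping.
// Pass two: re-draw edges through the map, now that every target exists.
std::vector<Node> CloneGraph(const std::vector<Node*>& originals) {
  std::vector<Node> clones(originals.size());
  std::unordered_map<const Node*, Node*> clone_map;
  for (size_t i = 0; i < originals.size(); ++i) {
    clone_map[originals[i]] = &clones[i];
  }
  for (size_t i = 0; i < originals.size(); ++i) {
    for (Node* successor : originals[i]->successors) {
      clone_map[originals[i]]->successors.push_back(clone_map.at(successor));
    }
  }
  return clones;
}
```

+  // Clone control dependencies.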
+  for (auto instr : postorder) {
+    HloInstruction* new_instr = FindOrDie(clone_map, instr);
+    for (auto successor : instr->control_successors()) {
+      TF_CHECK_OK(
+          new_instr->AddControlDependencyTo(FindOrDie(clone_map, successor)));
+    }
+  }
+  return result;
+}
+
+void HloComputation::UniquifyName(NameUniquer* name_uniquer) {
+  name_ = name_uniquer->GetUniqueName(name_);
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h
index e78e86b91fd..39074b24e41 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.h
+++ b/tensorflow/compiler/xla/service/hlo_computation.h
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
+#include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/name_uniquer.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -54,8 +55,10 @@ class HloComputation {
   // Builder class for HloComputation.
   class Builder {
    public:
-    explicit Builder(const string& name)
-        : name_(name), last_added_instruction_(nullptr) {}
+    explicit Builder(const string& name, bool is_fusion_computation = false)
+        : name_(name),
+          last_added_instruction_(nullptr),
+          is_fusion_computation_(is_fusion_computation) {}
 
     // Build and return an HloComputation. The parameter root_instruction
     // specifies the already-added instruction to use as the root. If
@@ -74,6 +77,7 @@ class HloComputation {
    private:
     const string name_;
     HloInstruction* last_added_instruction_;
+    bool is_fusion_computation_;
     std::vector<std::unique_ptr<HloInstruction>> instructions_;
   };
 
@@ -81,6 +85,16 @@ class HloComputation {
   // the instruction.
   HloInstruction* AddInstruction(std::unique_ptr<HloInstruction> instruction);
 
+  // Remove the param_no'th parameter from the computation. Note that this is
+  // only applicable to the computation of a fusion instruction.
+  Status RemoveParameter(int64 param_no);
+
+  // Add a new parameter instruction to the computation. The instruction must
+  // be a freshly created parameter; it is appended to the parameter list and
+  // inserted into the instruction list.
+  HloInstruction* AddParameter(std::unique_ptr<HloInstruction> instruction);
+
   // Remove an instruction from the computation. The instruction must have no
   // users. Instruction is deallocated with this call.
   Status RemoveInstruction(HloInstruction* instruction);
@@ -111,7 +125,7 @@ class HloComputation {
   // Returns the parameter instruction for the given parameter number.
   HloInstruction* parameter_instruction(int64 param_no) const {
     CHECK_GE(param_no, 0);
-    CHECK_LT(param_no, param_instructions_.size());
+    CHECK_LT(param_no, static_cast<int64>(param_instructions_.size()));
     return param_instructions_[param_no];
   }
 
@@ -121,24 +135,20 @@ class HloComputation {
   const string& name() const { return name_; }
 
+  // Use the given NameUniquer to select a unique name for the computation
+  // based on the computation's existing name.
+  void UniquifyName(NameUniquer* name_uniquer);
+
   // Return a string representation of the computation.
-  string ToString() const;
+  string ToString(int nested_level = 0) const;
+
+  // Returns a serialized representation of this computation.
+  HloComputationProto ToProto() const;
 
   const std::list<std::unique_ptr<HloInstruction>>& instructions() const {
     return instructions_;
   }
 
-  // Add a control dependency between the two instructions in this computation
-  // so that the 'predecessor' is visited before the 'successor' during the DFS
-  // traversal of the computation. Returns an error status if either of the
-  // given instructions does not belong to the current computation.
-  //
-  // This is used to enforce an additional ordering requirement that is not
-  // captured by normal data dependencies, such as ordering among Send or Recv
-  // operations to avoid deadlock.
-  Status AddControlDependency(HloInstruction* predecessor,
-                              HloInstruction* successor);
-
   // Compute and return a post-order of the instructions in the computation. In
   // this order, definitions of values always appear before their uses.
   std::list<HloInstruction*> MakeInstructionPostOrder() const;
@@ -205,6 +215,7 @@ class HloComputation {
   // Set/get the module containing this computation.
   void set_parent(HloModule* module) { parent_ = module; }
   const HloModule* parent() const { return parent_; }
+  HloModule* parent() { return parent_; }
 
   // Visit every node in the computation in DFS post-order with the given
   // visitor. This is similar to calling HloInstruction::Accept on the root of
@@ -214,6 +225,13 @@ class HloComputation {
   // root instruction as the argument).
   Status Accept(DfsHloVisitor* visitor) const;
 
+  // Same as Accept() above, but the order of operand and control predecessor
+  // visitation is determined by the given operand order; if compare(A, B) ==
+  // true, A is visited before B.
+  Status AcceptWithOperandOrder(
+      DfsHloVisitor* visitor,
+      const HloInstruction::CompareFunction& operand_order) const;
+
   // Visit every node in the computation in the given order. 'order' must
   // be a topological sort of all instructions in the computation.
   Status AcceptOrdered(DfsHloVisitor* visitor,
@@ -222,26 +240,32 @@ class HloComputation {
   // Same as Accept() above, but the visitor is given as a function.
   Status Accept(const FunctionVisitor::VisitorFunction& visitor_func) const;
 
-  // Returns true if instructions of the given opcode can be removed from the
+  // Returns a deep copy of this computation including all instructions.
+  std::unique_ptr<HloComputation> Clone(const string& suffix = "clone");
+
+  // Returns true if the given instruction can be removed from the
   // computation. Instructions such as parameters and send/receive instructions
   // cannot be removed without violating invariants of the HLO computation or
-  // module.
-  static bool IsRemovable(const HloOpcode& opcode);
+  // module, with the exception of fusion computations: a parameter instruction
+  // is removable from a fusion computation.
+  bool IsRemovable(const HloInstruction* instruction);
+
+  // Returns whether this computation is a fusion computation.
+  bool IsFusionComputation() const { return is_fusion_computation_; }
 
  private:
   explicit HloComputation(
       const string& name, int parameter_count,
      std::vector<std::unique_ptr<HloInstruction>>* instructions,
-      HloInstruction* root_instruction);
+      HloInstruction* root_instruction, bool is_fusion_computation = false);
 
   // Internal helper for adding instructions.
   HloInstruction* AddInstructionInternal(
       std::unique_ptr<HloInstruction> instruction);
 
-  // Remove an instruction from the computation if found. The instruction must
-  // have no users. Instruction is deallocated with this call.
-  // Return whether instruction was found and removed.
- StatusOr RemoveInstructionIfFound(HloInstruction* instruction); + // Helper for setting the parent of instructions that are added to this + // computation. + void Reparent(HloInstruction* instruction); // Fuses HLOs in instructions_to_fuse into fusion_instruction. // @@ -254,9 +278,15 @@ class HloComputation { // of the given instruction. The given instruction must be tuple-shaped. StatusOr DeepCopyTuple(HloInstruction* instruction); - const string name_; + // Internal helper to collect unreachable roots. + std::vector CollectUnreachableRoots() const; + + string name_; HloInstruction* root_instruction_; + // A tag shows if this is a fusion computation. + bool is_fusion_computation_; + // Module containing this computation. HloModule* parent_ = nullptr; diff --git a/tensorflow/compiler/xla/service/hlo_computation_test.cc b/tensorflow/compiler/xla/service/hlo_computation_test.cc index 1e0d09b72c7..5d49c83e2d0 100644 --- a/tensorflow/compiler/xla/service/hlo_computation_test.cc +++ b/tensorflow/compiler/xla/service/hlo_computation_test.cc @@ -20,15 +20,22 @@ limitations under the License. #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_matchers.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" +namespace op = xla::testing::opcode_matchers; + namespace xla { namespace { +using ::testing::ElementsAre; +using ::testing::UnorderedElementsAre; + class HloComputationTest : public HloTestBase { protected: HloComputationTest() {} @@ -67,8 +74,8 @@ TEST_F(HloComputationTest, GetEmbeddedComputationsOneComputation) { auto negate_computation = CreateNegateComputation(); auto map_computation = CreateMapComputation(negate_computation.get()); EXPECT_TRUE(negate_computation->MakeEmbeddedComputationsList().empty()); - EXPECT_EQ(map_computation->MakeEmbeddedComputationsList().front(), - negate_computation.get()); + EXPECT_THAT(map_computation->MakeEmbeddedComputationsList(), + ElementsAre(negate_computation.get())); } TEST_F(HloComputationTest, GetEmbeddedComputationsDiamond) { @@ -93,10 +100,10 @@ TEST_F(HloComputationTest, GetEmbeddedComputationsDiamond) { // GetEmbeddedComputations returns a post order of the embedded computations, // so the negate computation must come first. 
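A note on this test file's migration from `EXPECT_MATCH` to gmock matchers: `UnorderedElementsAre` has set semantics, which is why the ordering claim in the comment above is pinned separately by the `EXPECT_EQ` that follows. A self-contained illustration (generic gmock usage, not from the patch):

```c++
#include <vector>

#include <gmock/gmock.h>
#include <gtest/gtest.h>

TEST(MatcherDemo, OrderSensitivity) {
  const std::vector<int> values = {3, 1, 2};
  // Set semantics: passes for any permutation of the expected elements.
  EXPECT_THAT(values, ::testing::UnorderedElementsAre(1, 2, 3));
  // The order-sensitive counterpart would fail for {3, 1, 2}:
  //   EXPECT_THAT(values, ::testing::ElementsAre(1, 2, 3));
}
```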
EXPECT_EQ(negate_computation.get(), *embedded_computations.begin()); - EXPECT_MATCH(testing::ListToVec(embedded_computations), - testing::UnorderedMatcher( - negate_computation.get(), map1_computation.get(), - map2_computation.get())); + EXPECT_THAT( + embedded_computations, + UnorderedElementsAre(negate_computation.get(), map1_computation.get(), + map2_computation.get())); } TEST_F(HloComputationTest, PostOrderSingleton) { @@ -106,7 +113,7 @@ TEST_F(HloComputationTest, PostOrderSingleton) { HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); auto computation = builder.Build(); - EXPECT_EQ(computation->MakeInstructionPostOrder().front(), constant); + EXPECT_THAT(computation->MakeInstructionPostOrder(), ElementsAre(constant)); } TEST_F(HloComputationTest, PostOrderSimple) { @@ -121,10 +128,8 @@ TEST_F(HloComputationTest, PostOrderSimple) { HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, negate1)); auto computation = builder.Build(); - EXPECT_MATCH( - testing::ListToVec( - computation->MakeInstructionPostOrder()), - testing::OrderedMatcher(constant, negate1, negate2)); + EXPECT_THAT(computation->MakeInstructionPostOrder(), + ElementsAre(constant, negate1, negate2)); } TEST_F(HloComputationTest, PostOrderTrace) { @@ -141,10 +146,8 @@ TEST_F(HloComputationTest, PostOrderTrace) { auto computation = builder.Build(); // Trace instructions should be at the end of the sort. - EXPECT_MATCH(testing::ListToVec( - computation->MakeInstructionPostOrder()), - testing::OrderedMatcher(constant, negate1, - negate2, trace)); + EXPECT_THAT(computation->MakeInstructionPostOrder(), + ElementsAre(constant, negate1, negate2, trace)); } TEST_F(HloComputationTest, PostOrderDisconnectedInstructions) { @@ -161,10 +164,8 @@ TEST_F(HloComputationTest, PostOrderDisconnectedInstructions) { HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); auto computation = builder.Build(); - EXPECT_MATCH(testing::ListToVec( - computation->MakeInstructionPostOrder()), - testing::UnorderedMatcher( - constant1, constant2, constant3, constant4)); + EXPECT_THAT(computation->MakeInstructionPostOrder(), + UnorderedElementsAre(constant1, constant2, constant3, constant4)); } TEST_F(HloComputationTest, PostOrderWithMultipleRoots) { @@ -187,9 +188,8 @@ TEST_F(HloComputationTest, PostOrderWithMultipleRoots) { auto post_order = computation->MakeInstructionPostOrder(); EXPECT_EQ(6, post_order.size()); - EXPECT_MATCH(testing::ListToVec(post_order), - testing::UnorderedMatcher( - constant1, constant2, constant3, add1, add2, add3)); + EXPECT_THAT(post_order, UnorderedElementsAre(constant1, constant2, constant3, + add1, add2, add3)); } TEST_F(HloComputationTest, VisitWithMultipleRoots) { @@ -253,8 +253,7 @@ TEST_F(HloComputationTest, DeepCopyArray) { auto copy = computation->DeepCopyInstruction(constant).ValueOrDie(); - EXPECT_EQ(HloOpcode::kCopy, copy->opcode()); - EXPECT_EQ(constant, copy->operand(0)); + EXPECT_THAT(copy, op::Copy(constant)); } TEST_F(HloComputationTest, DeepCopyTuple) { @@ -271,18 +270,10 @@ TEST_F(HloComputationTest, DeepCopyTuple) { auto tuple_copy = computation->DeepCopyInstruction(tuple).ValueOrDie(); - EXPECT_EQ(HloOpcode::kTuple, tuple_copy->opcode()); - EXPECT_EQ(HloOpcode::kCopy, tuple_copy->operand(0)->opcode()); - const HloInstruction* gte0 = tuple_copy->operand(0)->operand(0); - EXPECT_EQ(HloOpcode::kGetTupleElement, gte0->opcode()); - EXPECT_EQ(0, gte0->tuple_index()); - EXPECT_EQ(tuple, gte0->operand(0)); - - EXPECT_EQ(HloOpcode::kCopy, tuple_copy->operand(1)->opcode()); - const 
HloInstruction* gte1 = tuple_copy->operand(1)->operand(0); - EXPECT_EQ(HloOpcode::kGetTupleElement, gte1->opcode()); - EXPECT_EQ(1, gte1->tuple_index()); - EXPECT_EQ(tuple, gte1->operand(0)); + EXPECT_THAT(tuple_copy, op::Tuple(op::Copy(op::GetTupleElement(tuple)), + op::Copy(op::GetTupleElement(tuple)))); + EXPECT_EQ(0, tuple_copy->operand(0)->operand(0)->tuple_index()); + EXPECT_EQ(1, tuple_copy->operand(1)->operand(0)->tuple_index()); } TEST_F(HloComputationTest, CycleDetection) { @@ -297,15 +288,74 @@ TEST_F(HloComputationTest, CycleDetection) { auto computation = builder.Build(); // Add a control dependency to create a cycle. - ASSERT_IS_OK(computation->AddControlDependency(add, negate)); + ASSERT_IS_OK(add->AddControlDependencyTo(negate)); const auto visitor = [](HloInstruction* instruction) { return Status::OK(); }; auto visit_status = computation->Accept(visitor); ASSERT_FALSE(visit_status.ok()); - ASSERT_MATCH(visit_status.error_message(), - testing::ContainsRegex("cycle is detecte")); + ASSERT_THAT(visit_status.error_message(), + ::testing::ContainsRegex("cycle is detecte")); +} + +TEST_F(HloComputationTest, RemoveInstructionWithDuplicateOperand) { + // Test RemoveInstructionAndUnusedOperands with an instruction which has a + // duplicated (dead) operand. This verifies that the operand is not deleted + // twice. + auto builder = HloComputation::Builder(TestName()); + auto constant = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); + auto dead_negate = builder.AddInstruction( + HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, constant)); + auto dead_add = builder.AddInstruction(HloInstruction::CreateBinary( + r0f32_, HloOpcode::kAdd, dead_negate, dead_negate)); + auto negate = builder.AddInstruction( + HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, constant)); + auto computation = builder.Build(); + + EXPECT_EQ(4, computation->instruction_count()); + EXPECT_THAT(computation->root_instruction(), op::Negate(constant)); + EXPECT_EQ(negate, computation->root_instruction()); + + ASSERT_IS_OK(computation->RemoveInstructionAndUnusedOperands(dead_add)); + + EXPECT_EQ(2, computation->instruction_count()); + EXPECT_THAT(computation->root_instruction(), op::Negate(constant)); + EXPECT_EQ(negate, computation->root_instruction()); +} + +TEST_F(HloComputationTest, CloneWithControlDependency) { + auto builder = HloComputation::Builder(TestName()); + auto constant1 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0f))); + auto constant2 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0f))); + auto add = builder.AddInstruction(HloInstruction::CreateBinary( + r0f32_, HloOpcode::kAdd, constant1, constant2)); + + auto param = builder.AddInstruction( + HloInstruction::CreateParameter(0, r0f32_, "param0")); + auto negate = builder.AddInstruction( + HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, param)); + auto computation = builder.Build(/*root_instruction=*/add); + + TF_CHECK_OK(negate->AddControlDependencyTo(add)); + + auto clone = computation->Clone(); + + auto cloned_add = clone->root_instruction(); + EXPECT_EQ(cloned_add->opcode(), HloOpcode::kAdd); + + auto predecessors = cloned_add->control_predecessors(); + EXPECT_EQ(1, predecessors.size()); + EXPECT_EQ(HloOpcode::kNegate, predecessors[0]->opcode()); + auto successors = predecessors[0]->control_successors(); + EXPECT_THAT(successors, ::testing::ElementsAre(cloned_add)); } } // namespace } // namespace xla + +int 
main(int argc, char** argv) { + return xla::ParseDebugOptionsFlagsAndRunTests(argc, argv); +} diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding.cc b/tensorflow/compiler/xla/service/hlo_constant_folding.cc new file mode 100644 index 00000000000..93f448e7018 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_constant_folding.cc @@ -0,0 +1,79 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_constant_folding.h" + +#include +#include +#include +#include + +#include "tensorflow/compiler/xla/layout_util.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_evaluator.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_query.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/core/lib/core/errors.h" + +namespace xla { + +StatusOr HloConstantFolding::Run(HloModule* module) { + auto evaluator = MakeUnique(); + + XLA_VLOG_LINES(2, + "HloConstantFolding::Run(), before:\n" + module->ToString()); + bool changed = false; + + for (auto& computation : module->computations()) { + for (auto instruction : computation->MakeInstructionPostOrder()) { + // Skip dead code. + if (instruction->user_count() == 0 && + computation->root_instruction() != instruction) { + continue; + } + // Skip Constant and Parameter operation. + if (instruction->opcode() == HloOpcode::kParameter || + instruction->opcode() == HloOpcode::kConstant) { + continue; + } + // Skip instructions with non-constant operands. + if (!hlo_query::AllOperandsAreConstants(*instruction)) { + continue; + } + + std::unique_ptr result = evaluator->TryEvaluate(instruction); + // Currently we skip unimplemented operations. + // TODO(b/35975797): Fold constant computations for more operations. + if (result == nullptr) { + VLOG(2) << "Constant folding failed for instruction: " + << instruction->ToString(); + continue; + } + + TF_RETURN_IF_ERROR(computation->ReplaceWithNewInstruction( + instruction, HloInstruction::CreateConstant(std::move(result)))); + changed = true; + } + } + XLA_VLOG_LINES(2, "HloConstantFolding::Run(), after:\n" + module->ToString()); + return changed; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding.h b/tensorflow/compiler/xla/service/hlo_constant_folding.h new file mode 100644 index 00000000000..331480bd029 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_constant_folding.h @@ -0,0 +1,37 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_CONSTANT_FOLDING_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_CONSTANT_FOLDING_H_ + +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" + +namespace xla { + +// A pass which performs constant folding in order to avoid unnecessary +// computation on constants. +class HloConstantFolding : public HloPassInterface { + public: + tensorflow::StringPiece name() const override { return "constant_folding"; } + + // Run constant folding operations on the given module. Returns whether the + // module was changed (constant expressions folded). + StatusOr Run(HloModule* module) override; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_CONSTANT_FOLDING_H_ diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc new file mode 100644 index 00000000000..31b81052cb2 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc @@ -0,0 +1,218 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_constant_folding.h" + +#include +#include + +#include "tensorflow/compiler/xla/layout_util.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_pass_fix.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/tests/literal_test_util.h" +#include "tensorflow/compiler/xla/types.h" + +namespace op = xla::testing::opcode_matchers; + +namespace xla { +namespace { + +using HloConstantFoldingTest = HloTestBase; + +TEST_F(HloConstantFoldingTest, ConvertF32ToS64) { + HloComputation::Builder builder(TestName()); + HloInstruction* input = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); + builder.AddInstruction( + HloInstruction::CreateConvert(ShapeUtil::MakeShape(S64, {}), input)); + + auto module = CreateNewModule(); + auto computation = module->AddEntryComputation(builder.Build()); + + EXPECT_THAT(computation->root_instruction(), op::Convert(input)); + + HloConstantFolding const_folder; + TF_ASSIGN_OR_ASSERT_OK(bool result, const_folder.Run(module.get())); + EXPECT_TRUE(result); + + EXPECT_THAT(computation->root_instruction(), op::Constant()); + EXPECT_EQ(LiteralUtil::GetFirstElement( + computation->root_instruction()->literal()), + 42); +} + +TEST_F(HloConstantFoldingTest, ConvertS64ToF32) { + HloComputation::Builder builder(TestName()); + HloInstruction* input = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42))); + builder.AddInstruction( + HloInstruction::CreateConvert(ShapeUtil::MakeShape(F32, {}), input)); + + auto module = CreateNewModule(); + auto computation = module->AddEntryComputation(builder.Build()); + + EXPECT_THAT(computation->root_instruction(), op::Convert(input)); + + HloConstantFolding const_folder; + TF_ASSIGN_OR_ASSERT_OK(bool result, const_folder.Run(module.get())); + EXPECT_TRUE(result); + + EXPECT_THAT(computation->root_instruction(), op::Constant()); + EXPECT_EQ(LiteralUtil::GetFirstElement( + computation->root_instruction()->literal()), + 42.0f); +} + +TEST_F(HloConstantFoldingTest, ConvertF32ArrayToS64Array) { + HloComputation::Builder builder(TestName()); + HloInstruction* input = builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR1({42.0f, 19.0f}))); + builder.AddInstruction( + HloInstruction::CreateConvert(ShapeUtil::MakeShape(S64, {2}), input)); + + auto module = CreateNewModule(); + auto computation = module->AddEntryComputation(builder.Build()); + + EXPECT_THAT(computation->root_instruction(), op::Convert(input)); + + HloConstantFolding const_folder; + TF_ASSIGN_OR_ASSERT_OK(bool result, const_folder.Run(module.get())); + EXPECT_TRUE(result); + + EXPECT_THAT(computation->root_instruction(), op::Constant()); + EXPECT_EQ( + LiteralUtil::Get(computation->root_instruction()->literal(), {0}), + 42); + EXPECT_EQ( + LiteralUtil::Get(computation->root_instruction()->literal(), {1}), + 19); +} + +TEST_F(HloConstantFoldingTest, Concatenate) { + const struct TestConfig { + int concat_dimension; + tensorflow::gtl::ArraySlice dimensions; + 
tensorflow::gtl::ArraySlice concat_sizes; + } test_configs[] = { + {1, {11, 0, 7, 5, 9}, {2, 5, 7, 11}}, + {3, {1, 4, 17, 0, 8}, {1, 3, 9, 12}}, + }; + + for (auto& test_config : test_configs) { + HloComputation::Builder builder(TestName()); + std::vector dimensions(test_config.dimensions.begin(), + test_config.dimensions.end()); + int64 concat_size = 0; + std::vector operands; + for (auto csize : test_config.concat_sizes) { + dimensions[test_config.concat_dimension] = csize; + concat_size += csize; + auto literal = LiteralUtil::CreateFromDimensions(F32, dimensions); + HloInstruction* insn = builder.AddInstruction( + HloInstruction::CreateConstant(std::move(literal))); + operands.push_back(insn); + } + dimensions[test_config.concat_dimension] = concat_size; + Shape shape = ShapeUtil::MakeShape(F32, dimensions); + builder.AddInstruction(HloInstruction::CreateConcatenate( + shape, operands, test_config.concat_dimension)); + auto module = CreateNewModule(); + auto computation = module->AddEntryComputation(builder.Build()); + + HloConstantFolding const_folder; + TF_ASSIGN_OR_ASSERT_OK(bool result, const_folder.Run(module.get())); + EXPECT_TRUE(result); + + HloInstruction* root = computation->root_instruction(); + EXPECT_THAT(root, op::Constant()); + EXPECT_TRUE(ShapeUtil::Equal(root->shape(), shape)); + } +} + +TEST_F(HloConstantFoldingTest, Slice) { + HloComputation::Builder builder(TestName()); + const int64 dimensions[] = {11, 8, 7, 5, 9}; + const int64 slice_start[] = {4, 2, 3, 1, 5}; + const int64 slice_limits[] = {10, 8, 6, 5, 9}; + const int64 slice_strides[] = {1, 1, 1, 1, 1}; + TF_ASSIGN_OR_ASSERT_OK(auto literal, + LiteralTestUtil::CreateRandomLiteral( + ShapeUtil::MakeShape(F32, dimensions), 0.0, 1.0)); + HloInstruction* literal_instruction = builder.AddInstruction( + HloInstruction::CreateConstant(std::move(literal))); + Shape shape = ShapeUtil::MakeShape(F32, {6, 6, 3, 4, 4}); + builder.AddInstruction(HloInstruction::CreateSlice( + shape, literal_instruction, slice_start, slice_limits, slice_strides)); + auto module = CreateNewModule(); + auto computation = module->AddEntryComputation(builder.Build()); + + HloConstantFolding const_folder; + TF_ASSIGN_OR_ASSERT_OK(bool result, const_folder.Run(module.get())); + EXPECT_TRUE(result); + + HloInstruction* root = computation->root_instruction(); + EXPECT_THAT(root, op::Constant()); + EXPECT_TRUE(ShapeUtil::Equal(root->shape(), shape)); +} + +TEST_F(HloConstantFoldingTest, TransposeConstantFold) { + HloComputation::Builder builder(TestName()); + const int64 dimensions[] = {11, 8, 7, 5, 9}; + TF_ASSIGN_OR_ASSERT_OK(auto literal, + LiteralTestUtil::CreateRandomLiteral( + ShapeUtil::MakeShape(F32, dimensions), 0.0, 1.0)); + auto literal_clone = LiteralUtil::CloneToUnique(*literal); + HloInstruction* literal_instruction = builder.AddInstruction( + HloInstruction::CreateConstant(std::move(literal))); + Shape shape = ShapeUtil::MakeShape(F32, {8, 7, 11, 9, 5}); + const int64 permutation[] = {1, 2, 0, 4, 3}; + builder.AddInstruction( + HloInstruction::CreateTranspose(shape, literal_instruction, permutation)); + auto module = CreateNewModule(); + auto computation = module->AddEntryComputation(builder.Build()); + + HloConstantFolding const_folder; + TF_ASSIGN_OR_ASSERT_OK(bool result, const_folder.Run(module.get())); + EXPECT_TRUE(result); + + HloInstruction* root = computation->root_instruction(); + EXPECT_THAT(root, op::Constant()); + EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), shape)); + + using NativeT = typename 
primitive_util::PrimitiveTypeToNative::type; + bool matched = true; + LiteralUtil::EachCell( + root->literal(), + [&](tensorflow::gtl::ArraySlice indices, NativeT value) { + std::vector rindexes = Permute(permutation, indices); + matched = matched && (value == LiteralUtil::Get(*literal_clone, + rindexes)); + }); + EXPECT_TRUE(matched); +} + +} // namespace +} // namespace xla + +int main(int argc, char** argv) { + return xla::ParseDebugOptionsFlagsAndRunTests(argc, argv); +} diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc index 2866f8158d5..38cc74b0f1e 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc @@ -20,10 +20,43 @@ limitations under the License. #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/lib/core/bits.h" #include "tensorflow/core/lib/core/errors.h" namespace xla { +Status HloCostAnalysis::Preprocess(HloInstruction* hlo) { + // Set current instruction cost values to reasonable default values. Each + // handler can overwrite these values. In Postprocess, these value are + // accumulated and written to the per-instruction maps. + current_flop_count_ = 0; + current_transcendental_count_ = 0; + + // The default element count for an instruction is the sum of elements in the + // operands and output. The default ShapeUtil::ByteSizeOf does not handle + // opaque types. + current_bytes_accessed_ = shape_size_(hlo->shape()); + for (const HloInstruction* operand : hlo->operands()) { + current_bytes_accessed_ += shape_size_(operand->shape()); + } + + return Status::OK(); +} + +Status HloCostAnalysis::Postprocess(HloInstruction* hlo) { + // Accumulate cost values and write into per-instruction maps. + flop_count_ += current_flop_count_; + hlo_to_flop_count_[hlo] = current_flop_count_; + + transcendental_count_ += current_transcendental_count_; + hlo_to_transcendental_count_[hlo] = current_transcendental_count_; + + bytes_accessed_ += current_bytes_accessed_; + hlo_to_bytes_accessed_[hlo] = current_bytes_accessed_; + + return Status::OK(); +} + Status HloCostAnalysis::HandleElementwiseOp(HloInstruction* hlo_instruction) { const auto& shape = hlo_instruction->shape(); // For element-wise operations, the number of computations is the same as the @@ -32,12 +65,11 @@ Status HloCostAnalysis::HandleElementwiseOp(HloInstruction* hlo_instruction) { auto opcode = hlo_instruction->opcode(); // We treat the two opcodes (kExp, kPower) as transcendental operations. if (opcode == HloOpcode::kExp || opcode == HloOpcode::kPower) { - transcendental_count_ += computation_count; + current_transcendental_count_ = computation_count; } else { // Note: transcendental operations are considered a separate category from // FLOPs. 
- hlo_to_flop_count_[hlo_instruction] = computation_count; - flop_count_ += computation_count; + current_flop_count_ = computation_count; } return Status::OK(); } @@ -69,16 +101,21 @@ Status HloCostAnalysis::HandleClamp(HloInstruction* clamp, } Status HloCostAnalysis::HandleParameter(HloInstruction* parameter) { + current_bytes_accessed_ = 0; return Status::OK(); } Status HloCostAnalysis::HandleConstant(HloInstruction* constant, const Literal& literal) { + current_bytes_accessed_ = 0; return Status::OK(); } Status HloCostAnalysis::HandleGetTupleElement(HloInstruction* get_tuple_element, HloInstruction* operand) { + // GetTupleElement forwards a pointer and does not touch each element in the + // output. + current_bytes_accessed_ = 0; return Status::OK(); } @@ -99,9 +136,9 @@ Status HloCostAnalysis::HandleSlice(HloInstruction* slice, return Status::OK(); } -Status HloCostAnalysis::HandleDynamicSlice( - HloInstruction* slice, - tensorflow::gtl::ArraySlice operands) { +Status HloCostAnalysis::HandleDynamicSlice(HloInstruction* dynamic_slice, + HloInstruction* operand, + HloInstruction* start_indices) { return Status::OK(); } @@ -114,6 +151,10 @@ Status HloCostAnalysis::HandleDynamicUpdateSlice( Status HloCostAnalysis::HandleTuple( HloInstruction* tuple, tensorflow::gtl::ArraySlice operands) { + // The tuple instruction only gathers pointers from inputs (it doesn't iterate + // through them). The memory touched is then only the size of the output + // buffer. + current_bytes_accessed_ = shape_size_(tuple->shape()); return Status::OK(); } @@ -125,8 +166,7 @@ Status HloCostAnalysis::HandleConcatenate( Status HloCostAnalysis::HandleConvert(HloInstruction* convert, HloInstruction* operand) { - flop_count_ += ShapeUtil::ElementsIn(operand->shape()); - return Status::OK(); + return HandleElementwiseOp(convert); } Status HloCostAnalysis::HandleCopy(HloInstruction* copy, @@ -137,15 +177,24 @@ Status HloCostAnalysis::HandleCopy(HloInstruction* copy, Status HloCostAnalysis::HandleDot(HloInstruction* dot, HloInstruction* lhs_instruction, HloInstruction* rhs_instruction) { + const Shape& lhs_shape = lhs_instruction->shape(); + const Shape& rhs_shape = rhs_instruction->shape(); + // Count of elements along the reduction dimension (last dimension for the + // rhs). + int64 reduction_width = lhs_shape.dimensions(ShapeUtil::Rank(lhs_shape) - 1); + + // First divide by reduction width before multiplying by rhs elements to avoid + // overflow. + int64 fma_count; + if (reduction_width == 0) { + fma_count = 0; + } else { + fma_count = (ShapeUtil::ElementsIn(lhs_shape) / reduction_width) * + ShapeUtil::ElementsIn(rhs_shape); + } + // We count an FMA operation as 2 floating point operations. - // Multiplying the sizes of lhs, rhs, and result produces the square of the - // number of FMAs during the computation. - auto fma_count = std::sqrt( - static_cast(ShapeUtil::ElementsIn(lhs_instruction->shape())) * - ShapeUtil::ElementsIn(rhs_instruction->shape()) * - ShapeUtil::ElementsIn(dot->shape())); - flop_count_ += 2 * fma_count; - hlo_to_flop_count_[dot] = 2 * fma_count; + current_flop_count_ = kFmaFlops * fma_count; return Status::OK(); } @@ -163,15 +212,14 @@ Status HloCostAnalysis::HandleMap( tensorflow::gtl::ArraySlice /*static_operands*/) { // Compute the cost of the user function. 
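Stepping back to the `HandleDot` change above: dividing by the reduction width before multiplying by the rhs element count avoids overflowing int64 on large shapes, and the algebra reduces to the familiar m*n*k FMA count. A quick standalone check of the formula (illustrative arithmetic, not patch code):

```c++
#include <cstdint>
#include <cstdio>

int main() {
  // lhs is [m, k], rhs is [k, n]; reduction_width is k.
  const int64_t m = 4, k = 3, n = 5;
  // (ElementsIn(lhs) / k) * ElementsIn(rhs) == (m*k/k) * (k*n) == m*n*k.
  const int64_t fma_count = ((m * k) / k) * (k * n);  // 60
  std::printf("fmas=%lld flops=%lld\n", static_cast<long long>(fma_count),
              static_cast<long long>(2 * fma_count));  // 60 and 120
  return 0;
}
```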
HloInstruction* function_instruction = function->root_instruction(); - HloCostAnalysis visitor; + HloCostAnalysis visitor(shape_size_); TF_RETURN_IF_ERROR(function_instruction->Accept(&visitor)); // Compute the cost of all elements for this Map operation. - auto element_count = ShapeUtil::ElementsIn(map->shape()); - transcendental_count_ += element_count * visitor.transcendental_count(); - auto hlo_flop_count = element_count * visitor.flop_count(); - hlo_to_flop_count_[map] = hlo_flop_count; - flop_count_ += hlo_flop_count; + int64 element_count = ShapeUtil::ElementsIn(map->shape()); + current_transcendental_count_ = + element_count * visitor.transcendental_count(); + current_flop_count_ = element_count * visitor.flop_count(); return Status::OK(); } @@ -180,16 +228,15 @@ Status HloCostAnalysis::HandleReduce( tensorflow::gtl::ArraySlice dimensions, HloComputation* function) { // Compute the cost of the user function. HloInstruction* function_instruction = function->root_instruction(); - HloCostAnalysis visitor; + HloCostAnalysis visitor(shape_size_); TF_RETURN_IF_ERROR(function_instruction->Accept(&visitor)); // Compute the cost of all elements for this Reduce operation. - auto reduction_count = ShapeUtil::ElementsIn(arg->shape()) - - ShapeUtil::ElementsIn(reduce->shape()); - auto hlo_flop_count = reduction_count * visitor.flop_count(); - hlo_to_flop_count_[reduce] = hlo_flop_count; - flop_count_ += hlo_flop_count; - transcendental_count_ += reduction_count * visitor.transcendental_count(); + int64 reduction_count = ShapeUtil::ElementsIn(arg->shape()) - + ShapeUtil::ElementsIn(reduce->shape()); + current_flop_count_ = reduction_count * visitor.flop_count(); + current_transcendental_count_ = + reduction_count * visitor.transcendental_count(); return Status::OK(); } @@ -199,7 +246,7 @@ Status HloCostAnalysis::HandleReduceWindow(HloInstruction* reduce_window, HloComputation* function) { // Compute the cost of the user function. HloInstruction* function_instruction = function->root_instruction(); - HloCostAnalysis visitor; + HloCostAnalysis visitor(shape_size_); TF_RETURN_IF_ERROR(function_instruction->Accept(&visitor)); // Compute the cost of all elements for this ReduceWindow operation. For each @@ -209,10 +256,8 @@ Status HloCostAnalysis::HandleReduceWindow(HloInstruction* reduce_window, for (const auto& dimension : window.dimensions()) { window_size *= dimension.size(); } - auto hlo_flop_count = output_size * (window_size - 1) * visitor.flop_count(); - hlo_to_flop_count_[reduce_window] = hlo_flop_count; - flop_count_ += hlo_flop_count; - transcendental_count_ += + current_flop_count_ = output_size * (window_size - 1) * visitor.flop_count(); + current_transcendental_count_ = output_size * (window_size - 1) * visitor.transcendental_count(); return Status::OK(); } @@ -220,10 +265,10 @@ Status HloCostAnalysis::HandleReduceWindow(HloInstruction* reduce_window, Status HloCostAnalysis::HandleSelectAndScatter(HloInstruction* instruction) { // Compute the cost of the select and scatter function. HloInstruction* select = instruction->select()->root_instruction(); - HloCostAnalysis select_visitor; + HloCostAnalysis select_visitor(shape_size_); TF_RETURN_IF_ERROR(select->Accept(&select_visitor)); HloInstruction* scatter = instruction->scatter()->root_instruction(); - HloCostAnalysis scatter_visitor; + HloCostAnalysis scatter_visitor(shape_size_); TF_RETURN_IF_ERROR(scatter->Accept(&scatter_visitor)); // Compute the cost of all elements for this operation. 
For each scatter @@ -235,12 +280,10 @@ Status HloCostAnalysis::HandleSelectAndScatter(HloInstruction* instruction) { for (const auto& dimension : instruction->window().dimensions()) { window_size *= dimension.size(); } - auto hlo_flop_count = + current_flop_count_ = source_element_count * ((window_size - 1) * select_visitor.flop_count() + scatter_visitor.flop_count()); - hlo_to_flop_count_[instruction] = hlo_flop_count; - flop_count_ += hlo_flop_count; - transcendental_count_ += + current_transcendental_count_ = source_element_count * ((window_size - 1) * select_visitor.transcendental_count() + scatter_visitor.transcendental_count()); @@ -248,6 +291,8 @@ Status HloCostAnalysis::HandleSelectAndScatter(HloInstruction* instruction) { } Status HloCostAnalysis::HandleBitcast(HloInstruction* bitcast) { + // A bitcast does no computation and touches no memory. + current_bytes_accessed_ = 0; return Status::OK(); } @@ -286,10 +331,7 @@ Status HloCostAnalysis::HandleConvolution(HloInstruction* convolution, const int64 fmas_per_output_element = ShapeUtil::ElementsIn(rhs_instruction->shape()) / output_features; const int64 output_elements = ShapeUtil::ElementsIn(convolution->shape()); - const double hlo_flop_count = static_cast(output_elements) * - fmas_per_output_element * kFmaFlops; - flop_count_ += hlo_flop_count; - hlo_to_flop_count_[convolution] = hlo_flop_count; + current_flop_count_ = output_elements * fmas_per_output_element * kFmaFlops; return Status::OK(); } @@ -299,9 +341,7 @@ Status HloCostAnalysis::HandleCrossReplicaSum(HloInstruction* crs) { // // TODO(b/33004697): Compute correct cost here, taking the actual number of // replicas into account. - const double hlo_flop_count = ShapeUtil::ElementsIn(crs->shape()); - flop_count_ += hlo_flop_count; - hlo_to_flop_count_[crs] = hlo_flop_count; + current_flop_count_ = ShapeUtil::ElementsIn(crs->shape()); return Status::OK(); } @@ -310,27 +350,32 @@ Status HloCostAnalysis::HandleRng(HloInstruction* random, // TODO(b/26346211): Implement better estimates for the RNG cost, since the // cost changes with the implementation and the distribution. For now, assume // the cost of each RNG is same as a transcendental operation. - transcendental_count_ += ShapeUtil::ElementsIn(random->shape()); + current_transcendental_count_ = ShapeUtil::ElementsIn(random->shape()); return Status::OK(); } Status HloCostAnalysis::HandleFusion(HloInstruction* fusion) { // Compute the cost of the fused expression. HloInstruction* fused_expression_root = fusion->fused_expression_root(); - HloCostAnalysis visitor; + // Don't compute sizes inside of fused ops. We don't use the size here and the + // operations inside might not have a layout. + HloCostAnalysis visitor([](const Shape&) { return 0; }); TF_RETURN_IF_ERROR(fused_expression_root->Accept(&visitor)); // Attribute the cost of the fused expression to the fusion node. 
- transcendental_count_ += visitor.transcendental_count(); - hlo_to_flop_count_[fusion] += visitor.flop_count(); - flop_count_ += visitor.flop_count(); + current_transcendental_count_ = visitor.transcendental_count(); + current_flop_count_ = visitor.flop_count(); return Status::OK(); } -Status HloCostAnalysis::HandleCall( - HloInstruction* call, tensorflow::gtl::ArraySlice operands, - HloComputation* computation) { - return Unimplemented("call"); +Status HloCostAnalysis::HandleCall(HloInstruction* call) { + HloCostAnalysis computation_visitor(shape_size_); + TF_RETURN_IF_ERROR(call->to_apply()->Accept(&computation_visitor)); + + current_flop_count_ = computation_visitor.flop_count(); + current_transcendental_count_ = computation_visitor.transcendental_count(); + current_bytes_accessed_ = computation_visitor.bytes_accessed(); + return Status::OK(); } Status HloCostAnalysis::HandleCustomCall( @@ -343,28 +388,49 @@ Status HloCostAnalysis::HandleCustomCall( Status HloCostAnalysis::HandleSort(HloInstruction* sort, HloInstruction* operand_instruction) { // The cost of sort is implementation dependent, so cannot determine at HLO - // level. Maybe just assume the comparison based N*log(N) sorting? - // TODO(b/26346211): Implement the cost model for sort. - return Unimplemented("HandleSort"); + // level. Assume comparison based N*log(N) sorting. + int64 elements = ShapeUtil::ElementsIn(operand_instruction->shape()); + current_flop_count_ = elements * tensorflow::Log2Ceiling(elements); + return Status::OK(); } -Status HloCostAnalysis::HandleWhile(HloInstruction* xla_while, - HloInstruction* init, - HloComputation* condition, - HloComputation* body) { +Status HloCostAnalysis::HandleWhile(HloInstruction* xla_while) { // Since the number of iterations of the while node is not statically - // determined, we cannot analyze the computation cost of a while node. - // TODO(b/26346211): Add cost analysis for while node. - return Unimplemented("HandleWhile"); + // determined, we cannot precisely compute the cost of a while node. For now + // compute the cost of a single iteration. + // TODO(b/26346211): Improve the cost analysis for while node. + HloCostAnalysis body_visitor(shape_size_); + TF_RETURN_IF_ERROR(xla_while->while_body()->Accept(&body_visitor)); + HloCostAnalysis condition_visitor(shape_size_); + TF_RETURN_IF_ERROR(xla_while->while_condition()->Accept(&condition_visitor)); + + current_flop_count_ = + body_visitor.flop_count() + condition_visitor.flop_count(); + current_transcendental_count_ = body_visitor.transcendental_count() + + condition_visitor.transcendental_count(); + current_bytes_accessed_ = + body_visitor.bytes_accessed() + condition_visitor.bytes_accessed(); + + return Status::OK(); } Status HloCostAnalysis::FinishVisit(HloInstruction* root) { return Status::OK(); } -double HloCostAnalysis::hlo_to_flop_count(const HloInstruction& hlo) const { +int64 HloCostAnalysis::flop_count(const HloInstruction& hlo) const { auto it = hlo_to_flop_count_.find(&hlo); - return it == hlo_to_flop_count_.end() ? 0.0 : it->second; + return it == hlo_to_flop_count_.end() ? 0 : it->second; +} + +int64 HloCostAnalysis::transcendental_count(const HloInstruction& hlo) const { + auto it = hlo_to_transcendental_count_.find(&hlo); + return it == hlo_to_transcendental_count_.end() ? 0 : it->second; +} + +int64 HloCostAnalysis::bytes_accessed(const HloInstruction& hlo) const { + auto it = hlo_to_bytes_accessed_.find(&hlo); + return it == hlo_to_bytes_accessed_.end() ? 
0 : it->second;
+}
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.h b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
index 2377b5b9be1..b2c40f75ca4 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.h
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
@@ -35,8 +36,11 @@ namespace xla {
 // operations separately from transcendental operations.
 class HloCostAnalysis : public DfsHloVisitor {
  public:
-  HloCostAnalysis() = default;
-  ~HloCostAnalysis() override = default;
+  // shape_size is a function which returns the size in bytes of the top-level
+  // buffer of a shape.
+  using ShapeSizeFunction = std::function<int64(const Shape&)>;
+  explicit HloCostAnalysis(const ShapeSizeFunction& shape_size)
+      : shape_size_(shape_size) {}
 
   Status HandleElementwiseUnary(HloInstruction* hlo, HloOpcode opcode,
                                 HloInstruction* operand) override;
@@ -80,16 +84,14 @@ class HloCostAnalysis : public DfsHloVisitor {
                       tensorflow::gtl::ArraySlice<int64> dimensions,
                       HloComputation* function_handle) override;
   Status HandleFusion(HloInstruction* fusion) override;
-  Status HandleCall(HloInstruction* call,
-                    tensorflow::gtl::ArraySlice<HloInstruction*> operands,
-                    HloComputation* computation) override;
+  Status HandleCall(HloInstruction* call) override;
   Status HandleCustomCall(HloInstruction* custom_call,
                           tensorflow::gtl::ArraySlice<HloInstruction*> operands,
                           tensorflow::StringPiece custom_call_target) override;
   Status HandleSlice(HloInstruction* slice, HloInstruction* operand) override;
-  Status HandleDynamicSlice(
-      HloInstruction* slice,
-      tensorflow::gtl::ArraySlice<HloInstruction*> operands) override;
+  Status HandleDynamicSlice(HloInstruction* dynamic_slice,
+                            HloInstruction* operand,
+                            HloInstruction* start_indices) override;
   Status HandleDynamicUpdateSlice(HloInstruction* dynamic_update_slice,
                                   HloInstruction* operand,
                                   HloInstruction* update,
@@ -111,34 +113,54 @@ class HloCostAnalysis : public DfsHloVisitor {
   Status HandlePad(HloInstruction* pad) override;
   Status HandleReshape(HloInstruction* reshape) override;
   Status HandleTranspose(HloInstruction* transpose) override;
-  Status HandleWhile(HloInstruction* xla_while, HloInstruction* init,
-                     HloComputation* condition, HloComputation* body) override;
+  Status HandleWhile(HloInstruction* xla_while) override;
   Status FinishVisit(HloInstruction* root) override;
 
-  // Returns the amount of computations in the graph.
-  double flop_count() { return flop_count_; }
-  double transcendental_count() { return transcendental_count_; }
+  Status Preprocess(HloInstruction* hlo) override;
+  Status Postprocess(HloInstruction* hlo) override;
 
-  // Resolves the provided HLO instruction to a flop count, or 0 if the HLO was
-  // not found to have a flop count in the analysis.
-  double hlo_to_flop_count(const HloInstruction& hlo) const;
+  // Returns the amount of computation in the graph.
+  int64 flop_count() const { return flop_count_; }
+  int64 transcendental_count() const { return transcendental_count_; }
+
+  // Returns the respective cost computed for a particular HLO instruction, or 0
+  // if the HLO was not found to have a cost in the analysis.
+  int64 flop_count(const HloInstruction& hlo) const;
+  int64 transcendental_count(const HloInstruction& hlo) const;
+
+  // Returns the number of bytes read/written.
+  int64 bytes_accessed(const HloInstruction& hlo) const;
+  int64 bytes_accessed() const { return bytes_accessed_; }
 
  private:
   // An FMA counts as two floating point operations in these analyses.
   static constexpr int64 kFmaFlops = 2;
 
   // Utility function to handle all element-wise operations.
   Status HandleElementwiseOp(HloInstruction* hlo_instruction);
 
-  // Mapping from HLO instructions to the flop count we computed for them in the
+  // Function which computes the size of the top-level buffer of a given shape
+  // (not including nested elements, if any). If null then bytes_accessed
+  // methods return an error.
+  const ShapeSizeFunction shape_size_;
+
+  // The total number of floating point operations, transcendental operations,
+  // and bytes accessed (read or written) in the computation.
+  int64 flop_count_ = 0;
+  int64 transcendental_count_ = 0;
+  int64 bytes_accessed_ = 0;
+
+  // Cost counts of the current instruction. These should be set by each
+  // handler if different from the default values computed in Preprocess.
+  int64 current_flop_count_;
+  int64 current_transcendental_count_;
+  int64 current_bytes_accessed_;
+
+  // Mapping from HLO instructions to the cost we computed for them in the
   // course of the graph analysis.
-  std::map<const HloInstruction*, double> hlo_to_flop_count_;
-
-  // The number of floating point operations in the graph.
-  double flop_count_ = 0.0;
-
-  // The number of transcendental operations in the graph.
-  double transcendental_count_ = 0.0;
+  std::map<const HloInstruction*, int64> hlo_to_flop_count_;
+  std::map<const HloInstruction*, int64> hlo_to_transcendental_count_;
+  std::map<const HloInstruction*, int64> hlo_to_bytes_accessed_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(HloCostAnalysis);
 };
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
index f55d939b42e..b74c7eb4e07 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
@@ -39,6 +39,12 @@ limitations under the License.
 namespace xla {
 namespace {
 
+constexpr int64 kPointerSize = 8;
+
+int64 ShapeSize(const Shape& shape) {
+  return ShapeUtil::ByteSizeOf(shape, kPointerSize);
+}
+
 // This test suite tests the HLO cost analysis by first building a computation
 // using the client computation builder and running the HloCostAnalysis that
 // returns the number of floating point and transcendental operations in the
@@ -48,7 +54,7 @@ class HloCostAnalysisTest : public ::testing::Test {
   HloCostAnalysisTest()
       : client_(ClientLibrary::LocalClientOrDie()),
         // Accessing service instance is required for the unit tests to enable
-        // whitebox acccesses to the user computation built from the client,
+        // whitebox accesses to the user computation built from the client,
        // as shown in the BuildHloGraph functions below.
        service_(static_cast<Service*>(ClientLibrary::GetXlaService(
            static_cast<LocalClient*>(client_)->platform()))),
@@ -121,7 +127,8 @@ class HloCostAnalysisTest : public ::testing::Test {
     VersionedComputationHandle versioned_handle =
         user_computation->GetVersionedHandle();
     return std::move(
-        computation_tracker_.BuildHloModule(versioned_handle).ValueOrDie());
+        computation_tracker_.BuildHloModule(versioned_handle, HloModuleConfig())
+            .ValueOrDie());
   }
 
   Client* client_;
@@ -144,12 +151,18 @@ TEST_F(HloCostAnalysisTest, MatrixMultiply) {
 
   // Run HLO cost analysis.
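
Each test below drives the analysis the same way: build the graph, construct a visitor with the ShapeSize helper defined above, and walk the entry root. A minimal sketch of that pattern, with the worked numbers behind the MatrixMultiply expectations:

```cpp
// Sketch of the test pattern, assuming hlo_module was built by BuildHloGraph.
// For the [10,5] x [5,30] dot: flop_count = 10 * 30 * 5 * kFmaFlops = 3000
// (1500 FMAs), bytes_accessed = sizeof(float) * (10*5 + 5*30 + 10*30) = 2000.
HloCostAnalysis analysis(ShapeSize);
TF_CHECK_OK(
    hlo_module->entry_computation()->root_instruction()->Accept(&analysis));
VLOG(1) << "flops=" << analysis.flop_count()
        << " transcendentals=" << analysis.transcendental_count()
        << " bytes=" << analysis.bytes_accessed();
```
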
auto hlo_module = BuildHloGraph(&builder); - HloCostAnalysis analysis; + HloCostAnalysis analysis(ShapeSize); ASSERT_IS_OK( hlo_module->entry_computation()->root_instruction()->Accept(&analysis)); // Check the number of computations returned from the analysis (1500 FMAs). EXPECT_EQ(analysis.flop_count(), 2 * 10 * 30 * 5); + + EXPECT_EQ(analysis.transcendental_count(), 0); + + // Bytes accessed is sum of inputs and output. + EXPECT_EQ(analysis.bytes_accessed(), + sizeof(float) * (10 * 5 + 5 * 30 + 10 * 30)); } TEST_F(HloCostAnalysisTest, Map) { @@ -159,13 +172,14 @@ TEST_F(HloCostAnalysisTest, Map) { // Run HLO cost analysis. auto hlo_module = BuildHloGraph(&builder); - HloCostAnalysis analysis; + HloCostAnalysis analysis(ShapeSize); ASSERT_IS_OK( hlo_module->entry_computation()->root_instruction()->Accept(&analysis)); // add contributes to 10 flops and exp contributes to 10 transcendental ops. EXPECT_EQ(analysis.flop_count(), 10); EXPECT_EQ(analysis.transcendental_count(), 10); + EXPECT_EQ(analysis.bytes_accessed(), 80); } TEST_F(HloCostAnalysisTest, Convolution) { @@ -182,13 +196,17 @@ TEST_F(HloCostAnalysisTest, Convolution) { // Run HLO cost analysis. auto hlo_module = BuildHloGraph(&builder); - HloCostAnalysis analysis; + HloCostAnalysis analysis(ShapeSize); ASSERT_IS_OK( hlo_module->entry_computation()->root_instruction()->Accept(&analysis)); // Output shape is [1x1x8x18] and each output element requires (3x3) // FMAs and one FMA is 2 flops. EXPECT_EQ(analysis.flop_count(), 8 * 18 * 2 * 3 * 3); + + // Bytes accessed is sum of inputs and output. + EXPECT_EQ(analysis.bytes_accessed(), + sizeof(float) * (10 * 20 + 3 * 3 + 8 * 18)); } TEST_F(HloCostAnalysisTest, Reduce) { @@ -200,7 +218,7 @@ TEST_F(HloCostAnalysisTest, Reduce) { // Run HLO cost analysis. auto hlo_module = BuildHloGraph(&builder); - HloCostAnalysis analysis; + HloCostAnalysis analysis(ShapeSize); ASSERT_IS_OK( hlo_module->entry_computation()->root_instruction()->Accept(&analysis)); @@ -218,7 +236,7 @@ TEST_F(HloCostAnalysisTest, ReduceWindow) { // Run HLO cost analysis. auto hlo_module = BuildHloGraph(&builder); - HloCostAnalysis analysis; + HloCostAnalysis analysis(ShapeSize); ASSERT_IS_OK( hlo_module->entry_computation()->root_instruction()->Accept(&analysis)); @@ -238,7 +256,7 @@ TEST_F(HloCostAnalysisTest, SelectAndScatter) { // Run HLO cost analysis. auto hlo_module = BuildHloGraph(&builder); - HloCostAnalysis analysis; + HloCostAnalysis analysis(ShapeSize); ASSERT_IS_OK( hlo_module->entry_computation()->root_instruction()->Accept(&analysis)); @@ -251,7 +269,7 @@ TEST_F(HloCostAnalysisTest, Broadcast) { ComputationBuilder b(client_, "broadcast"); b.Broadcast(b.ConstantR0(42), {10, 7}); auto hlo_module = BuildHloGraph(&b); - HloCostAnalysis analysis; + HloCostAnalysis analysis(ShapeSize); ASSERT_IS_OK( hlo_module->entry_computation()->root_instruction()->Accept(&analysis)); EXPECT_EQ(analysis.flop_count(), 0); @@ -271,7 +289,7 @@ TEST_F(HloCostAnalysisTest, FullyConnectedForward) { // Run HLO cost analysis. 
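
The Convolution expectations above follow directly from HandleConvolution's formula; spelled out with that test's shapes (a worked restatement, not new code):

```cpp
// Worked numbers for the Convolution test above:
const int64 output_features = 1;                                    // [3,3] kernel
const int64 fmas_per_output_element = 3 * 3 / output_features;      // = 9
const int64 output_elements = 1 * 1 * 8 * 18;                       // = 144
const int64 flops = output_elements * fmas_per_output_element * 2;  // kFmaFlops
// flops == 2592 == 8 * 18 * 2 * 3 * 3, and bytes_accessed sums the three
// float buffers: 4 * (10*20 + 3*3 + 8*18) == 1412.
```
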
auto hlo_module = BuildHloGraph(&builder); - HloCostAnalysis analysis; + HloCostAnalysis analysis(ShapeSize); ASSERT_IS_OK( hlo_module->entry_computation()->root_instruction()->Accept(&analysis)); @@ -282,7 +300,7 @@ TEST_F(HloCostAnalysisTest, FullyConnectedForward) { } TEST_F(HloCostAnalysisTest, MatmulAndConvolutionCanBeTheSameComputation) { - HloCostAnalysis conv_analysis; + HloCostAnalysis conv_analysis(ShapeSize); { ComputationBuilder builder(client_, "conv_looking_matmul"); auto lhs = builder.Parameter(0, ShapeUtil::MakeShape(F32, {64, 64, 1, 1}), @@ -295,7 +313,7 @@ TEST_F(HloCostAnalysisTest, MatmulAndConvolutionCanBeTheSameComputation) { &conv_analysis)); } - HloCostAnalysis matmul_analysis; + HloCostAnalysis matmul_analysis(ShapeSize); { ComputationBuilder builder(client_, "matmul"); auto lhs = @@ -311,28 +329,6 @@ TEST_F(HloCostAnalysisTest, MatmulAndConvolutionCanBeTheSameComputation) { EXPECT_EQ(conv_analysis.flop_count(), matmul_analysis.flop_count()); } -// Note that we still expect that any given operation won't overflow 2^64 FLOPs, -// just that the sum total may. -TEST_F(HloCostAnalysisTest, TotalOverflowsInt64) { - HloCostAnalysis matmul_analysis; - { - ComputationBuilder builder(client_, "matmul"); - auto lhs = builder.Parameter(0, ShapeUtil::MakeShape(F32, {1, 1LL << 62}), - "input"); - auto rhs = builder.Parameter(1, ShapeUtil::MakeShape(F32, {1LL << 62, 1}), - "weights"); - auto a = builder.Dot(lhs, rhs); - auto b = builder.Dot(a, lhs); - builder.Dot(b, rhs); - auto hlo_module = BuildHloGraph(&builder); - ASSERT_IS_OK(hlo_module->entry_computation()->root_instruction()->Accept( - &matmul_analysis)); - } - - LOG(INFO) << matmul_analysis.flop_count(); - EXPECT_GT(matmul_analysis.flop_count(), std::numeric_limits::max()); -} - using FusionCostAnalysis = ::testing::Test; TEST_F(FusionCostAnalysis, LoopFusion) { @@ -373,12 +369,57 @@ TEST_F(FusionCostAnalysis, LoopFusion) { fusion->FuseInstruction(clamp.get()); fusion->FuseInstruction(add.get()); - HloCostAnalysis fusion_analysis; + HloCostAnalysis fusion_analysis(ShapeSize); ASSERT_IS_OK(fusion->Accept(&fusion_analysis)); EXPECT_EQ(fusion_analysis.flop_count(), 16); EXPECT_EQ(fusion_analysis.transcendental_count(), 4); } +TEST_F(FusionCostAnalysis, NoLayout) { + Shape shape_with_layout = ShapeUtil::MakeShape(F32, {2, 3, 4, 5}); + // Instructions within a fused op may have no layout. 
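
The expectations asserted at the end of this test fall out as follows (a worked note; the reasoning mirrors HandleFusion above):

```cpp
// Worked numbers for the NoLayout test: the fused add runs once per element
// of the [2,3,4,5] output and the fusion contains no transcendental ops, so
//   flop_count           = 2 * 3 * 4 * 5 = 120
//   transcendental_count = 0
// The missing layout on the broadcast operand is harmless because the fusion
// visitor's shape-size function ([](const Shape&) { return 0; }) never
// inspects the shape.
```
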
+ Shape shape_without_layout = shape_with_layout; + shape_without_layout.clear_layout(); + + auto c1 = HloInstruction::CreateConstant( + LiteralUtil::CreateR4FromArray4D(Array4D(2, 3, 4, 5))); + auto c2 = + HloInstruction::CreateConstant(LiteralUtil::CreateR1({1, 2, 3})); + + auto broadcast = + HloInstruction::CreateBroadcast(shape_without_layout, c2.get(), {1}); + auto add = HloInstruction::CreateBinary(shape_with_layout, HloOpcode::kAdd, + c1.get(), broadcast.get()); + + auto fusion = HloInstruction::CreateFusion( + shape_with_layout, HloInstruction::FusionKind::kLoop, add.get()); + fusion->FuseInstruction(broadcast.get()); + + HloCostAnalysis fusion_analysis(ShapeSize); + ASSERT_IS_OK(fusion->Accept(&fusion_analysis)); + + EXPECT_EQ(fusion_analysis.flop_count(), 120); + EXPECT_EQ(fusion_analysis.transcendental_count(), 0); +} + +TEST_F(HloCostAnalysisTest, TupleCost) { + HloCostAnalysis analysis(ShapeSize); + { + ComputationBuilder builder(client_, "matmul"); + auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {123}), "x"); + auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {42}), "y"); + auto tuple = builder.Tuple({x, y}); + auto hlo_module = BuildHloGraph(&builder); + + ASSERT_IS_OK( + hlo_module->entry_computation()->root_instruction()->Accept(&analysis)); + } + + EXPECT_EQ(analysis.flop_count(), 0); + EXPECT_EQ(analysis.transcendental_count(), 0); + EXPECT_EQ(analysis.bytes_accessed(), kPointerSize * 2); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_cse_test.cc b/tensorflow/compiler/xla/service/hlo_cse_test.cc index ec8161f55fd..cc39c3ac203 100644 --- a/tensorflow/compiler/xla/service/hlo_cse_test.cc +++ b/tensorflow/compiler/xla/service/hlo_cse_test.cc @@ -25,6 +25,7 @@ limitations under the License. #include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_matchers.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -36,6 +37,8 @@ limitations under the License. 
#include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/platform/types.h" +namespace op = xla::testing::opcode_matchers; + namespace xla { namespace { @@ -54,7 +57,7 @@ TEST_F(HloCseTest, CombineTwoConstants) { builder.AddInstruction(HloInstruction::CreateBinary( constant1->shape(), HloOpcode::kAdd, constant1, constant2)); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(3, computation->instruction_count()); @@ -84,17 +87,19 @@ TEST_F(HloCseTest, CombineTwoConstantsDifferentLayoutsAndInsensitive) { auto add = builder.AddInstruction(HloInstruction::CreateBinary( constant1->shape(), HloOpcode::kAdd, constant1, constant2)); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(3, computation->instruction_count()); - EXPECT_NE(add->operand(0), add->operand(1)); + EXPECT_THAT(add, op::Add(constant1, constant2)); HloCSE cse(/*is_layout_sensitive=*/false); EXPECT_TRUE(cse.Run(module.get()).ValueOrDie()); EXPECT_EQ(2, computation->instruction_count()); - EXPECT_EQ(add->operand(0), add->operand(1)); + auto first_operand = add->operand(0); + EXPECT_THAT(first_operand, ::testing::AnyOf(constant1, constant2)); + EXPECT_THAT(add, op::Add(first_operand, first_operand)); auto result = ExecuteAndTransfer(std::move(module), {}); auto expected = LiteralUtil::CreateR2({{2.0, 4.0}, {6.0, 8.0}}); @@ -114,19 +119,17 @@ TEST_F(HloCseTest, CombineTwoConstantsDifferentLayoutsAndSensitive) { auto add = builder.AddInstruction(HloInstruction::CreateBinary( constant1->shape(), HloOpcode::kAdd, constant1, constant2)); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(3, computation->instruction_count()); - EXPECT_EQ(constant1, add->operand(0)); - EXPECT_EQ(constant2, add->operand(1)); + EXPECT_THAT(add, op::Add(constant1, constant2)); HloCSE cse(/*is_layout_sensitive=*/true); EXPECT_FALSE(cse.Run(module.get()).ValueOrDie()); EXPECT_EQ(3, computation->instruction_count()); - EXPECT_EQ(constant1, add->operand(0)); - EXPECT_EQ(constant2, add->operand(1)); + EXPECT_THAT(add, op::Add(constant1, constant2)); auto result = ExecuteAndTransfer(std::move(module), {}); auto expected = LiteralUtil::CreateR2({{2.0, 4.0}, {6.0, 8.0}}); @@ -153,13 +156,13 @@ TEST_F(HloCseTest, ConstantsSameValueDifferentType) { builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); - HloModule module(TestName()); - auto computation = module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(7, computation->instruction_count()); HloCSE cse(/*is_layout_sensitive=*/false); - EXPECT_TRUE(cse.Run(&module).ValueOrDie()); + EXPECT_TRUE(cse.Run(module.get()).ValueOrDie()); EXPECT_EQ(6, computation->instruction_count()); } @@ -181,20 +184,22 @@ TEST_F(HloCseTest, NonscalarConstants) { auto tuple = builder.AddInstruction(HloInstruction::CreateTuple( {common_constant1, common_constant2, uncommon_constant})); - HloModule module(TestName()); - auto computation = module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(4, computation->instruction_count()); + EXPECT_THAT(tuple, + op::Tuple(common_constant1, common_constant2, 
uncommon_constant)); HloCSE cse(/*is_layout_sensitive=*/false); - EXPECT_TRUE(cse.Run(&module).ValueOrDie()); + EXPECT_TRUE(cse.Run(module.get()).ValueOrDie()); EXPECT_EQ(3, computation->instruction_count()); - - EXPECT_EQ(tuple->operand(0), tuple->operand(1)); - EXPECT_EQ(uncommon_constant, tuple->operand(2)); - EXPECT_TRUE(tuple->operand(0) == common_constant1 || - tuple->operand(0) == common_constant2); + auto first_operand = tuple->operand(0); + EXPECT_THAT(first_operand, + ::testing::AnyOf(common_constant1, common_constant2)); + EXPECT_THAT(tuple, + op::Tuple(first_operand, first_operand, uncommon_constant)); } TEST_F(HloCseTest, IdenticalInstructions) { @@ -211,20 +216,19 @@ TEST_F(HloCseTest, IdenticalInstructions) { auto tuple = builder.AddInstruction(HloInstruction::CreateTuple({exp1, exp2, exp3})); - HloModule module(TestName()); - auto computation = module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(5, computation->instruction_count()); - EXPECT_NE(tuple->operand(0), tuple->operand(1)); - EXPECT_NE(tuple->operand(1), tuple->operand(2)); - EXPECT_NE(tuple->operand(0), tuple->operand(2)); + EXPECT_THAT(tuple, op::Tuple(exp1, exp2, exp3)); HloCSE cse(/*is_layout_sensitive=*/false); - EXPECT_TRUE(cse.Run(&module).ValueOrDie()); + EXPECT_TRUE(cse.Run(module.get()).ValueOrDie()); EXPECT_EQ(3, computation->instruction_count()); - EXPECT_EQ(tuple->operand(0), tuple->operand(1)); - EXPECT_EQ(tuple->operand(1), tuple->operand(2)); + auto first_operand = tuple->operand(0); + EXPECT_THAT(first_operand, ::testing::AnyOf(exp1, exp2, exp3)); + EXPECT_THAT(tuple, op::Tuple(first_operand, first_operand, first_operand)); } TEST_F(HloCseTest, IdenticalInstructionsDifferentLayoutsSensitive) { @@ -245,17 +249,17 @@ TEST_F(HloCseTest, IdenticalInstructionsDifferentLayoutsSensitive) { auto tuple = builder.AddInstruction(HloInstruction::CreateTuple({exp1, exp2})); - HloModule module(TestName()); - auto computation = module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(4, computation->instruction_count()); - EXPECT_NE(tuple->operand(0), tuple->operand(1)); + EXPECT_THAT(tuple, op::Tuple(exp1, exp2)); HloCSE cse(/*is_layout_sensitive=*/true); - EXPECT_FALSE(cse.Run(&module).ValueOrDie()); + EXPECT_FALSE(cse.Run(module.get()).ValueOrDie()); EXPECT_EQ(4, computation->instruction_count()); - EXPECT_NE(tuple->operand(0), tuple->operand(1)); + EXPECT_THAT(tuple, op::Tuple(exp1, exp2)); } TEST_F(HloCseTest, IdenticalInstructionsDifferentLayoutsInsensitive) { @@ -276,17 +280,19 @@ TEST_F(HloCseTest, IdenticalInstructionsDifferentLayoutsInsensitive) { auto tuple = builder.AddInstruction(HloInstruction::CreateTuple({exp1, exp2})); - HloModule module(TestName()); - auto computation = module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(4, computation->instruction_count()); - EXPECT_NE(tuple->operand(0), tuple->operand(1)); + EXPECT_THAT(tuple, op::Tuple(exp1, exp2)); HloCSE cse(/*is_layout_sensitive=*/false); - EXPECT_TRUE(cse.Run(&module).ValueOrDie()); + EXPECT_TRUE(cse.Run(module.get()).ValueOrDie()); EXPECT_EQ(3, computation->instruction_count()); - EXPECT_EQ(tuple->operand(0), tuple->operand(1)); + auto first_operand = tuple->operand(0); + EXPECT_THAT(first_operand, ::testing::AnyOf(exp1, exp2)); + 
EXPECT_THAT(tuple, op::Tuple(first_operand, first_operand)); } TEST_F(HloCseTest, IdenticalExpressions) { @@ -324,18 +330,19 @@ TEST_F(HloCseTest, IdenticalExpressions) { auto tuple = builder.AddInstruction(HloInstruction::CreateTuple({add1, add2})); - HloModule module(TestName()); - auto computation = module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(8, computation->instruction_count()); - EXPECT_NE(tuple->operand(0), tuple->operand(1)); + EXPECT_THAT(tuple, op::Tuple(op::Add(negate1, exp1), op::Add(negate2, exp2))); HloCSE cse(/*is_layout_sensitive=*/false); - EXPECT_TRUE(cse.Run(&module).ValueOrDie()); + EXPECT_TRUE(cse.Run(module.get()).ValueOrDie()); EXPECT_EQ(5, computation->instruction_count()); - EXPECT_EQ(tuple->operand(0), tuple->operand(1)); - EXPECT_EQ(HloOpcode::kAdd, tuple->operand(0)->opcode()); + auto operand = tuple->operand(0); + EXPECT_THAT(tuple, op::Tuple(operand, operand)); + EXPECT_THAT(operand, op::Add(op::Negate(), op::Exp())); } TEST_F(HloCseTest, DoNotCombineRng) { @@ -351,12 +358,16 @@ TEST_F(HloCseTest, DoNotCombineRng) { auto rng2 = builder.AddInstruction(HloInstruction::CreateRng( ShapeUtil::MakeShape(F32, {}), RandomDistribution::RNG_UNIFORM, {constant1, constant2})); + builder.AddInstruction(HloInstruction::CreateBinary( constant1->shape(), HloOpcode::kAdd, rng1, rng2)); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); auto computation = module->AddEntryComputation(builder.Build()); + HloInstruction* root = computation->root_instruction(); + EXPECT_THAT(root, op::Add(rng1, rng2)); + uint32 count_before = computation->instruction_count(); HloCSE cse(/*is_layout_sensitive=*/false); @@ -364,11 +375,8 @@ TEST_F(HloCseTest, DoNotCombineRng) { uint32 count_after = computation->instruction_count(); EXPECT_EQ(count_before, count_after); - HloInstruction* root = computation->root_instruction(); - EXPECT_EQ(root->opcode(), HloOpcode::kAdd); - EXPECT_EQ(root->operand(0)->opcode(), HloOpcode::kRng); - EXPECT_EQ(root->operand(1)->opcode(), HloOpcode::kRng); - EXPECT_NE(root->operand(0), root->operand(1)); + root = computation->root_instruction(); + EXPECT_THAT(root, op::Add(rng1, rng2)); } // TODO(b/28245743): Handle impure functions correctly in CSE. @@ -376,7 +384,7 @@ TEST_F(HloCseTest, DISABLED_DoNotCombineCallsToImpureFunctions) { // Test that two calls to an impure function are not commoned. RNG // is the source of the impurity. - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); // rng_function is an impure function because it does RNG. 
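
The rewritten CSE assertions above and below rely on the gmock-style opcode matchers pulled in via hlo_matchers.h. A short sketch of the two matcher forms, assuming an op::Rng() opcode matcher is generated like the others:

```cpp
namespace op = xla::testing::opcode_matchers;
// Pointer-identity form: the root must be an Add of these exact instructions.
EXPECT_THAT(root, op::Add(rng1, rng2));
// Opcode-only form: operands only need matching opcodes, not identity.
EXPECT_THAT(root, op::Add(op::Rng(), op::Rng()));
```
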
HloComputation* rng_function = nullptr; @@ -412,17 +420,22 @@ TEST_F(HloCseTest, DISABLED_DoNotCombineCallsToImpureFunctions) { } EXPECT_EQ(4, computation->instruction_count()); + HloInstruction* root = computation->root_instruction(); + EXPECT_THAT(root, op::Add(op::Map(), op::Map())); HloCSE cse(/*is_layout_sensitive=*/false); EXPECT_TRUE(cse.Run(module.get()).ValueOrDie()); EXPECT_EQ(4, computation->instruction_count()); - HloInstruction* root = computation->root_instruction(); - EXPECT_EQ(root->opcode(), HloOpcode::kAdd); - EXPECT_EQ(root->operand(0)->opcode(), HloOpcode::kMap); - EXPECT_EQ(root->operand(1)->opcode(), HloOpcode::kMap); - EXPECT_NE(root->operand(0), root->operand(1)); + root = computation->root_instruction(); + auto operand = root->operand(0)->operand(0); + EXPECT_THAT(operand, op::Map()); + EXPECT_THAT(root, op::Add(operand, operand)); } } // namespace } // namespace xla + +int main(int argc, char** argv) { + return xla::ParseDebugOptionsFlagsAndRunTests(argc, argv); +} diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc new file mode 100644 index 00000000000..d1b87256445 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc @@ -0,0 +1,834 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h" + +#include +#include +#include +#include +#include + +#include "tensorflow/compiler/xla/map_util.h" +#include "tensorflow/compiler/xla/ptr_util.h" +#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/liveness_util.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/lib/strings/stringprintf.h" +#include "tensorflow/core/platform/logging.h" + +namespace xla { + +using ::tensorflow::strings::StrAppend; +using ::tensorflow::strings::StrCat; + +string HloLocation::ToString() const { + string index_str = + ShapeUtil::IsTuple(instruction->shape()) ? (" " + index.ToString()) : ""; + return StrCat(instruction->FullyQualifiedName(), index_str); +} + +std::ostream& operator<<(std::ostream& out, const HloLocation& location) { + out << location.ToString(); + return out; +} + +string HloUse::ToString() const { + string index_str = + ShapeUtil::IsTuple(instruction->operand(operand_number)->shape()) + ? 
(" " + operand_index.ToString()) + : ""; + return StrCat(instruction->FullyQualifiedName(), ", operand ", operand_number, + index_str); +} + +std::ostream& operator<<(std::ostream& out, const HloUse& use) { + out << use.ToString(); + return out; +} + +HloValue::HloValue(HloValue::Id id, HloInstruction* instruction, + const ShapeIndex& index, bool is_phi) + : id_(id), is_phi_(is_phi) { + // The defining location is always the first element in the locations_ vector. + AddLocation(instruction, index); +} + +bool HloValue::operator==(const HloValue& other) const { + bool equal = instruction() == other.instruction() && index() == other.index(); + // If the values are equal they most both be phi (or non phi). + CHECK(!(equal && is_phi() != other.is_phi())); + return equal; +} + +bool HloValue::operator!=(const HloValue& other) const { + return !(*this == other); +} + +string HloValue::ToShortString() const { + string index_str = + ShapeUtil::IsTuple(instruction()->shape()) ? index().ToString() : ""; + return StrCat(is_phi_ ? "PHI " : "", instruction()->FullyQualifiedName(), + index_str); +} + +string HloValue::ToString(int indent) const { + string indentation(indent, ' '); + string out = StrCat(indentation, ToShortString(), ", locations:\n"); + for (const HloLocation& location : locations()) { + StrAppend(&out, indentation, " ", location.ToString(), "\n"); + } + StrAppend(&out, indentation, " uses:\n"); + for (const HloUse& use : uses()) { + StrAppend(&out, indentation, " ", use.ToString(), "\n"); + } + return out; +} + +void HloValue::AddLocation(HloInstruction* instruction, + const ShapeIndex& index) { + // The given location should not already exist in locations_. + for (const HloLocation& location : locations_) { + DCHECK(!(location.instruction == instruction && location.index == index)); + } + + locations_.push_back(HloLocation{instruction, index}); + + // Update uses. + for (HloInstruction* user : instruction->users()) { + for (int64 operand_number : user->OperandIndices(instruction)) { + if (!DoesNotUseOperandBuffer(instruction, index, user)) { + for (const HloUse& use : uses_) { + // Verify that this use does not already exist. + DCHECK(!(use.instruction == user && + use.operand_number == operand_number && + use.operand_index == index)); + } + + uses_.push_back(HloUse{user, operand_number, index}); + } + } + } + + // Update liveout status of this HloValue. + const HloModule& module = *instruction->parent()->parent(); + if (instruction == module.entry_computation()->root_instruction()) { + live_out_of_module_ = true; + } +} + +void HloValue::RemoveLocation(HloInstruction* instruction, + const ShapeIndex& index) { + // The defining location cannot be removed. + CHECK(!(instruction == this->instruction() && index == this->index())); + + int64 size_before = locations_.size(); + locations_.erase( + std::remove_if(locations_.begin(), locations_.end(), + [instruction, &index](const HloLocation& location) { + return location.instruction == instruction && + location.index == index; + }), + locations_.end()); + // Only a single location should have been removed. + CHECK_EQ(locations_.size(), size_before - 1); + + // Update uses which referred to this location. 
+ uses_.erase(std::remove_if(uses_.begin(), uses_.end(), + [instruction, &index](const HloUse& use) { + return use.instruction->operand( + use.operand_number) == instruction && + use.operand_index == index; + }), + uses_.end()); + + const HloModule& module = *instruction->parent()->parent(); + if (instruction == module.entry_computation()->root_instruction()) { + // Value has been removed from a location in the entry root instruction. + // Check if the value is still live out of the module by walking all + // remaining locations. + live_out_of_module_ = false; + for (const HloLocation& location : locations()) { + if (location.instruction == + module.entry_computation()->root_instruction()) { + live_out_of_module_ = true; + break; + } + } + } +} + +std::ostream& operator<<(std::ostream& out, const HloValue& value) { + out << value.ToShortString(); + return out; +} + +void HloValueSet::SortAndUniquifyValues() { + std::sort(value_ids_.begin(), value_ids_.end()); + value_ids_.erase(std::unique(value_ids_.begin(), value_ids_.end()), + value_ids_.end()); +} + +string HloValueSet::ToString() const { + return StrCat("HloValueSet: ", tensorflow::str_util::Join(value_ids_, ", ")); +} + +/*static */ +HloValueSet HloValueSet::Union( + tensorflow::gtl::ArraySlice inputs) { + HloValueSet union_set; + for (const HloValueSet* input : inputs) { + for (HloValue::Id value_id : input->value_ids()) { + union_set.value_ids_.push_back(value_id); + } + } + union_set.SortAndUniquifyValues(); + return union_set; +} + +std::ostream& operator<<(std::ostream& out, const HloValueSet& value_set) { + out << value_set.ToString(); + return out; +} + +InstructionValueSet InstructionValueSet::Union( + tensorflow::gtl::ArraySlice inputs) { + CHECK_GT(inputs.size(), 0); + for (int i = 1; i < inputs.size(); ++i) { + CHECK(ShapeUtil::Compatible(inputs[0]->shape(), inputs[i]->shape())); + } + InstructionValueSet union_set(inputs[0]->shape()); + union_set.ForEachMutableElement( + [&inputs](const ShapeIndex& index, HloValueSet* value_set) { + std::vector input_sets; + for (const InstructionValueSet* input : inputs) { + input_sets.push_back(&input->element(index)); + } + *value_set = HloValueSet::Union(input_sets); + }); + return union_set; +} + +std::ostream& operator<<(std::ostream& out, + const InstructionValueSet& instruction_value_set) { + out << instruction_value_set.ToString(); + return out; +} + +string InstructionValueSet::ToString() const { + string out = + StrCat("InstructionValueSet(", ShapeUtil::HumanString(shape()), ")\n"); + ForEachElement([this, &out](const ShapeIndex& index, + const HloValueSet& value_set) { + StrAppend(&out, " ", index.ToString(), " : ", value_set.ToString(), "\n"); + }); + return out; +} + +HloDataflowAnalysis::HloDataflowAnalysis(HloModule* module, bool ssa_form, + bool bitcast_defines_value) + : module_(module), + ssa_form_(ssa_form), + bitcast_defines_value_(bitcast_defines_value), + call_graph_(CallGraph::Build(module)) {} + +bool HloDataflowAnalysis::ValueIsDefinedAt(const HloInstruction* instruction, + const ShapeIndex& index) const { + const HloValueSet& value_set = GetValueSet(instruction, index); + if (value_set.value_ids().size() != 1) { + return false; + } + return GetValue(value_set.GetUniqueValueId()).instruction() == instruction; +} + +const HloValue& HloDataflowAnalysis::GetValueDefinedAt( + const HloInstruction* instruction, const ShapeIndex& index) const { + CHECK(ValueIsDefinedAt(instruction, index)); + return GetUniqueValueAt(instruction, index); +} + +HloValue& 
HloDataflowAnalysis::GetValueDefinedAt( + const HloInstruction* instruction, const ShapeIndex& index) { + CHECK(ValueIsDefinedAt(instruction, index)); + return GetUniqueValueAt(instruction, index); +} + +HloValue::Id HloDataflowAnalysis::NewHloValue(HloInstruction* instruction, + const ShapeIndex& index, + bool is_phi) { + int64 value_id = next_value_id_++; + auto it_added = values_.emplace( + std::piecewise_construct, std::forward_as_tuple(value_id), + std::forward_as_tuple(value_id, instruction, index, is_phi)); + CHECK(it_added.second); + + // Clear the vector of values as it is now stale. It will be lazily + // reconstructed if needed when HloDataflowAnalysis::values() is called. + values_vector_.clear(); + + return value_id; +} + +void HloDataflowAnalysis::DeleteHloValue(HloValue::Id value_id) { + values_.erase(value_id); + + // Clear the vector of values as it is now stale. It will be lazily + // reconstructed if needed when HloDataflowAnalysis::values() is called. + values_vector_.clear(); +} + +string HloDataflowAnalysis::ToString() const { + string out = StrCat("HloDataflowAnalysis, module ", module_->name(), "\n"); + StrAppend(&out, " Instruction value sets:\n"); + for (const std::unique_ptr& computation : + module_->computations()) { + for (const std::unique_ptr& instruction : + computation->instructions()) { + StrAppend(&out, " ", instruction->FullyQualifiedName(), ":\n"); + if (ShapeUtil::IsTuple(instruction->shape())) { + GetInstructionValueSet(instruction.get()) + .ForEachElement([this, &instruction, &out]( + const ShapeIndex& index, + const HloValueSet& value_set) { + StrAppend(&out, " tuple index ", index.ToString(), ":\n"); + for (HloValue::Id value_id : value_set.value_ids()) { + StrAppend( + &out, " ", GetValue(value_id).ToShortString(), + ValueIsDefinedAt(instruction.get(), index) ? " (def)" : "", + "\n"); + } + }); + } else { + const HloValueSet& top_level_value_set = + GetValueSet(instruction.get(), /*index=*/{}); + for (HloValue::Id value_id : top_level_value_set.value_ids()) { + StrAppend(&out, " ", GetValue(value_id).ToShortString(), + ValueIsDefinedAt(instruction.get()) ? " (def)" : "", "\n"); + } + } + } + } + StrAppend(&out, " HloValues:\n"); + for (const auto& pair : values_) { + StrAppend(&out, pair.second.ToString(/*indent=*/4)); + } + return out; +} + +const HloValue& HloDataflowAnalysis::GetValue(HloValue::Id value_id) const { + return values_.at(value_id); +} + +HloValue& HloDataflowAnalysis::GetValue(HloValue::Id value_id) { + return values_.at(value_id); +} + +const HloValueSet& HloDataflowAnalysis::GetValueSet( + const HloInstruction* instruction, const ShapeIndex& index) const { + return GetInstructionValueSet(instruction).element(index); +} + +HloValueSet& HloDataflowAnalysis::GetValueSet(const HloInstruction* instruction, + const ShapeIndex& index) { + return *GetInstructionValueSet(instruction).mutable_element(index); +} + +const std::vector& HloDataflowAnalysis::values() const { + if (values_vector_.empty()) { + // Lazily construct vector of values. 
+    values_vector_.reserve(values_.size());
+    for (auto& pair : values_) {
+      values_vector_.push_back(&pair.second);
+    }
+    std::sort(
+        values_vector_.begin(), values_vector_.end(),
+        [](const HloValue* a, const HloValue* b) { return a->id() < b->id(); });
+  } else {
+    CHECK_EQ(values_vector_.size(), values_.size());
+    for (const HloValue* value : values_vector_) {
+      DCHECK(ContainsKey(values_, value->id()));
+      DCHECK(&GetValue(value->id()) == value);
+    }
+  }
+  return values_vector_;
+}
+
+InstructionValueSet HloDataflowAnalysis::Phi(
+    HloInstruction* instruction,
+    tensorflow::gtl::ArraySlice<const InstructionValueSet*> inputs,
+    bool skip_top_level) {
+  CHECK(ssa_form_);
+
+  for (const InstructionValueSet* input : inputs) {
+    CHECK(ShapeUtil::Compatible(instruction->shape(), input->shape()));
+  }
+  InstructionValueSet new_value_set(instruction->shape());
+  new_value_set.ForEachMutableElement(
+      [this, instruction, &inputs, skip_top_level](const ShapeIndex& index,
+                                                   HloValueSet* value_set) {
+        // If we're skipping the top level, just copy over the existing
+        // HloValueSet.
+        if (skip_top_level && index.empty()) {
+          *value_set = GetInstructionValueSet(instruction).element(index);
+          return;
+        }
+
+        // Identify the existing phi value at this index if it exists.
+        const HloValue* existing_phi_value = nullptr;
+        if (ValueIsDefinedAt(instruction, index) &&
+            GetUniqueValueAt(instruction, index).is_phi()) {
+          existing_phi_value = &GetUniqueValueAt(instruction, index);
+        }
+
+        // Construct a vector of unique value IDs of the inputs.
+        std::vector<HloValue::Id> input_value_ids;
+        for (const InstructionValueSet* input : inputs) {
+          for (HloValue::Id value_id : input->element(index).value_ids()) {
+            input_value_ids.push_back(value_id);
+          }
+        }
+        std::sort(input_value_ids.begin(), input_value_ids.end());
+        input_value_ids.erase(
+            std::unique(input_value_ids.begin(), input_value_ids.end()),
+            input_value_ids.end());
+
+        // Remove the existing phi value (if it exists). The phi can be its own
+        // input, for example, in while body parameters where the body passes
+        // through the parameter value.
+        if (existing_phi_value != nullptr) {
+          auto it = std::find(input_value_ids.begin(), input_value_ids.end(),
+                              existing_phi_value->id());
+          if (it != input_value_ids.end()) {
+            input_value_ids.erase(it);
+          }
+        }
+
+        if (input_value_ids.size() <= 1) {
+          if (input_value_ids.size() == 1) {
+            *value_set = HloValueSet({input_value_ids[0]});
+          }
+          if (existing_phi_value) {
+            // The merge point does not have multiple distinct inputs (which are
+            // not the phi value itself). Therefore there is no need to insert a
+            // phi value because there is a single reaching definition (or no
+            // reaching definition).
+            DeleteHloValue(existing_phi_value->id());
+          }
+        } else if (input_value_ids.size() > 1) {
+          // Multiple distinct values reach this point. A phi value is
+          // necessary.
+          if (existing_phi_value) {
+            // A phi value already exists so reuse it in the new
+            // InstructionValueSet.
+            *value_set = HloValueSet({existing_phi_value->id()});
+          } else {
+            // Create a new phi value.
+            *value_set =
+                HloValueSet({NewHloValue(instruction, index, /*is_phi=*/true)});
+          }
+        }
+      });
+  return new_value_set;
+}
+
+void HloDataflowAnalysis::UpdateLocationsOfValuesAt(
+    HloInstruction* instruction, const InstructionValueSet& new_value_set,
+    const InstructionValueSet* prev_value_set) {
+  if (prev_value_set != nullptr) {
+    // Remove locations from the old value set.
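
The deletion and reuse rules in Phi above are easiest to see at a while-body parameter; a worked trace (hypothetical value names):

```cpp
// Phi at a while-body parameter, index {} only:
//   inputs = {v_init} from the while operand, {v_back} from the body root.
// Case 1: the body transforms the value, so v_back is distinct from the phi.
//   Two distinct inputs remain -> a phi value is created (or reused).
// Case 2: the body passes the parameter through, so v_back *is* the phi.
//   It is erased from input_value_ids, leaving {v_init}; the phi is deleted
//   and v_init becomes the single reaching definition.
```
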
+ prev_value_set->ForEachElement( + [this, instruction](const ShapeIndex& index, + const HloValueSet& value_set) { + for (HloValue::Id value_id : value_set.value_ids()) { + // HloValues in the previous value set may have been deleted. + if (!ContainsKey(values_, value_id)) { + continue; + } + // Don't remove the defining location of the value. + HloValue& value = GetValue(value_id); + if (instruction == value.instruction()) { + CHECK_EQ(index, value.index()); + } else { + value.RemoveLocation(instruction, index); + } + } + }); + } + // Add locations in the new value set. + new_value_set.ForEachElement( + [this, instruction](const ShapeIndex& index, + const HloValueSet& value_set) { + for (HloValue::Id value_id : value_set.value_ids()) { + HloValue& value = GetValue(value_id); + if (instruction == value.instruction()) { + CHECK_EQ(index, value.index()); + } else { + value.AddLocation(instruction, index); + } + } + }); +} + +InstructionValueSet HloDataflowAnalysis::RecomputeBitcastValueSet( + HloInstruction* bitcast) { + CHECK_EQ(bitcast->opcode(), HloOpcode::kBitcast); + if (bitcast_defines_value_) { + return GetInstructionValueSet(bitcast); + } else { + return GetInstructionValueSet(bitcast->operand(0)); + } +} + +InstructionValueSet HloDataflowAnalysis::RecomputeCopyValueSet( + HloInstruction* copy) { + CHECK_EQ(copy->opcode(), HloOpcode::kCopy); + InstructionValueSet new_value_set = GetInstructionValueSet(copy); + if (ShapeUtil::IsTuple(copy->shape())) { + for (int i = 0; i < ShapeUtil::TupleElementCount(copy->shape()); ++i) { + new_value_set.CopySubtreeFrom(GetInstructionValueSet(copy->operand(0)), + /*source_base_index=*/{i}, + /*target_base_index=*/{i}); + } + } + return new_value_set; +} + +InstructionValueSet HloDataflowAnalysis::RecomputeGetTupleElementValueSet( + HloInstruction* gte) { + CHECK_EQ(gte->opcode(), HloOpcode::kGetTupleElement); + InstructionValueSet new_value_set(gte->shape()); + new_value_set.CopySubtreeFrom(GetInstructionValueSet(gte->operand(0)), + /*source_base_index=*/{gte->tuple_index()}, + /*target_base_index=*/{}); + return new_value_set; +} + +InstructionValueSet HloDataflowAnalysis::RecomputeSelectValueSet( + HloInstruction* select) { + CHECK_EQ(select->opcode(), HloOpcode::kSelect); + std::vector inputs = { + &GetInstructionValueSet(select->operand(1)), + &GetInstructionValueSet(select->operand(2))}; + // A phi value is not defined at a kSelect instruction because kSelect does + // not create a new value. Rather it forwards a value from its operands. This + // contrasts with kWhile instruction (which does define a phi value) which has + // in-place update semantics. 
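
Concretely, for a tuple-shaped %sel = select(%pred, %a, %b), the union below merges element values from both operands while the top level keeps the value the select itself defines (a worked illustration):

```cpp
// RecomputeSelectValueSet on tuple-shaped operands, illustrated:
//   %a defines {v_a} at index {0};  %b defines {v_b} at index {0}.
// Resulting value set for %sel:
//   index {}:  {v_sel}      // the select's own top-level pointer vector
//   index {0}: {v_a, v_b}   // either operand's element may flow through
```
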
+ InstructionValueSet new_value_set = InstructionValueSet::Union(inputs); + *new_value_set.mutable_element(/*index=*/{}) = + GetInstructionValueSet(select).element(/*index=*/{}); + return new_value_set; +} + +InstructionValueSet HloDataflowAnalysis::RecomputeTupleValueSet( + HloInstruction* tuple) { + CHECK_EQ(tuple->opcode(), HloOpcode::kTuple); + InstructionValueSet new_value_set(tuple->shape()); + *new_value_set.mutable_element(/*index=*/{}) = + GetInstructionValueSet(tuple).element(/*index=*/{}); + for (int64 i = 0; i < tuple->operands().size(); ++i) { + new_value_set.CopySubtreeFrom(GetInstructionValueSet(tuple->operand(i)), + /*source_base_index=*/{}, + /*target_base_index=*/{i}); + } + return new_value_set; +} + +InstructionValueSet HloDataflowAnalysis::RecomputeWhileValueSet( + HloInstruction* xla_while) { + CHECK_EQ(xla_while->opcode(), HloOpcode::kWhile); + std::vector inputs = { + &GetInstructionValueSet(xla_while->while_body()->root_instruction()), + &GetInstructionValueSet(xla_while->operand(0))}; + if (ssa_form_) { + return Phi(xla_while, inputs); + } else { + return InstructionValueSet::Union(inputs); + } +} + +void HloDataflowAnalysis::UpdateInstructionValueSet( + HloInstruction* instruction) { + // Recompute from operands. + InstructionValueSet& value_set = GetInstructionValueSet(instruction); + switch (instruction->opcode()) { + case HloOpcode::kBitcast: + value_set = RecomputeBitcastValueSet(instruction); + break; + case HloOpcode::kCopy: + value_set = RecomputeCopyValueSet(instruction); + break; + case HloOpcode::kGetTupleElement: + value_set = RecomputeGetTupleElementValueSet(instruction); + break; + case HloOpcode::kSelect: + value_set = RecomputeSelectValueSet(instruction); + break; + case HloOpcode::kTuple: + value_set = RecomputeTupleValueSet(instruction); + break; + case HloOpcode::kParameter: + value_set = RecomputeParameterValueSet(instruction); + break; + case HloOpcode::kCall: + // The output of a kCall instruction is exactly the output of the root of + // the subcomputation. + value_set = + GetInstructionValueSet(instruction->to_apply()->root_instruction()); + break; + case HloOpcode::kWhile: + value_set = RecomputeWhileValueSet(instruction); + break; + default: + // Instruction does not forward HloValues (it defines all values in its + // output). No update is necessary. + return; + } +} + +void HloDataflowAnalysis::UpdateInstructionsAndPropagate( + tensorflow::gtl::ArraySlice instructions) { + std::queue worklist; + for (HloInstruction* instruction : instructions) { + worklist.push(instruction); + } + + while (!worklist.empty()) { + HloInstruction* instruction = worklist.front(); + worklist.pop(); + + VLOG(3) << "Worklist top: " << instruction->name(); + VLOG(3) << ToString(); + + // Save old value for recomputing uses and live out. + InstructionValueSet old_value = GetInstructionValueSet(instruction); + UpdateInstructionValueSet(instruction); + + if (GetInstructionValueSet(instruction) == old_value) { + // No change to the instruction's value set. + VLOG(4) << "No change."; + continue; + } + + VLOG(4) << "New value set for " << instruction->name() << ": " + << GetInstructionValueSet(instruction); + VLOG(4) << "Previously: " << old_value; + + // Instruction value was updated. Add users to work list. + for (HloInstruction* user : instruction->users()) { + worklist.push(user); + + // If user calls a computation, then the respective parameter(s) of the + // computation need to be updated. 
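
For instance, when an updated instruction feeds a kCall, dataflow has to cross the computation boundary in both directions; a worked trace of the pushes below (hypothetical instruction names):

```cpp
// Cross-computation propagation for %c = call(%x, %y), to_apply=f(p0, p1):
// 1. %y's value set changes  -> push f's parameter p1 (operand index 1).
// 2. f's root set changes    -> push %c (a caller callsite of f).
// 3. %c's set changes        -> push %c's users, and so on to a fixpoint.
```
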
+ for (HloComputation* called_computation : user->called_computations()) { + for (int64 operand_number : user->OperandIndices(instruction)) { + worklist.push( + called_computation->parameter_instruction(operand_number)); + } + } + } + + // If instruction is a root instruction, then propagate out to any calling + // instruction and across any while backedge. + if (instruction == instruction->parent()->root_instruction()) { + const CallGraphNode& call_graph_node = + call_graph_->GetNode(instruction->parent()); + for (const CallSite& callsite : call_graph_node.caller_callsites()) { + if (callsite.instruction()->opcode() == HloOpcode::kCall) { + worklist.push(callsite.instruction()); + } else if (callsite.instruction()->opcode() == HloOpcode::kWhile) { + // Add the while itself, and the body and condition parameters. + worklist.push(callsite.instruction()); + worklist.push( + callsite.instruction()->while_body()->parameter_instruction(0)); + worklist.push( + callsite.instruction()->while_condition()->parameter_instruction( + 0)); + } + } + } + + // Update uses. First clear all of the old uses at the particular + // operands. Then add the new uses. There may be overlap between the old + // uses and new uses. + UpdateLocationsOfValuesAt(instruction, GetInstructionValueSet(instruction), + &old_value); + } +} + +InstructionValueSet HloDataflowAnalysis::RecomputeParameterValueSet( + HloInstruction* parameter) { + CHECK_EQ(parameter->opcode(), HloOpcode::kParameter); + const CallGraphNode& call_graph_node = + call_graph_->GetNode(parameter->parent()); + + // Subcomputations called in a parallel context (eg, map) do not have dataflow + // from the caller operands. + if (call_graph_node.context() == CallContext::kParallel || + call_graph_node.caller_callsites().empty()) { + return GetInstructionValueSet(parameter); + } + CHECK_EQ(call_graph_node.context(), CallContext::kSequential); + + std::vector inputs; + bool called_from_while = false; + for (const CallSite& callsite : call_graph_node.caller_callsites()) { + inputs.push_back(&GetInstructionValueSet( + callsite.instruction()->operand(parameter->parameter_number()))); + if (callsite.instruction()->opcode() == HloOpcode::kWhile) { + // In a while instruction, the backedge is also a dataflow input to the + // parameter instruction. This code covers the case where the parameter is + // in the while body or the parameter is in the while condition. + inputs.push_back(&GetInstructionValueSet( + callsite.instruction()->while_body()->root_instruction())); + called_from_while = true; + } + } + + if (ssa_form_ && called_from_while) { + return Phi(parameter, inputs); + } else { + return InstructionValueSet::Union(inputs); + } +} + +const InstructionValueSet& HloDataflowAnalysis::GetInstructionValueSet( + const HloInstruction* instruction) const { + return value_sets_.at(instruction); +} + +InstructionValueSet& HloDataflowAnalysis::GetInstructionValueSet( + const HloInstruction* instruction) { + return value_sets_.at(instruction); +} + +Status HloDataflowAnalysis::InitializeInstructionValueSets() { + for (const std::unique_ptr& computation : + module_->computations()) { + const CallGraphNode& call_graph_node = + call_graph_->GetNode(computation.get()); + for (const std::unique_ptr& instruction : + computation->instructions()) { + // Create an empty shape tree. 
+      value_sets_.emplace(std::piecewise_construct,
+                          std::forward_as_tuple(instruction.get()),
+                          std::forward_as_tuple(instruction->shape()));
+
+      // Lambda to set the value set to define all values in the output of the
+      // instruction.
+      auto define_all_values = [this, &instruction]() {
+        GetInstructionValueSet(instruction.get())
+            .ForEachMutableElement([this, &instruction](
+                                       const ShapeIndex& index,
+                                       HloValueSet* value_set) {
+              *value_set = HloValueSet({NewHloValue(instruction.get(), index)});
+            });
+      };
+
+      // Lambda to set the value set to define only the top-level buffer in the
+      // output of the instruction. Any other values flow from the operands of
+      // the instruction (or from cross-computation dataflow).
+      auto define_top_level_only = [this, &instruction]() {
+        GetValueSet(instruction.get(), /*index=*/{}) =
+            HloValueSet({NewHloValue(instruction.get(), /*index=*/{})});
+      };
+
+      switch (instruction->opcode()) {
+        case HloOpcode::kBitcast:
+          if (bitcast_defines_value_) {
+            define_all_values();
+          }
+          break;
+        case HloOpcode::kCall:
+        case HloOpcode::kWhile:
+        case HloOpcode::kGetTupleElement:
+          // These instructions define no values. The values in their output
+          // flow from their operands or from cross-computation dataflow.
+          break;
+        case HloOpcode::kParameter:
+          if (call_graph_node.caller_callsites().empty() ||
+              call_graph_node.context() == CallContext::kParallel) {
+            // Parameters of computations called in a parallel context (eg, map
+            // and reduce) as well as parameters of dead computations define all
+            // values in their output. Otherwise the values of the parameter
+            // come from the caller (eg, operands to the kCall instruction).
+            define_all_values();
+          } else if (call_graph_node.context() == CallContext::kBoth) {
+            // We do not support a subcomputation that is called from both a
+            // parallel and sequential context. In this case, the parameter
+            // would both define a value and propagate a value from its
+            // caller. This limitation is not really a problem because the call
+            // graph is typically flattened.
+            return Unimplemented(
+                "Computation %s is called in both a parallel (eg, kMap) and "
+                "sequential (eg, kCall) context",
+                computation->name().c_str());
+          }
+          break;
+        case HloOpcode::kCopy:
+        case HloOpcode::kSelect:
+        case HloOpcode::kTuple:
+          // These instructions only define their top-level values. Any other
+          // values flow from their operands.
+          define_top_level_only();
+          break;
+        default:
+          define_all_values();
+          break;
+      }
+      UpdateLocationsOfValuesAt(instruction.get(),
+                                GetInstructionValueSet(instruction.get()));
+    }
+  }
+  return Status::OK();
+}
+
+/* static */
+StatusOr<std::unique_ptr<HloDataflowAnalysis>> HloDataflowAnalysis::Run(
+    HloModule* module, bool ssa_form, bool bitcast_defines_value) {
+  VLOG(1) << "HloDataflowAnalysis::Run on module " << module->name();
+  XLA_VLOG_LINES(2, module->ToString());
+
+  auto dataflow_analysis = WrapUnique(
+      new HloDataflowAnalysis(module, ssa_form, bitcast_defines_value));
+
+  TF_RETURN_IF_ERROR(dataflow_analysis->InitializeInstructionValueSets());
+
+  // Construct list of all instructions to initialize the worklist to propagate
+  // the data flow. For efficiency sort the instructions in post order so
+  // producers appear before consumers.
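
For reference, a caller-side sketch of this factory (the flag values and surrounding code are illustrative; TF_ASSIGN_OR_RETURN assumes an enclosing Status-returning function):

```cpp
// Sketch: run the analysis over a module and dump every value.
TF_ASSIGN_OR_RETURN(std::unique_ptr<HloDataflowAnalysis> dataflow,
                    HloDataflowAnalysis::Run(module, /*ssa_form=*/false,
                                             /*bitcast_defines_value=*/false));
for (const HloValue* value : dataflow->values()) {
  VLOG(1) << value->ToShortString();
}
```
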
+
+/* static */
+StatusOr<std::unique_ptr<HloDataflowAnalysis>> HloDataflowAnalysis::Run(
+    HloModule* module, bool ssa_form, bool bitcast_defines_value) {
+  VLOG(1) << "HloDataflowAnalysis::Run on module " << module->name();
+  XLA_VLOG_LINES(2, module->ToString());
+
+  auto dataflow_analysis = WrapUnique(
+      new HloDataflowAnalysis(module, ssa_form, bitcast_defines_value));
+
+  TF_RETURN_IF_ERROR(dataflow_analysis->InitializeInstructionValueSets());
+
+  // Construct a list of all instructions to initialize the worklist which
+  // propagates the dataflow. For efficiency, sort the instructions in post
+  // order so producers appear before consumers.
+  std::vector<HloInstruction*> all_instructions;
+  for (const HloComputation* computation : module->MakeComputationPostOrder()) {
+    for (HloInstruction* instruction :
+         computation->MakeInstructionPostOrder()) {
+      all_instructions.push_back(instruction);
+    }
+  }
+  dataflow_analysis->UpdateInstructionsAndPropagate(all_instructions);
+
+  VLOG(1) << dataflow_analysis->ToString();
+  return std::move(dataflow_analysis);
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h
new file mode 100644
index 00000000000..2f9b0a64be5
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h
@@ -0,0 +1,399 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Analysis for determining the possible set of values for all locations
+// (instructions and ShapeIndexes) in the HLO module. The analysis is
+// module-scoped, tracking values across computation boundaries.
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_DATAFLOW_ANALYSIS_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_DATAFLOW_ANALYSIS_H_
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/compiler/xla/service/call_graph.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/shape_tree.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+
+// Abstraction which identifies a specific point in the XLA graph. An
+// HloLocation specifies a ShapeIndex within the output of a specific
+// instruction.
+struct HloLocation {
+  HloInstruction* instruction;
+  ShapeIndex index;
+
+  string ToString() const;
+
+  bool operator==(const HloLocation& other) const {
+    return instruction == other.instruction && index == other.index;
+  }
+  bool operator!=(const HloLocation& other) const { return !(*this == other); }
+};
+
+std::ostream& operator<<(std::ostream& out, const HloLocation& location);
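// [Editor's sketch, not part of this change] Because HloLocation is a plain
// value type with operator==, location lists can be searched with standard
// algorithms. A hypothetical helper, assuming this header's types:
//
//   #include <algorithm>
//   #include <vector>
//
//   bool ContainsLocation(const std::vector<HloLocation>& locations,
//                         const HloLocation& target) {
//     // operator== compares the instruction pointer and the ShapeIndex.
//     return std::find(locations.begin(), locations.end(), target) !=
//            locations.end();
//   }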
+
+// Defines a single use of an HLO value.
+struct HloUse {
+  // Instruction at which the value is used.
+  HloInstruction* instruction;
+
+  // The operand number at which the value appears.
+  int64 operand_number;
+
+  // The shape index within the operand at which the value appears.
+  ShapeIndex operand_index;
+
+  string ToString() const;
+
+  bool operator==(const HloUse& other) const {
+    return instruction == other.instruction &&
+           operand_number == other.operand_number &&
+           operand_index == other.operand_index;
+  }
+
+  bool operator!=(const HloUse& other) const { return !(*this == other); }
+};
+
+std::ostream& operator<<(std::ostream& out, const HloUse& use);
+
+// Class describing a value used by the dataflow analysis. XLA arrays are
+// trivially a single HloValue. Tuples are made up of more than one HloValue:
+// an HloValue for the pointer vector, and an HloValue for each child element.
+//
+// Every HloValue is defined by a particular instruction, and most instructions
+// define only a single HloValue. Instructions which define a single HloValue
+// include array-shaped instructions such as Add, but also tuple-shaped
+// instructions such as Tuple. The Tuple instruction defines a single HloValue
+// which is a vector of pointers to the values containing the Tuple
+// instruction's operands. Though the result of the Tuple instruction includes
+// multiple values, only the top-level HloValue (the vector of pointers) is
+// defined by the Tuple instruction. The values containing the tuple elements
+// are defined by earlier instructions, usually the operands of the Tuple
+// instruction.
+//
+// Instructions which construct both the tuple *and* the tuple elements define
+// more than one HloValue. This includes (at least) tuple-shaped Constant,
+// Parameter, Infeed and While instructions. These tuple-shaped instructions
+// do not assemble a tuple from existing HloValues like the Tuple instruction
+// does, but rather define all the HloValues in the tuple.
+class HloValue {
+ public:
+  using Id = int64;
+
+  // Construct an HloValue defined by 'instruction' at shape index 'index'. If
+  // is_phi is true, then this value is a phi value, for example, at the
+  // parameter of a while body computation. Phi values are only used in the SSA
+  // dataflow analysis (HloDataflowAnalysis::ssa_form_ is true).
+  HloValue(HloValue::Id id, HloInstruction* instruction,
+           const ShapeIndex& index, bool is_phi = false);
+
+  // Return a unique identifier for this HloValue. This value is used for
+  // stable sorting and iteration.
+  Id id() const { return id_; }
+
+  // Returns whether this value is a phi value.
+  bool is_phi() const { return is_phi_; }
+
+  // Return the location where this value is defined.
+  const HloLocation& DefinitionLocation() const { return locations_[0]; }
+
+  // Return the instruction which defines this HloValue.
+  HloInstruction* instruction() const {
+    return DefinitionLocation().instruction;
+  }
+
+  // Return the shape index at which this HloValue is defined in the output of
+  // instruction().
+  const ShapeIndex& index() const { return DefinitionLocation().index; }
+
+  // Add or remove a location at which the HloValue appears. The definition
+  // location cannot be removed. The uses of the HloValue are updated.
+  void AddLocation(HloInstruction* instruction, const ShapeIndex& index);
+  void RemoveLocation(HloInstruction* instruction, const ShapeIndex& index);
+
+  // Return all locations of the HloValue in the module.
+  const std::vector<HloLocation>& locations() const { return locations_; }
+
+  // Return all uses of the HloValue.
+  const std::vector<HloUse>& uses() const { return uses_; }
+
+  // Set/get whether this HloValue is live out of the module.
+  bool live_out_of_module() const { return live_out_of_module_; }
+
+  bool operator==(const HloValue& other) const;
+  bool operator!=(const HloValue& other) const;
+
+  // Return a single-line string representation of the value.
+  string ToShortString() const;
+
+  string ToString(int indent = 0) const;
+
+ private:
+  // Unique identifier for this HloValue. Used for stable sorting and
+  // iteration.
+  const Id id_;
+
+  // Whether this instruction is a phi value.
+  const bool is_phi_;
+
+  // The set of locations of this HloValue. The first element is always the
+  // location of the definition.
+  std::vector<HloLocation> locations_;
+
+  // The set of uses of this HloValue.
+  std::vector<HloUse> uses_;
+
+  // Whether this value is live out of the HLO module.
+  bool live_out_of_module_ = false;
+};
+
+std::ostream& operator<<(std::ostream& out, const HloValue& hlo_value);
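// [Editor's sketch, toy code, not the XLA implementation] The invariant that
// locations_[0] is the definition can be enforced by construction; all names
// here are hypothetical:
//
//   #include <cassert>
//   #include <cstddef>
//   #include <vector>
//
//   struct ToyLocation {
//     int instruction_id;  // stand-in for (instruction, index)
//     bool operator==(const ToyLocation& o) const {
//       return instruction_id == o.instruction_id;
//     }
//   };
//
//   class ToyValue {
//    public:
//     explicit ToyValue(ToyLocation definition) : locations_{definition} {}
//     void AddLocation(ToyLocation loc) { locations_.push_back(loc); }
//     void RemoveLocation(ToyLocation loc) {
//       // The definition (the front element) must never be removed.
//       assert(!(loc == locations_.front()));
//       for (std::size_t i = 1; i < locations_.size(); ++i) {
//         if (locations_[i] == loc) {
//           locations_.erase(locations_.begin() + i);
//           return;
//         }
//       }
//     }
//    private:
//     std::vector<ToyLocation> locations_;  // locations_[0] is the definition
//   };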
+
+// A class representing the possible set of HloValues at a particular point
+// (shape index in the output of an instruction) in the XLA graph. This set
+// contains the set of reaching HloValue definitions. For a simple array-shaped
+// instruction like Add, the HloValueSet of the top-level of the instruction's
+// output trivially contains only the HloValue defined by the instruction. For
+// instructions which have non-trivial dataflow such as Tuple or Select, the
+// HloValueSets of the instruction's output contain one or more HloValues
+// defined by the instruction's operands or defined further up in the XLA
+// graph.
+class HloValueSet {
+ public:
+  HloValueSet() = default;
+
+  explicit HloValueSet(tensorflow::gtl::ArraySlice<HloValue::Id> value_ids)
+      : value_ids_(value_ids.begin(), value_ids.end()) {
+    SortAndUniquifyValues();
+  }
+
+  // Return the union of the given HloValueSets.
+  static HloValueSet Union(
+      tensorflow::gtl::ArraySlice<const HloValueSet*> inputs);
+
+  // Return the vector of the IDs of all HloValues in the set. Values in the
+  // vector are unique and sorted.
+  const std::vector<HloValue::Id>& value_ids() const { return value_ids_; }
+
+  // Return the unique HLO value in the set. CHECKs if the set does not contain
+  // exactly one value.
+  HloValue::Id GetUniqueValueId() const {
+    CHECK_EQ(value_ids().size(), 1);
+    return value_ids()[0];
+  }
+
+  bool operator==(const HloValueSet& other) const {
+    return value_ids() == other.value_ids();
+  }
+  bool operator!=(const HloValueSet& other) const { return !(*this == other); }
+
+  string ToString() const;
+
+ private:
+  // Sorts value_ids_ and removes duplicates. This should be called after
+  // adding any elements to value_ids_.
+  void SortAndUniquifyValues();
+
+  // HloValues sorted by HloValue::Id.
+  std::vector<HloValue::Id> value_ids_;
+};
+
+std::ostream& operator<<(std::ostream& out, const HloValueSet& hlo_value);
+
+// A class collecting the HloValues which might be contained in the output of
+// an HLO instruction. For array-shaped instructions, an InstructionValueSet
+// trivially holds a single HloValueSet. Tuple-shaped InstructionValueSets
+// hold multiple HloValueSets.
+class InstructionValueSet : public ShapeTree<HloValueSet> {
+ public:
+  InstructionValueSet(const Shape& shape) : ShapeTree<HloValueSet>(shape) {}
+
+  // Return the union of the given InstructionValueSets.
+  static InstructionValueSet Union(
+      tensorflow::gtl::ArraySlice<const InstructionValueSet*> inputs);
+
+  string ToString() const;
+};
+
+std::ostream& operator<<(std::ostream& out,
+                         const InstructionValueSet& instruction_value_set);
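// [Editor's sketch, toy code, not the XLA implementation] With the sorted,
// duplicate-free vector representation above, set union reduces to
// std::set_union:
//
//   #include <algorithm>
//   #include <cstdint>
//   #include <iterator>
//   #include <vector>
//
//   using ToyId = int64_t;  // stand-in for HloValue::Id
//
//   std::vector<ToyId> UnionOfSortedIds(const std::vector<ToyId>& a,
//                                       const std::vector<ToyId>& b) {
//     std::vector<ToyId> result;
//     // Inputs are sorted and unique, so the output is as well.
//     std::set_union(a.begin(), a.end(), b.begin(), b.end(),
//                    std::back_inserter(result));
//     return result;
//   }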
+
+// Analysis which identifies all HLO values and their uses in an HLO module.
+class HloDataflowAnalysis {
+ public:
+  // Run dataflow analysis on the given module. Parameters:
+  //
+  //   ssa_form : If true then new values are defined at the merge points of
+  //     kWhile instructions. Abusing nomenclature somewhat, we call these "phi
+  //     values". The merge is formed by the init value and loop backedge. The
+  //     SSA form is minimal in that a new phi value is defined only if the
+  //     merge point is reachable by multiple different values. The SSA form is
+  //     also in loop-closed form in that no values defined inside of a loop
+  //     (while body) are used outside of the loop.
+  //
+  //     If ssa_form is false, then merge points do not define new
+  //     values. Rather, the HloValueSet for the merge point contains the union
+  //     of the merged HloValues.
+  //
+  //   bitcast_defines_value : If true then the Bitcast HLO instruction defines
+  //     a new HLO value in the analysis. If false then Bitcast forwards the
+  //     value of its operand.
+  static StatusOr<std::unique_ptr<HloDataflowAnalysis>> Run(
+      HloModule* module, bool ssa_form = false,
+      bool bitcast_defines_value = false);
+
+  // Returns true if 'instruction' defines an HLO value at the given shape
+  // index of its output.
+  bool ValueIsDefinedAt(const HloInstruction* instruction,
+                        const ShapeIndex& index = {}) const;
+
+  // Return the HloValue defined by 'instruction' at the given shape index of
+  // its output.
+  //
+  // Precondition: ValueIsDefinedAt is true for this instruction and index.
+  const HloValue& GetValueDefinedAt(const HloInstruction* instruction,
+                                    const ShapeIndex& index = {}) const;
+  HloValue& GetValueDefinedAt(const HloInstruction* instruction,
+                              const ShapeIndex& index = {});
+
+  // Return the InstructionValueSet for the given instruction.
+  const InstructionValueSet& GetInstructionValueSet(
+      const HloInstruction* instruction) const;
+  InstructionValueSet& GetInstructionValueSet(
+      const HloInstruction* instruction);
+
+  // Return the HloValueSet for the given instruction at the given index.
+  const HloValueSet& GetValueSet(const HloInstruction* instruction,
+                                 const ShapeIndex& index = {}) const;
+  HloValueSet& GetValueSet(const HloInstruction* instruction,
+                           const ShapeIndex& index = {});
+
+  // Return the unique value in the HloValueSet at the given instruction and
+  // shape index. CHECKs if the value set does not contain exactly one value.
+  const HloValue& GetUniqueValueAt(const HloInstruction* instruction,
+                                   const ShapeIndex& index = {}) const {
+    return GetValue(GetValueSet(instruction, index).GetUniqueValueId());
+  }
+  HloValue& GetUniqueValueAt(const HloInstruction* instruction,
+                             const ShapeIndex& index = {}) {
+    return GetValue(GetValueSet(instruction, index).GetUniqueValueId());
+  }
+
+  // Return the HloValue with the given Id.
+  const HloValue& GetValue(HloValue::Id value_id) const;
+  HloValue& GetValue(HloValue::Id value_id);
+
+  // Return the total number of HloValues.
+  int64 value_count() const { return values_.size(); }
+
+  // Return a vector of all HloValues stably sorted by HloValue::Id. This
+  // vector is lazily computed. Mutating operations on HloDataflowAnalysis may
+  // invalidate the underlying vector, requiring recomputation.
+  const std::vector<const HloValue*>& values() const;
+
+  string ToString() const;
+
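// [Editor's sketch, hypothetical usage, not from this change] A pass might
// consume the public API above like this, assuming TF_ASSIGN_OR_RETURN from
// tensorflow/compiler/xla/status_macros.h and a valid HloModule* 'module';
// the helper name is made up for illustration:
//
//   Status LogLiveOutValues(HloModule* module) {
//     TF_ASSIGN_OR_RETURN(std::unique_ptr<HloDataflowAnalysis> dataflow,
//                         HloDataflowAnalysis::Run(module, /*ssa_form=*/true));
//     for (const HloValue* value : dataflow->values()) {
//       if (value->live_out_of_module()) {
//         VLOG(1) << "Live out: " << value->ToShortString();
//       }
//     }
//     return Status::OK();
//   }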
+ protected:
+  HloDataflowAnalysis(HloModule* module, bool ssa_form,
+                      bool bitcast_defines_value = false);
+
+  // Creates a new HloValue defined at the given instruction and shape index,
+  // and returns its ID.
+  HloValue::Id NewHloValue(HloInstruction* instruction, const ShapeIndex& index,
+                           bool is_phi = false);
+
+  // Delete the HloValue with the given ID.
+  void DeleteHloValue(HloValue::Id value_id);
+
+  // Constructs and initializes the InstructionValueSets of all instructions to
+  // contain exactly the HloValues defined by each instruction. These values
+  // can then be propagated throughout the HLO graph by calling
+  // UpdateInstructionsAndPropagate.
+  Status InitializeInstructionValueSets();
+
+  // Updates the value set of the given instruction based on the values flowing
+  // into the instruction (operands and cross-computation dataflow).
+  void UpdateInstructionValueSet(HloInstruction* instruction);
+
+  // Recomputes and returns the value set for the given instruction.
+  InstructionValueSet RecomputeBitcastValueSet(HloInstruction* bitcast);
+  InstructionValueSet RecomputeCopyValueSet(HloInstruction* copy);
+  InstructionValueSet RecomputeGetTupleElementValueSet(HloInstruction* gte);
+  InstructionValueSet RecomputeParameterValueSet(HloInstruction* parameter);
+  InstructionValueSet RecomputeSelectValueSet(HloInstruction* select);
+  InstructionValueSet RecomputeTupleValueSet(HloInstruction* tuple);
+  InstructionValueSet RecomputeWhileValueSet(HloInstruction* xla_while);
+
+  // Update the value sets of the given instructions and propagate the
+  // changes to fixed point.
+  void UpdateInstructionsAndPropagate(
+      tensorflow::gtl::ArraySlice<HloInstruction*> instructions);
+
+  // Return the result of the SSA Phi function applied to the given inputs at
+  // the given instruction. If skip_top_level is true, then the top level of
+  // the value set of 'instruction' is not modified.
+  InstructionValueSet Phi(
+      HloInstruction* instruction,
+      tensorflow::gtl::ArraySlice<const InstructionValueSet*> inputs,
+      bool skip_top_level = false);
+
+  // Updates the locations of the HloValues in the output of the given
+  // instruction. This should be called after the instruction value set of
+  // 'instruction' has been changed. 'prev_value_set' must point to the
+  // previous state of the value set prior to the change. 'prev_value_set' may
+  // be null if this is the first time locations are being computed. The
+  // previous state is necessary to efficiently remove locations which have
+  // been eliminated due to changes in the instructions' InstructionValueSets.
+  void UpdateLocationsOfValuesAt(
+      HloInstruction* instruction, const InstructionValueSet& new_value_set,
+      const InstructionValueSet* prev_value_set = nullptr);
+
+  HloModule* const module_;
+  const bool ssa_form_;
+  const bool bitcast_defines_value_;
+
+  std::unique_ptr<CallGraph> call_graph_;
+
+  // The map of all HloValues in the module.
+  std::unordered_map<HloValue::Id, HloValue> values_;
+
+  // A map from instruction to InstructionValueSet.
+  std::unordered_map<const HloInstruction*, InstructionValueSet> value_sets_;
+
+  // A lazily constructed vector containing all HloValues sorted by
+  // HloValue::Id.
+  mutable std::vector<const HloValue*> values_vector_;
+
+  // The Id to use for the next HloValue.
+  HloValue::Id next_value_id_ = 0;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_DATAFLOW_ANALYSIS_H_
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
new file mode 100644
index 00000000000..21344af5f22
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
@@ -0,0 +1,1134 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h"
+
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/instruction_fusion.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace {
+
+using ::testing::UnorderedElementsAre;
+
+// Test is parameterized on a bool which indicates whether the dataflow
+// analysis is performed with SSA form.
+class HloDataflowAnalysisTest : public HloTestBase,
+                                public ::testing::WithParamInterface<bool> {
+ protected:
+  HloDataflowAnalysisTest() : module_(TestName()) {}
+
+  // Run dataflow analysis on the member module. For convenience returns a
+  // reference to the generated analysis stored in analysis_.
+  const HloDataflowAnalysis& RunAnalysis(bool ssa_form,
+                                         bool bitcast_defines_value = false) {
+    analysis_ =
+        HloDataflowAnalysis::Run(&module_, ssa_form, bitcast_defines_value)
+            .ConsumeValueOrDie();
+    return *analysis_;
+  }
+
+  // Return a vector of the HloValues at the given program location.
+  std::vector<HloValue> HloValuesAt(const HloInstruction* instruction,
+                                    const ShapeIndex& index = {}) {
+    CHECK(analysis_ != nullptr);
+    std::vector<HloValue> values;
+    for (HloValue::Id value_id :
+         analysis_->GetValueSet(instruction, index).value_ids()) {
+      values.push_back(analysis_->GetValue(value_id));
+    }
+    return values;
+  }
+
+  HloModule module_;
+  std::unique_ptr<HloDataflowAnalysis> analysis_;
+
+  const Shape scalar_shape_ = ShapeUtil::MakeShape(F32, {});
+};
+
+TEST_P(HloDataflowAnalysisTest, BinaryOperation) {
+  // Test the dataflow for a simple binary operation (Add).
+  auto builder = HloComputation::Builder(TestName());
+  auto constant1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+  auto constant2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+  auto add = builder.AddInstruction(HloInstruction::CreateBinary(
+      scalar_shape_, HloOpcode::kAdd, constant1, constant2));
+  module_.AddEntryComputation(builder.Build());
+
+  bool ssa_form = GetParam();
+  const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
+
+  // Each instruction should define a single value.
+  EXPECT_EQ(analysis.values().size(), 3);
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(constant1));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(constant2));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(add));
+
+  // Verify the locations of the values. These locations are all trivial
+  // because there are no instructions which forward values.
+ EXPECT_THAT(analysis.GetValueDefinedAt(constant1).locations(), + UnorderedElementsAre(HloLocation{constant1, {}})); + EXPECT_THAT(analysis.GetValueDefinedAt(constant2).locations(), + UnorderedElementsAre(HloLocation{constant2, {}})); + EXPECT_THAT(analysis.GetValueDefinedAt(add).locations(), + UnorderedElementsAre(HloLocation{add, {}})); + + // Verify the uses of the values. + EXPECT_THAT(analysis.GetValueDefinedAt(constant1).uses(), + UnorderedElementsAre(HloUse{add, 0, {}})); + EXPECT_THAT(analysis.GetValueDefinedAt(constant2).uses(), + UnorderedElementsAre(HloUse{add, 1, {}})); + EXPECT_TRUE(analysis.GetValueDefinedAt(add).uses().empty()); + + // Verify liveout values from the module. + EXPECT_FALSE(analysis.GetValueDefinedAt(constant1).live_out_of_module()); + EXPECT_FALSE(analysis.GetValueDefinedAt(constant2).live_out_of_module()); + EXPECT_TRUE(analysis.GetValueDefinedAt(add).live_out_of_module()); +} + +TEST_P(HloDataflowAnalysisTest, TupleAndGtes) { + // Verify the dataflow through a Tuple and GetTupleElement instructions. + auto builder = HloComputation::Builder(TestName()); + auto param0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, scalar_shape_, "param0")); + auto param1 = builder.AddInstruction( + HloInstruction::CreateParameter(1, scalar_shape_, "param1")); + auto tuple = + builder.AddInstruction(HloInstruction::CreateTuple({param0, param1})); + auto gte0 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, tuple, 0)); + auto gte1 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, tuple, 1)); + auto add = builder.AddInstruction( + HloInstruction::CreateBinary(scalar_shape_, HloOpcode::kAdd, gte0, gte1)); + module_.AddEntryComputation(builder.Build()); + + bool ssa_form = GetParam(); + const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form); + + // The two params, tuple, and add should each define one value. + EXPECT_EQ(analysis.values().size(), 4); + + EXPECT_TRUE(analysis.ValueIsDefinedAt(param0)); + EXPECT_TRUE(analysis.ValueIsDefinedAt(param1)); + EXPECT_TRUE(analysis.ValueIsDefinedAt(tuple, /*index=*/{})); + EXPECT_FALSE(analysis.ValueIsDefinedAt(tuple, /*index=*/{0})); + EXPECT_FALSE(analysis.ValueIsDefinedAt(tuple, /*index=*/{1})); + EXPECT_FALSE(analysis.ValueIsDefinedAt(gte0)); + EXPECT_FALSE(analysis.ValueIsDefinedAt(gte1)); + EXPECT_TRUE(analysis.ValueIsDefinedAt(add)); + + // Verify the locations of the values. + EXPECT_THAT( + analysis.GetValueDefinedAt(param0).locations(), + UnorderedElementsAre(HloLocation{param0, {}}, HloLocation{tuple, {0}}, + HloLocation{gte0, {}})); + EXPECT_THAT( + analysis.GetValueDefinedAt(param1).locations(), + UnorderedElementsAre(HloLocation{param1, {}}, HloLocation{tuple, {1}}, + HloLocation{gte1, {}})); + EXPECT_THAT(analysis.GetValueDefinedAt(tuple).locations(), + UnorderedElementsAre(HloLocation{tuple, {}})); + + // Verify uses. Of interest is that a GetTupleElement instruction is only a + // use of the top-level value in the tuple operand. 
+ EXPECT_THAT(analysis.GetValueDefinedAt(param0).uses(), + UnorderedElementsAre(HloUse{tuple, 0, {}}, HloUse{add, 0, {}})); + EXPECT_THAT(analysis.GetValueDefinedAt(param1).uses(), + UnorderedElementsAre(HloUse{tuple, 1, {}}, HloUse{add, 1, {}})); + EXPECT_THAT(analysis.GetValueDefinedAt(tuple, /*index=*/{}).uses(), + UnorderedElementsAre(HloUse{gte0, 0, {}}, HloUse{gte1, 0, {}})); + EXPECT_TRUE(analysis.GetValueDefinedAt(add).live_out_of_module()); +} + +TEST_P(HloDataflowAnalysisTest, NestedTuple) { + // Verify the dataflow through a nested tuple of the following form for two + // constants %constant1 and %constant2: + // + // %nested_tuple = {{%constant1, %constant2}, + // {%constant1, %constant2}, + // %constant1} + // + auto builder = HloComputation::Builder(TestName()); + auto constant1 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); + auto constant2 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); + auto tuple = builder.AddInstruction( + HloInstruction::CreateTuple({constant1, constant2})); + auto nested_tuple = builder.AddInstruction( + HloInstruction::CreateTuple({tuple, tuple, constant1})); + auto gte_tuple = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(tuple->shape(), nested_tuple, 1)); + auto gte_out = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, gte_tuple, 0)); + module_.AddEntryComputation(builder.Build()); + + bool ssa_form = GetParam(); + const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form); + + EXPECT_EQ(analysis.values().size(), 4); + + // Verify locations and uses. + EXPECT_THAT( + analysis.GetValueDefinedAt(constant1).locations(), + UnorderedElementsAre( + HloLocation{constant1, {}}, HloLocation{tuple, {0}}, + HloLocation{nested_tuple, {0, 0}}, HloLocation{nested_tuple, {1, 0}}, + HloLocation{nested_tuple, {2}}, HloLocation{gte_tuple, {0}}, + HloLocation{gte_out, {}})); + EXPECT_THAT(analysis.GetValueDefinedAt(constant1).uses(), + UnorderedElementsAre( + HloUse{tuple, 0, {}}, HloUse{nested_tuple, 0, {0}}, + HloUse{nested_tuple, 1, {0}}, HloUse{nested_tuple, 2, {}})); + EXPECT_THAT( + analysis.GetValueDefinedAt(constant2).uses(), + UnorderedElementsAre(HloUse{tuple, 1, {}}, HloUse{nested_tuple, 0, {1}}, + HloUse{nested_tuple, 1, {1}})); + EXPECT_THAT(analysis.GetValueDefinedAt(tuple, /*index=*/{}).uses(), + UnorderedElementsAre(HloUse{nested_tuple, 0, {}}, + HloUse{nested_tuple, 1, {}}, + HloUse{gte_out, 0, {}})); + EXPECT_THAT(analysis.GetValueDefinedAt(nested_tuple, /*index=*/{}).uses(), + UnorderedElementsAre(HloUse{gte_tuple, 0, {}})); + + EXPECT_TRUE(analysis.GetValueDefinedAt(constant1).live_out_of_module()); + EXPECT_FALSE(analysis.GetValueDefinedAt(constant2).live_out_of_module()); + EXPECT_FALSE( + analysis.GetValueDefinedAt(tuple, /*index=*/{}).live_out_of_module()); + EXPECT_FALSE(analysis.GetValueDefinedAt(nested_tuple, /*index=*/{}) + .live_out_of_module()); +} + +TEST_P(HloDataflowAnalysisTest, SingleCall) { + // Test a single call of a subcomputation. The subcomputation adds its two + // array-shaped parameters. 
+ auto subbuilder = HloComputation::Builder("Subcomputation"); + auto subparam0 = subbuilder.AddInstruction( + HloInstruction::CreateParameter(0, scalar_shape_, "param0")); + auto subparam1 = subbuilder.AddInstruction( + HloInstruction::CreateParameter(1, scalar_shape_, "param1")); + auto add = subbuilder.AddInstruction(HloInstruction::CreateBinary( + scalar_shape_, HloOpcode::kAdd, subparam0, subparam1)); + HloComputation* called_computation = + module_.AddEmbeddedComputation(subbuilder.Build()); + + auto builder = HloComputation::Builder(TestName()); + auto constant1 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); + auto constant2 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); + auto call = builder.AddInstruction(HloInstruction::CreateCall( + scalar_shape_, {constant1, constant2}, called_computation)); + module_.AddEntryComputation(builder.Build()); + + bool ssa_form = GetParam(); + const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form); + + EXPECT_EQ(analysis.values().size(), 3); + + // The parameters of the subcomputation and the call instruction itself should + // not define values. Their values flow from elsewhere. + EXPECT_TRUE(analysis.ValueIsDefinedAt(constant1)); + EXPECT_TRUE(analysis.ValueIsDefinedAt(constant2)); + EXPECT_FALSE(analysis.ValueIsDefinedAt(subparam0)); + EXPECT_FALSE(analysis.ValueIsDefinedAt(subparam1)); + EXPECT_TRUE(analysis.ValueIsDefinedAt(add)); + EXPECT_FALSE(analysis.ValueIsDefinedAt(call)); + + EXPECT_EQ(analysis.GetUniqueValueAt(subparam0), + analysis.GetValueDefinedAt(constant1)); + EXPECT_EQ(analysis.GetUniqueValueAt(subparam1), + analysis.GetValueDefinedAt(constant2)); + EXPECT_EQ(analysis.GetUniqueValueAt(call), analysis.GetValueDefinedAt(add)); + + EXPECT_THAT(analysis.GetValueDefinedAt(constant1).uses(), + UnorderedElementsAre(HloUse{add, 0, {}}, HloUse{call, 0, {}})); + EXPECT_THAT(analysis.GetValueDefinedAt(constant2).uses(), + UnorderedElementsAre(HloUse{add, 1, {}}, HloUse{call, 1, {}})); + + EXPECT_TRUE(analysis.GetValueDefinedAt(add).live_out_of_module()); +} + +TEST_P(HloDataflowAnalysisTest, ComputationCalledTwiceWithSameArguments) { + // Test a subcomputation which is called twice with identical values. 
+ auto subbuilder = HloComputation::Builder("Subcomputation"); + auto subparam0 = subbuilder.AddInstruction( + HloInstruction::CreateParameter(0, scalar_shape_, "param0")); + auto subparam1 = subbuilder.AddInstruction( + HloInstruction::CreateParameter(1, scalar_shape_, "param1")); + auto add = subbuilder.AddInstruction(HloInstruction::CreateBinary( + scalar_shape_, HloOpcode::kAdd, subparam0, subparam1)); + HloComputation* called_computation = + module_.AddEmbeddedComputation(subbuilder.Build()); + + auto builder = HloComputation::Builder(TestName()); + auto constant1 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); + auto constant2 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); + auto call1 = builder.AddInstruction(HloInstruction::CreateCall( + scalar_shape_, {constant1, constant2}, called_computation)); + auto call2 = builder.AddInstruction(HloInstruction::CreateCall( + scalar_shape_, {constant1, constant2}, called_computation)); + auto sub = builder.AddInstruction(HloInstruction::CreateBinary( + scalar_shape_, HloOpcode::kSubtract, call1, call2)); + module_.AddEntryComputation(builder.Build()); + + bool ssa_form = GetParam(); + const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form); + + EXPECT_EQ(analysis.values().size(), 4); + + // Definitions should be identical to the single callsite case. + EXPECT_TRUE(analysis.ValueIsDefinedAt(constant1)); + EXPECT_TRUE(analysis.ValueIsDefinedAt(constant2)); + EXPECT_FALSE(analysis.ValueIsDefinedAt(subparam0)); + EXPECT_FALSE(analysis.ValueIsDefinedAt(subparam1)); + EXPECT_TRUE(analysis.ValueIsDefinedAt(add)); + EXPECT_FALSE(analysis.ValueIsDefinedAt(call1)); + EXPECT_FALSE(analysis.ValueIsDefinedAt(call2)); + EXPECT_TRUE(analysis.ValueIsDefinedAt(sub)); + + EXPECT_THAT(analysis.GetValueDefinedAt(constant1).uses(), + UnorderedElementsAre(HloUse{add, 0, {}}, HloUse{call1, 0, {}}, + HloUse{call2, 0, {}})); + EXPECT_THAT(analysis.GetValueDefinedAt(constant2).uses(), + UnorderedElementsAre(HloUse{add, 1, {}}, HloUse{call1, 1, {}}, + HloUse{call2, 1, {}})); + // The Add from the subcomputation is used as both operands of the Subtract. + EXPECT_THAT(analysis.GetValueDefinedAt(add).uses(), + UnorderedElementsAre(HloUse{sub, 0, {}}, HloUse{sub, 1, {}})); + + EXPECT_FALSE(analysis.GetValueDefinedAt(add).live_out_of_module()); + EXPECT_TRUE(analysis.GetValueDefinedAt(sub).live_out_of_module()); +} + +TEST_P(HloDataflowAnalysisTest, ComputationCalledTwiceWithDifferentArguments) { + // Test a subcomputation which is called twice with different argument values. 
+  auto subbuilder = HloComputation::Builder("Subcomputation");
+  auto subparam0 = subbuilder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape_, "param0"));
+  auto subparam1 = subbuilder.AddInstruction(
+      HloInstruction::CreateParameter(1, scalar_shape_, "param1"));
+  auto add = subbuilder.AddInstruction(HloInstruction::CreateBinary(
+      scalar_shape_, HloOpcode::kAdd, subparam0, subparam1));
+  HloComputation* called_computation =
+      module_.AddEmbeddedComputation(subbuilder.Build());
+
+  auto builder = HloComputation::Builder(TestName());
+  auto constant1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+  auto constant2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+  auto call1 = builder.AddInstruction(HloInstruction::CreateCall(
+      scalar_shape_, {constant1, constant2}, called_computation));
+  auto call2 = builder.AddInstruction(HloInstruction::CreateCall(
+      scalar_shape_, {call1, constant2}, called_computation));
+  module_.AddEntryComputation(builder.Build());
+
+  bool ssa_form = GetParam();
+  const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
+
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(call1));
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(call2));
+
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(subparam0));
+
+  EXPECT_THAT(HloValuesAt(subparam0),
+              UnorderedElementsAre(analysis.GetValueDefinedAt(constant1),
+                                   analysis.GetValueDefinedAt(add)));
+  EXPECT_THAT(HloValuesAt(subparam1),
+              UnorderedElementsAre(analysis.GetValueDefinedAt(constant2)));
+
+  EXPECT_TRUE(analysis.GetValueDefinedAt(add).live_out_of_module());
+}
+
+TEST_P(HloDataflowAnalysisTest, NestedCalls) {
+  // Test a module with nested computations. HLO is:
+  //
+  // F32[] inner_computation(F32[] %param0, F32[] %param1):
+  //   %add = Add(%param0, %param1)
+  //
+  // F32[] outer_computation(F32[] %param0, F32[] %param1):
+  //   ;; Note that parameters are interchanged in the call.
+  //   %nested_call = Call(inner_computation, {%param1, %param0})
+  //
+  // F32[] entry:
+  //   %constant1 = Constant(1.0)
+  //   %constant2 = Constant(2.0)
+  //   %call = Call(outer_computation, {%constant1, %constant2})
+  //
+  auto inner_builder = HloComputation::Builder("InnerComputation");
+  auto inner_param0 = inner_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape_, "param0"));
+  auto inner_param1 = inner_builder.AddInstruction(
+      HloInstruction::CreateParameter(1, scalar_shape_, "param1"));
+  auto add = inner_builder.AddInstruction(HloInstruction::CreateBinary(
+      scalar_shape_, HloOpcode::kAdd, inner_param0, inner_param1));
+  HloComputation* inner_computation =
+      module_.AddEmbeddedComputation(inner_builder.Build());
+
+  auto outer_builder = HloComputation::Builder("OuterComputation");
+  auto outer_param0 = outer_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape_, "param0"));
+  auto outer_param1 = outer_builder.AddInstruction(
+      HloInstruction::CreateParameter(1, scalar_shape_, "param1"));
+  // Swizzle parameters.
+ auto nested_call = outer_builder.AddInstruction(HloInstruction::CreateCall( + scalar_shape_, {outer_param1, outer_param0}, inner_computation)); + HloComputation* outer_computation = + module_.AddEmbeddedComputation(outer_builder.Build()); + + auto builder = HloComputation::Builder(TestName()); + auto constant1 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); + auto constant2 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); + auto call = builder.AddInstruction(HloInstruction::CreateCall( + scalar_shape_, {constant1, constant2}, outer_computation)); + module_.AddEntryComputation(builder.Build()); + + bool ssa_form = GetParam(); + const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form); + + // Only three values should be defined. Most instructions just pass through + // their operand values. + EXPECT_EQ(analysis.values().size(), 3); + + // Verify that the uses of the constants are properly swizzled by parameter + // permutation in nested_call. + EXPECT_THAT( + analysis.GetValueDefinedAt(constant1).uses(), + UnorderedElementsAre(HloUse{call, 0, {}}, HloUse{nested_call, 1, {}}, + HloUse{add, 1, {}})); + EXPECT_THAT( + analysis.GetValueDefinedAt(constant2).uses(), + UnorderedElementsAre(HloUse{call, 1, {}}, HloUse{nested_call, 0, {}}, + HloUse{add, 0, {}})); + + EXPECT_TRUE(analysis.GetValueDefinedAt(add).live_out_of_module()); +} + +TEST_P(HloDataflowAnalysisTest, SingleWhile) { + // Test a simple single while instruction. The while body includes a + // pass-through value. HLO: + // + // body((F32[], F32[]) %tuple_param): + // %add = Add(%tuple_param{0}, %tuple_param{1}) + // return Tuple(%tuple_param{0}, %add) + // + // condition((F32[], F32[]) %tuple_param): + // return Constant(false) + // + // entry: + // %constant1 = Constant(1.0) + // %constant2 = Constant(2.0) + // %tuple = Tuple(%constant1, %constant2) + // return While(%tuple, body, condition) + // + const Shape tuple_shape = + ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_}); + + // Element 0 passes transparently through the body. + auto body_builder = HloComputation::Builder("body"); + auto body_param = body_builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape, "param")); + auto body_element_0 = body_builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, body_param, 0)); + auto body_element_1 = body_builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, body_param, 1)); + auto add = body_builder.AddInstruction(HloInstruction::CreateBinary( + scalar_shape_, HloOpcode::kAdd, body_element_0, body_element_1)); + auto body_tuple = body_builder.AddInstruction( + HloInstruction::CreateTuple({body_element_0, add})); + HloComputation* body = module_.AddEmbeddedComputation(body_builder.Build()); + + // Condition computation trivially returns a constant "false". 
+ auto cond_builder = HloComputation::Builder("condition"); + auto cond_param = cond_builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape, "param")); + cond_builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); + HloComputation* condition = + module_.AddEmbeddedComputation(cond_builder.Build()); + + auto builder = HloComputation::Builder(TestName()); + auto constant1 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); + auto constant2 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); + auto tuple = builder.AddInstruction( + HloInstruction::CreateTuple({constant1, constant2})); + auto xla_while = builder.AddInstruction( + HloInstruction::CreateWhile(tuple_shape, condition, body, tuple)); + module_.AddEntryComputation(builder.Build()); + + bool ssa_form = GetParam(); + const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form); + + if (ssa_form) { + // Element 0 of the tuple passed through the body so no phi value is + // defined. + EXPECT_FALSE(analysis.ValueIsDefinedAt(xla_while, /*index=*/{0})); + EXPECT_FALSE(analysis.ValueIsDefinedAt(body_param, /*index=*/{0})); + EXPECT_FALSE(analysis.ValueIsDefinedAt(cond_param, /*index=*/{0})); + + // Element 1 of the tuple should be a phi value. + EXPECT_TRUE(analysis.ValueIsDefinedAt(xla_while, /*index=*/{1})); + EXPECT_TRUE(analysis.GetValueDefinedAt(xla_while, /*index=*/{1}).is_phi()); + EXPECT_TRUE(analysis.ValueIsDefinedAt(body_param, /*index=*/{1})); + EXPECT_TRUE(analysis.GetValueDefinedAt(body_param, /*index=*/{1}).is_phi()); + EXPECT_TRUE(analysis.ValueIsDefinedAt(cond_param, /*index=*/{1})); + EXPECT_TRUE(analysis.GetValueDefinedAt(cond_param, /*index=*/{1}).is_phi()); + + EXPECT_THAT(analysis.GetValueDefinedAt(constant1).uses(), + UnorderedElementsAre(HloUse{add, 0, {}}, HloUse{tuple, 0, {}}, + HloUse{xla_while, 0, {0}}, + HloUse{body_tuple, 0, {}})); + + // Constant1 passes through the body and out of the module. + EXPECT_TRUE(analysis.GetValueDefinedAt(constant1).live_out_of_module()); + EXPECT_TRUE(analysis.GetValueDefinedAt(xla_while, /*index=*/{1}) + .live_out_of_module()); + } else { + // While instruction and subcomputation parameters should not define values + // in non-ssa form. + EXPECT_FALSE(analysis.ValueIsDefinedAt(xla_while, /*index=*/{0})); + EXPECT_FALSE(analysis.ValueIsDefinedAt(xla_while, /*index=*/{1})); + EXPECT_FALSE(analysis.ValueIsDefinedAt(body_param, /*index=*/{0})); + EXPECT_FALSE(analysis.ValueIsDefinedAt(body_param, /*index=*/{1})); + EXPECT_FALSE(analysis.ValueIsDefinedAt(cond_param, /*index=*/{0})); + EXPECT_FALSE(analysis.ValueIsDefinedAt(cond_param, /*index=*/{1})); + + EXPECT_TRUE(analysis.GetValueDefinedAt(constant1).live_out_of_module()); + EXPECT_TRUE(analysis.GetValueDefinedAt(add).live_out_of_module()); + } +} + +TEST_P(HloDataflowAnalysisTest, SequentialWhiles) { + // Test sequential while instructions. The while body includes a + // pass-through value. 
HLO:
+  //
+  // body((F32[], F32[]) %tuple_param):
+  //   %add = Add(%tuple_param{0}, %tuple_param{1})
+  //   return Tuple(%tuple_param{0}, %add)
+  //
+  // condition((F32[], F32[]) %tuple_param):
+  //   return Constant(false)
+  //
+  // entry:
+  //   %constant1 = Constant(1.0)
+  //   %constant2 = Constant(2.0)
+  //   %tuple = Tuple(%constant1, %constant2)
+  //   %while0 = While(%tuple, body, condition)
+  //   %while1 = While(%while0, body, condition)
+  //   return While(%while1, body, condition)
+  //
+  const Shape tuple_shape =
+      ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
+
+  // Element 0 passes transparently through the body.
+  auto body_builder = HloComputation::Builder("body");
+  auto body_param = body_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "param"));
+  auto body_element_0 = body_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, body_param, 0));
+  auto body_element_1 = body_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, body_param, 1));
+  auto add = body_builder.AddInstruction(HloInstruction::CreateBinary(
+      scalar_shape_, HloOpcode::kAdd, body_element_0, body_element_1));
+  body_builder.AddInstruction(
+      HloInstruction::CreateTuple({body_element_0, add}));
+  HloComputation* body = module_.AddEmbeddedComputation(body_builder.Build());
+
+  auto cond_builder = HloComputation::Builder("condition");
+  cond_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "param"));
+  cond_builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+  HloComputation* condition =
+      module_.AddEmbeddedComputation(cond_builder.Build());
+
+  auto builder = HloComputation::Builder(TestName());
+  auto constant1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+  auto constant2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+  auto tuple = builder.AddInstruction(
+      HloInstruction::CreateTuple({constant1, constant2}));
+  auto xla_while0 = builder.AddInstruction(
+      HloInstruction::CreateWhile(tuple_shape, condition, body, tuple));
+  auto xla_while1 = builder.AddInstruction(
+      HloInstruction::CreateWhile(tuple_shape, condition, body, xla_while0));
+  auto xla_while2 = builder.AddInstruction(
+      HloInstruction::CreateWhile(tuple_shape, condition, body, xla_while1));
+  module_.AddEntryComputation(builder.Build());
+
+  bool ssa_form = GetParam();
+  const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
+
+  // Element 0 is passed through all the while instructions and out of the
+  // module.
+  EXPECT_EQ(analysis.GetUniqueValueAt(xla_while0, /*index=*/{0}),
+            analysis.GetValueDefinedAt(constant1));
+  EXPECT_EQ(analysis.GetUniqueValueAt(xla_while1, /*index=*/{0}),
+            analysis.GetValueDefinedAt(constant1));
+  EXPECT_EQ(analysis.GetUniqueValueAt(xla_while2, /*index=*/{0}),
+            analysis.GetValueDefinedAt(constant1));
+  EXPECT_TRUE(analysis.GetValueDefinedAt(constant1).live_out_of_module());
+}
+
+TEST_P(HloDataflowAnalysisTest, NestedWhiles) {
+  // Test nested while instructions. The inner body passes through element 0
+  // of its parameter, and the outer body passes through element 1.
HLO: + // + // inner_body((F32[], F32[]) %tuple_param): + // %add = Add(%tuple_param{0}, %tuple_param{1}) + // return Tuple(%tuple_param{0}, %add) + // + // outer_body((F32[], F32[]) %tuple_param): + // %negate = Negate(%tuple_param{0}) + // %tuple = Tuple(%negate, %tuple_param{1}) + // return While(%tuple, inner_body, condition) + // + // entry: + // %constant1 = Constant(1.0) + // %constant2 = Constant(2.0) + // %tuple = Tuple(%constant1, %constant2) + // return While(%tuple, outer_body, condition) + // + const Shape tuple_shape = + ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_}); + + auto cond_builder = HloComputation::Builder("condition"); + cond_builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape, "param")); + cond_builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); + HloComputation* condition = + module_.AddEmbeddedComputation(cond_builder.Build()); + + // Element 0 passes transparently through the body. + auto inner_builder = HloComputation::Builder("inner_body"); + auto inner_param = inner_builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape, "param")); + auto inner_element_0 = inner_builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, inner_param, 0)); + auto inner_element_1 = inner_builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, inner_param, 1)); + auto add = inner_builder.AddInstruction(HloInstruction::CreateBinary( + scalar_shape_, HloOpcode::kAdd, inner_element_0, inner_element_1)); + inner_builder.AddInstruction( + HloInstruction::CreateTuple({inner_element_0, add})); + HloComputation* inner_body = + module_.AddEmbeddedComputation(inner_builder.Build()); + + // Element 1 passes transparently through the body. 
+  auto outer_builder = HloComputation::Builder("outer_body");
+  auto outer_param = outer_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "param"));
+  auto outer_element_0 = outer_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, outer_param, 0));
+  auto negate = outer_builder.AddInstruction(HloInstruction::CreateUnary(
+      scalar_shape_, HloOpcode::kNegate, outer_element_0));
+  auto outer_element_1 = outer_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, outer_param, 1));
+  auto outer_tuple = outer_builder.AddInstruction(
+      HloInstruction::CreateTuple({negate, outer_element_1}));
+  auto nested_while = outer_builder.AddInstruction(HloInstruction::CreateWhile(
+      tuple_shape, condition, inner_body, outer_tuple));
+  HloComputation* outer_body =
+      module_.AddEmbeddedComputation(outer_builder.Build());
+
+  auto builder = HloComputation::Builder(TestName());
+  auto constant1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+  auto constant2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+  auto tuple = builder.AddInstruction(
+      HloInstruction::CreateTuple({constant1, constant2}));
+  auto entry_while = builder.AddInstruction(
+      HloInstruction::CreateWhile(tuple_shape, condition, outer_body, tuple));
+  module_.AddEntryComputation(builder.Build());
+
+  bool ssa_form = GetParam();
+  const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
+
+  EXPECT_THAT(HloValuesAt(inner_param, /*index=*/{0}),
+              UnorderedElementsAre(analysis.GetValueDefinedAt(negate)));
+  if (ssa_form) {
+    EXPECT_TRUE(analysis.ValueIsDefinedAt(inner_param, /*index=*/{1}));
+    EXPECT_TRUE(
+        analysis.GetValueDefinedAt(inner_param, /*index=*/{1}).is_phi());
+
+    // Element 0 of the nested while is %negate.
+    EXPECT_FALSE(analysis.ValueIsDefinedAt(nested_while, /*index=*/{0}));
+    EXPECT_THAT(HloValuesAt(inner_param, /*index=*/{0}),
+                UnorderedElementsAre(analysis.GetValueDefinedAt(negate)));
+    // Element 1 is a phi value (join of %add and %constant2).
+    EXPECT_TRUE(analysis.ValueIsDefinedAt(nested_while, /*index=*/{1}));
+    EXPECT_TRUE(
+        analysis.GetValueDefinedAt(nested_while, /*index=*/{1}).is_phi());
+
+    EXPECT_TRUE(analysis.ValueIsDefinedAt(entry_while, /*index=*/{0}));
+    EXPECT_TRUE(
+        analysis.GetValueDefinedAt(entry_while, /*index=*/{0}).is_phi());
+
+    EXPECT_TRUE(analysis.ValueIsDefinedAt(entry_while, /*index=*/{1}));
+    EXPECT_TRUE(
+        analysis.GetValueDefinedAt(entry_while, /*index=*/{1}).is_phi());
+  } else {
+    EXPECT_THAT(HloValuesAt(inner_param, /*index=*/{1}),
+                UnorderedElementsAre(analysis.GetValueDefinedAt(add),
+                                     analysis.GetValueDefinedAt(constant2)));
+
+    EXPECT_THAT(HloValuesAt(nested_while, /*index=*/{0}),
+                UnorderedElementsAre(analysis.GetValueDefinedAt(negate)));
+    EXPECT_THAT(HloValuesAt(nested_while, /*index=*/{1}),
+                UnorderedElementsAre(analysis.GetValueDefinedAt(add),
+                                     analysis.GetValueDefinedAt(constant2)));
+
+    EXPECT_THAT(HloValuesAt(entry_while, /*index=*/{0}),
+                UnorderedElementsAre(analysis.GetValueDefinedAt(negate),
+                                     analysis.GetValueDefinedAt(constant1)));
+    EXPECT_THAT(HloValuesAt(entry_while, /*index=*/{1}),
+                UnorderedElementsAre(analysis.GetValueDefinedAt(add),
+                                     analysis.GetValueDefinedAt(constant2)));
+  }
+}
+
+TEST_P(HloDataflowAnalysisTest, SwizzlingWhile) {
+  // Test a while instruction with a body which permutes its tuple parameter
+  // elements.
HLO: + // + // body((F32[], F32[]) %tuple_param): + // return Tuple(%tuple_param{1}, %tuple_param{0}) + // + // condition((F32[], F32[]) %tuple_param): + // return Constant(false) + // + // entry: + // %constant1 = Constant(1.0) + // %constant2 = Constant(2.0) + // %tuple = Tuple(%constant1, %constant2) + // return While(%tuple, body, condition) + // + const Shape tuple_shape = + ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_}); + + auto body_builder = HloComputation::Builder("body"); + auto body_param = body_builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape, "param")); + auto body_element_0 = body_builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, body_param, 0)); + auto body_element_1 = body_builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, body_param, 1)); + body_builder.AddInstruction( + HloInstruction::CreateTuple({body_element_1, body_element_0})); + HloComputation* body = module_.AddEmbeddedComputation(body_builder.Build()); + + auto cond_builder = HloComputation::Builder("condition"); + auto cond_param = cond_builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape, "param")); + cond_builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); + HloComputation* condition = + module_.AddEmbeddedComputation(cond_builder.Build()); + + auto builder = HloComputation::Builder(TestName()); + auto constant1 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); + auto constant2 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); + auto tuple = builder.AddInstruction( + HloInstruction::CreateTuple({constant1, constant2})); + auto xla_while = builder.AddInstruction( + HloInstruction::CreateWhile(tuple_shape, condition, body, tuple)); + module_.AddEntryComputation(builder.Build()); + + bool ssa_form = GetParam(); + const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form); + + if (ssa_form) { + // Element 0 and 1 in the while should both be phi values. + EXPECT_TRUE(analysis.ValueIsDefinedAt(body_param, /*index=*/{0})); + EXPECT_TRUE(analysis.GetValueDefinedAt(body_param, /*index=*/{0}).is_phi()); + EXPECT_TRUE(analysis.ValueIsDefinedAt(body_param, /*index=*/{1})); + EXPECT_TRUE(analysis.GetValueDefinedAt(body_param, /*index=*/{1}).is_phi()); + + EXPECT_TRUE(analysis.ValueIsDefinedAt(xla_while, /*index=*/{0})); + EXPECT_TRUE(analysis.GetValueDefinedAt(xla_while, /*index=*/{0}).is_phi()); + EXPECT_TRUE(analysis.ValueIsDefinedAt(xla_while, /*index=*/{1})); + EXPECT_TRUE(analysis.GetValueDefinedAt(xla_while, /*index=*/{1}).is_phi()); + + EXPECT_TRUE(analysis.ValueIsDefinedAt(cond_param, /*index=*/{0})); + EXPECT_TRUE(analysis.GetValueDefinedAt(cond_param, /*index=*/{0}).is_phi()); + EXPECT_TRUE(analysis.ValueIsDefinedAt(cond_param, /*index=*/{1})); + EXPECT_TRUE(analysis.GetValueDefinedAt(cond_param, /*index=*/{1}).is_phi()); + + EXPECT_FALSE(analysis.GetValueDefinedAt(constant1).live_out_of_module()); + EXPECT_FALSE(analysis.GetValueDefinedAt(constant2).live_out_of_module()); + EXPECT_TRUE(analysis.GetValueDefinedAt(xla_while, /*index=*/{}) + .live_out_of_module()); + EXPECT_TRUE(analysis.GetValueDefinedAt(xla_while, /*index=*/{0}) + .live_out_of_module()); + EXPECT_TRUE(analysis.GetValueDefinedAt(xla_while, /*index=*/{1}) + .live_out_of_module()); + } else { + // Elements 0 and 1 have both constants as reaching definitions. 
+    EXPECT_THAT(HloValuesAt(xla_while, /*index=*/{0}),
+                UnorderedElementsAre(analysis.GetValueDefinedAt(constant1),
+                                     analysis.GetValueDefinedAt(constant2)));
+    EXPECT_THAT(HloValuesAt(xla_while, /*index=*/{1}),
+                UnorderedElementsAre(analysis.GetValueDefinedAt(constant1),
+                                     analysis.GetValueDefinedAt(constant2)));
+    EXPECT_TRUE(analysis.GetValueDefinedAt(constant1).live_out_of_module());
+    EXPECT_TRUE(analysis.GetValueDefinedAt(constant2).live_out_of_module());
+  }
+}
+
+TEST_P(HloDataflowAnalysisTest, ArraySelect) {
+  // Test a kSelect of an array value.
+  auto builder = HloComputation::Builder(TestName());
+  auto pred = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+  auto constant1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+  auto constant2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+  auto select = builder.AddInstruction(HloInstruction::CreateTernary(
+      scalar_shape_, HloOpcode::kSelect, pred, constant1, constant2));
+
+  module_.AddEntryComputation(builder.Build());
+
+  bool ssa_form = GetParam();
+  const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
+
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(select));
+  EXPECT_FALSE(analysis.GetValueDefinedAt(constant1).live_out_of_module());
+  EXPECT_FALSE(analysis.GetValueDefinedAt(constant2).live_out_of_module());
+  EXPECT_TRUE(analysis.GetValueDefinedAt(select).live_out_of_module());
+}
+
+TEST_P(HloDataflowAnalysisTest, TupleSelect) {
+  // Test a kSelect of a tuple value. Non-top-level elements flow through the
+  // instruction.
+  auto builder = HloComputation::Builder(TestName());
+  auto pred = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+  auto constant1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+  auto constant2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+  auto constant3 = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(3.0)));
+  auto constant4 = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(4.0)));
+  auto tuple1 =
+      builder.AddInstruction(HloInstruction::CreateTuple({constant1}));
+  auto tuple2 =
+      builder.AddInstruction(HloInstruction::CreateTuple({constant2}));
+  auto tuple3 =
+      builder.AddInstruction(HloInstruction::CreateTuple({constant3}));
+  auto tuple4 =
+      builder.AddInstruction(HloInstruction::CreateTuple({constant4}));
+  const Shape tuple_shape = tuple1->shape();
+  auto select11 = builder.AddInstruction(HloInstruction::CreateTernary(
+      tuple_shape, HloOpcode::kSelect, pred, tuple1, tuple1));
+  auto select12 = builder.AddInstruction(HloInstruction::CreateTernary(
+      tuple_shape, HloOpcode::kSelect, pred, tuple1, tuple2));
+  auto select34 = builder.AddInstruction(HloInstruction::CreateTernary(
+      tuple_shape, HloOpcode::kSelect, pred, tuple3, tuple4));
+  auto select1234 = builder.AddInstruction(HloInstruction::CreateTernary(
+      tuple_shape, HloOpcode::kSelect, pred, select12, select34));
+
+  module_.AddEntryComputation(builder.Build());
+
+  bool ssa_form = GetParam();
+  const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
+
+  // The top-level value is always defined by a kSelect.
+ EXPECT_TRUE(analysis.ValueIsDefinedAt(select11)); + EXPECT_TRUE(analysis.ValueIsDefinedAt(select12)); + EXPECT_TRUE(analysis.ValueIsDefinedAt(select34)); + EXPECT_TRUE(analysis.ValueIsDefinedAt(select1234)); + + EXPECT_FALSE(analysis.ValueIsDefinedAt(select11, /*index=*/{0})); + EXPECT_FALSE(analysis.ValueIsDefinedAt(select12, /*index=*/{0})); + EXPECT_FALSE(analysis.ValueIsDefinedAt(select34, /*index=*/{0})); + EXPECT_FALSE(analysis.ValueIsDefinedAt(select1234, /*index=*/{0})); + + EXPECT_THAT(HloValuesAt(select11, /*index=*/{0}), + UnorderedElementsAre(analysis.GetValueDefinedAt(constant1))); + EXPECT_THAT(HloValuesAt(select12, /*index=*/{0}), + UnorderedElementsAre(analysis.GetValueDefinedAt(constant1), + analysis.GetValueDefinedAt(constant2))); + EXPECT_THAT(HloValuesAt(select34, /*index=*/{0}), + UnorderedElementsAre(analysis.GetValueDefinedAt(constant3), + analysis.GetValueDefinedAt(constant4))); + EXPECT_THAT(HloValuesAt(select1234, /*index=*/{0}), + UnorderedElementsAre(analysis.GetValueDefinedAt(constant1), + analysis.GetValueDefinedAt(constant2), + analysis.GetValueDefinedAt(constant3), + analysis.GetValueDefinedAt(constant4))); + + EXPECT_THAT( + analysis.GetValueDefinedAt(constant1).uses(), + UnorderedElementsAre(HloUse{tuple1, 0, {}}, HloUse{select11, 1, {0}}, + HloUse{select11, 2, {0}}, HloUse{select12, 1, {0}}, + HloUse{select1234, 1, {0}})); + EXPECT_THAT( + analysis.GetValueDefinedAt(constant2).uses(), + UnorderedElementsAre(HloUse{tuple2, 0, {}}, HloUse{select12, 2, {0}}, + HloUse{select1234, 1, {0}})); +} + +TEST_P(HloDataflowAnalysisTest, NestedTupleSelect) { + // Test kSelect of a nested tuple. + auto builder = HloComputation::Builder(TestName()); + auto pred = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); + auto constant1 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); + auto constant2 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); + auto constant3 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(3.0))); + auto constant4 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(4.0))); + auto constant5 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(5.0))); + auto inner_tuple1 = builder.AddInstruction( + HloInstruction::CreateTuple({constant2, constant3})); + auto tuple1 = builder.AddInstruction( + HloInstruction::CreateTuple({constant1, inner_tuple1})); + auto inner_tuple2 = builder.AddInstruction( + HloInstruction::CreateTuple({constant5, constant3})); + auto tuple2 = builder.AddInstruction( + HloInstruction::CreateTuple({constant4, inner_tuple2})); + auto select = builder.AddInstruction(HloInstruction::CreateTernary( + tuple1->shape(), HloOpcode::kSelect, pred, tuple1, tuple2)); + + module_.AddEntryComputation(builder.Build()); + + bool ssa_form = GetParam(); + const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form); + + EXPECT_TRUE(analysis.ValueIsDefinedAt(select)); + + EXPECT_THAT(HloValuesAt(select, /*index=*/{0}), + UnorderedElementsAre(analysis.GetValueDefinedAt(constant1), + analysis.GetValueDefinedAt(constant4))); + EXPECT_THAT(HloValuesAt(select, /*index=*/{1}), + UnorderedElementsAre(analysis.GetValueDefinedAt(inner_tuple1), + analysis.GetValueDefinedAt(inner_tuple2))); + EXPECT_THAT(HloValuesAt(select, /*index=*/{1, 0}), + UnorderedElementsAre(analysis.GetValueDefinedAt(constant2), + analysis.GetValueDefinedAt(constant5))); + 
+  EXPECT_THAT(HloValuesAt(select, /*index=*/{1, 1}),
+              UnorderedElementsAre(analysis.GetValueDefinedAt(constant3)));
+}
+
+TEST_P(HloDataflowAnalysisTest, TupleSelectToWhile) {
+  // Test a tuple-shaped kSelect feeding a kWhile instruction. HLO:
+  //
+  // body((F32[], F32[]) %tuple_param):
+  //   %add = Add(%tuple_param{0}, %tuple_param{1})
+  //   return Tuple(%tuple_param{0}, %add)
+  //
+  // condition((F32[], F32[]) %tuple_param):
+  //   return Constant(false)
+  //
+  // entry:
+  //   %constant1 = Constant(1.0)
+  //   %constant2 = Constant(2.0)
+  //   %constant3 = Constant(3.0)
+  //   %tuple1 = Tuple(%constant1)
+  //   %tuple2 = Tuple(%constant2)
+  //   %select = Select(%pred, %tuple1, %tuple2)
+  //   %gte = GetTupleElement(%select, 0)
+  //   %tuple = Tuple(%gte, %constant3)
+  //   return While(%tuple, body, condition)
+  //
+  auto builder = HloComputation::Builder(TestName());
+
+  const Shape tuple_shape =
+      ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
+
+  // Element 0 passes transparently through the body.
+  auto body_builder = HloComputation::Builder("body");
+  auto body_param = body_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "param"));
+  auto body_element_0 = body_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, body_param, 0));
+  auto body_element_1 = body_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, body_param, 1));
+  auto add = body_builder.AddInstruction(HloInstruction::CreateBinary(
+      scalar_shape_, HloOpcode::kAdd, body_element_0, body_element_1));
+  body_builder.AddInstruction(
+      HloInstruction::CreateTuple({body_element_0, add}));
+  HloComputation* body = module_.AddEmbeddedComputation(body_builder.Build());
+
+  auto cond_builder = HloComputation::Builder("condition");
+  cond_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "param"));
+  cond_builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+  HloComputation* condition =
+      module_.AddEmbeddedComputation(cond_builder.Build());
+
+  auto pred = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+  auto constant1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+  auto constant2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+  auto constant3 = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(3.0)));
+  auto tuple1 =
+      builder.AddInstruction(HloInstruction::CreateTuple({constant1}));
+  auto tuple2 =
+      builder.AddInstruction(HloInstruction::CreateTuple({constant2}));
+  auto select = builder.AddInstruction(HloInstruction::CreateTernary(
+      tuple1->shape(), HloOpcode::kSelect, pred, tuple1, tuple2));
+  auto gte = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, select, 0));
+  auto tuple =
+      builder.AddInstruction(HloInstruction::CreateTuple({gte, constant3}));
+  auto xla_while = builder.AddInstruction(
+      HloInstruction::CreateWhile(tuple->shape(), condition, body, tuple));
+
+  module_.AddEntryComputation(builder.Build());
+
+  bool ssa_form = GetParam();
+  const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
+
+  if (ssa_form) {
+    EXPECT_TRUE(analysis.ValueIsDefinedAt(xla_while, /*index=*/{0}));
+    EXPECT_TRUE(analysis.GetValueDefinedAt(xla_while, /*index=*/{0}).is_phi());
+    EXPECT_TRUE(analysis.ValueIsDefinedAt(xla_while, /*index=*/{1}));
+    EXPECT_TRUE(analysis.GetValueDefinedAt(xla_while, /*index=*/{1}).is_phi());
+
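+    // In SSA form the values reaching the kWhile from the init operand and
+    // from the loop backedge are merged into phi values, analogous to phi
+    // nodes in SSA-based compiler IRs.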
EXPECT_FALSE(analysis.ValueIsDefinedAt(select, /*index=*/{0})); + + EXPECT_FALSE(analysis.GetValueDefinedAt(constant1).live_out_of_module()); + EXPECT_FALSE(analysis.GetValueDefinedAt(constant2).live_out_of_module()); + EXPECT_FALSE(analysis.GetValueDefinedAt(constant3).live_out_of_module()); + EXPECT_TRUE(analysis.GetValueDefinedAt(xla_while, /*index=*/{1}) + .live_out_of_module()); + } else { + EXPECT_THAT(HloValuesAt(gte), + UnorderedElementsAre(analysis.GetValueDefinedAt(constant1), + analysis.GetValueDefinedAt(constant2))); + EXPECT_THAT(HloValuesAt(xla_while, /*index=*/{0}), + UnorderedElementsAre(analysis.GetValueDefinedAt(constant1), + analysis.GetValueDefinedAt(constant2))); + EXPECT_THAT(HloValuesAt(xla_while, /*index=*/{1}), + UnorderedElementsAre(analysis.GetValueDefinedAt(add), + analysis.GetValueDefinedAt(constant3))); + EXPECT_TRUE(analysis.GetValueDefinedAt(constant1).live_out_of_module()); + EXPECT_TRUE(analysis.GetValueDefinedAt(constant2).live_out_of_module()); + EXPECT_TRUE(analysis.GetValueDefinedAt(constant3).live_out_of_module()); + } +} + +TEST_P(HloDataflowAnalysisTest, BitcastDefinesValue) { + // Test the bitcast_defines_value flag to the dataflow analysis. + auto builder = HloComputation::Builder(TestName()); + auto constant = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); + auto bitcast = builder.AddInstruction(HloInstruction::CreateUnary( + scalar_shape_, HloOpcode::kBitcast, constant)); + + module_.AddEntryComputation(builder.Build()); + + bool ssa_form = GetParam(); + { + const HloDataflowAnalysis& analysis = + RunAnalysis(ssa_form, /*bitcast_defines_value=*/true); + + EXPECT_EQ(analysis.values().size(), 2); + + EXPECT_TRUE(analysis.ValueIsDefinedAt(constant)); + EXPECT_TRUE(analysis.ValueIsDefinedAt(bitcast)); + EXPECT_FALSE(analysis.GetValueDefinedAt(constant).live_out_of_module()); + EXPECT_TRUE(analysis.GetValueDefinedAt(bitcast).live_out_of_module()); + } + { + const HloDataflowAnalysis& analysis = + RunAnalysis(ssa_form, /*bitcast_defines_value=*/false); + EXPECT_EQ(analysis.values().size(), 1); + + EXPECT_TRUE(analysis.ValueIsDefinedAt(constant)); + EXPECT_FALSE(analysis.ValueIsDefinedAt(bitcast)); + EXPECT_TRUE(analysis.GetValueDefinedAt(constant).live_out_of_module()); + } +} + +TEST_P(HloDataflowAnalysisTest, TupleCopy) { + // Test that a tuple-shaped copy only copies (defines) the top-level value. 
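+  // The leaf values (param0 and param1) flow through the copy unchanged, so
+  // the analysis should find exactly four values in total: two parameters,
+  // the tuple, and the copy.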
+ auto builder = HloComputation::Builder(TestName()); + auto param0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, scalar_shape_, "param0")); + auto param1 = builder.AddInstruction( + HloInstruction::CreateParameter(1, scalar_shape_, "param1")); + auto tuple = + builder.AddInstruction(HloInstruction::CreateTuple({param0, param1})); + auto copy = builder.AddInstruction( + HloInstruction::CreateUnary(tuple->shape(), HloOpcode::kCopy, tuple)); + module_.AddEntryComputation(builder.Build()); + + bool ssa_form = GetParam(); + const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form); + + EXPECT_EQ(analysis.values().size(), 4); + + EXPECT_TRUE(analysis.ValueIsDefinedAt(param0)); + EXPECT_TRUE(analysis.ValueIsDefinedAt(param1)); + EXPECT_TRUE(analysis.ValueIsDefinedAt(tuple, /*index=*/{})); + EXPECT_FALSE(analysis.ValueIsDefinedAt(tuple, /*index=*/{0})); + EXPECT_FALSE(analysis.ValueIsDefinedAt(tuple, /*index=*/{1})); + EXPECT_TRUE(analysis.ValueIsDefinedAt(copy, /*index=*/{})); + EXPECT_FALSE(analysis.ValueIsDefinedAt(copy, /*index=*/{0})); + EXPECT_FALSE(analysis.ValueIsDefinedAt(copy, /*index=*/{1})); + + EXPECT_THAT(HloValuesAt(copy, /*index=*/{0}), + UnorderedElementsAre(analysis.GetValueDefinedAt(param0))); + EXPECT_THAT(HloValuesAt(copy, /*index=*/{1}), + UnorderedElementsAre(analysis.GetValueDefinedAt(param1))); + EXPECT_TRUE( + analysis.GetValueDefinedAt(copy, /*index=*/{}).live_out_of_module()); +} + +INSTANTIATE_TEST_CASE_P(HloDataflowAnalysisInstantiation, + HloDataflowAnalysisTest, + ::testing::Values(false, true)); + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_dce.cc b/tensorflow/compiler/xla/service/hlo_dce.cc index fdfbbf8baf6..3755b9e4c00 100644 --- a/tensorflow/compiler/xla/service/hlo_dce.cc +++ b/tensorflow/compiler/xla/service/hlo_dce.cc @@ -52,7 +52,7 @@ StatusOr HloDCE::Run(HloModule* module) { for (auto& instruction : computation->instructions()) { if (instruction->user_count() == 0 && live_instructions.count(instruction.get()) == 0 && - HloComputation::IsRemovable(instruction->opcode())) { + computation->IsRemovable(instruction.get())) { dead_roots.push_back(instruction.get()); } } diff --git a/tensorflow/compiler/xla/service/hlo_dce_test.cc b/tensorflow/compiler/xla/service/hlo_dce_test.cc index dcd9e00c56c..10cd7ca7c09 100644 --- a/tensorflow/compiler/xla/service/hlo_dce_test.cc +++ b/tensorflow/compiler/xla/service/hlo_dce_test.cc @@ -30,6 +30,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/tests/test_utils.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/types.h" namespace xla { @@ -50,7 +51,7 @@ TEST_F(HloDceTest, NoDeadCode) { builder.AddInstruction(HloInstruction::CreateBinary( constant1->shape(), HloOpcode::kAdd, constant1, constant2)); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(3, computation->instruction_count()); @@ -80,7 +81,7 @@ TEST_F(HloDceTest, DeadParameters) { builder.AddInstruction(HloInstruction::CreateUnary( live_param->shape(), HloOpcode::kNegate, live_param)); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(5, computation->instruction_count()); @@ -93,5 +94,69 @@ TEST_F(HloDceTest, DeadParameters) { EXPECT_EQ(0, dead_param1->user_count()); } +TEST_F(HloDceTest, ControlDependencies) { + // Verify that instructions with control dependencies are not removed. + auto builder = HloComputation::Builder(TestName()); + auto constant1 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); + auto constant2 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(123.0f))); + + // Create two dead instructions: a negate and an add. + auto dead_negate = builder.AddInstruction(HloInstruction::CreateUnary( + constant1->shape(), HloOpcode::kNegate, constant1)); + auto dead_add = builder.AddInstruction(HloInstruction::CreateBinary( + constant1->shape(), HloOpcode::kAdd, constant1, constant2)); + + // Create the same two instructions again, but these will have a control + // dependency added. + auto dead_negate_with_control_dep = + builder.AddInstruction(HloInstruction::CreateUnary( + constant1->shape(), HloOpcode::kNegate, constant1)); + auto dead_add_with_control_dep = + builder.AddInstruction(HloInstruction::CreateBinary( + constant1->shape(), HloOpcode::kAdd, constant1, constant2)); + + // Create a root so the previously added instruction is dead. + builder.AddInstruction(HloInstruction::CreateBinary( + constant1->shape(), HloOpcode::kAdd, constant1, constant2)); + + auto module = CreateNewModule(); + auto computation = module->AddEntryComputation(builder.Build()); + + // Add a control dependency between two instructions. + TF_ASSERT_OK(dead_negate_with_control_dep->AddControlDependencyTo( + dead_add_with_control_dep)); + + // Returns whether the given instruction exists in the test computation. 
+ auto has_instruction = [computation](const HloInstruction* instruction) { + for (auto& inst : computation->instructions()) { + if (inst.get() == instruction) { + return true; + } + } + return false; + }; + + EXPECT_EQ(7, computation->instruction_count()); + EXPECT_TRUE(has_instruction(dead_negate)); + EXPECT_TRUE(has_instruction(dead_add)); + EXPECT_TRUE(has_instruction(dead_negate_with_control_dep)); + EXPECT_TRUE(has_instruction(dead_add_with_control_dep)); + + HloDCE dce; + EXPECT_TRUE(dce.Run(module.get()).ValueOrDie()); + + EXPECT_EQ(5, computation->instruction_count()); + EXPECT_FALSE(has_instruction(dead_negate)); + EXPECT_FALSE(has_instruction(dead_add)); + EXPECT_TRUE(has_instruction(dead_negate_with_control_dep)); + EXPECT_TRUE(has_instruction(dead_add_with_control_dep)); +} + } // namespace } // namespace xla + +int main(int argc, char** argv) { + return xla::ParseDebugOptionsFlagsAndRunTests(argc, argv); +} diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc new file mode 100644 index 00000000000..3e7f5b1f3d9 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc @@ -0,0 +1,791 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+#include "tensorflow/compiler/xla/service/hlo_evaluator.h"
+
+#include <algorithm>
+#include <cmath>
+#include <functional>
+#include <memory>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/compiler/xla/index_util.h"
+#include "tensorflow/compiler/xla/layout_util.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/map_util.h"
+#include "tensorflow/compiler/xla/primitive_util.h"
+#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_query.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/core/bitmap.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+
+namespace {
+
+template <typename OperandT>
+StatusOr<std::unique_ptr<Literal>> Compare(const Shape& shape, HloOpcode opcode,
+                                           const Literal& lhs_literal,
+                                           const Literal& rhs_literal) {
+  std::function<bool(OperandT, OperandT)> compare_op;
+  switch (opcode) {
+    case HloOpcode::kEq:
+      compare_op = [](OperandT lhs_el, OperandT rhs_el) {
+        return lhs_el == rhs_el;
+      };
+      break;
+    case HloOpcode::kNe:
+      compare_op = [](OperandT lhs_el, OperandT rhs_el) {
+        return lhs_el != rhs_el;
+      };
+      break;
+    case HloOpcode::kGe:
+      compare_op = [](OperandT lhs_el, OperandT rhs_el) {
+        return lhs_el >= rhs_el;
+      };
+      break;
+    case HloOpcode::kGt:
+      compare_op = [](OperandT lhs_el, OperandT rhs_el) {
+        return lhs_el > rhs_el;
+      };
+      break;
+    case HloOpcode::kLe:
+      compare_op = [](OperandT lhs_el, OperandT rhs_el) {
+        return lhs_el <= rhs_el;
+      };
+      break;
+    case HloOpcode::kLt:
+      compare_op = [](OperandT lhs_el, OperandT rhs_el) {
+        return lhs_el < rhs_el;
+      };
+      break;
+    default:
+      LOG(FATAL) << "unhandled HLO opcode for conversion to Comparison: "
+                 << HloOpcodeString(opcode);
+  }
+
+  auto result = LiteralUtil::CreateFromShape(shape);
+  TF_RETURN_IF_ERROR(LiteralUtil::Populate<bool>(
+      result.get(), [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
+        return compare_op(LiteralUtil::Get<OperandT>(lhs_literal, multi_index),
+                          LiteralUtil::Get<OperandT>(rhs_literal, multi_index));
+      }));
+
+  return std::move(result);
+}
+
+template <typename ReturnT, typename NativeT>
+StatusOr<std::unique_ptr<Literal>> ElementWiseUnaryOpImpl(
+    HloInstruction* instruction,
+    const std::function<ReturnT(NativeT)>& unary_op,
+    const Literal& operand_literal) {
+  const auto shape = instruction->shape();
+  const auto* operand = instruction->operand(0);
+
+  // TODO(b/35950897, b/27796129): add DCHECK back once implicit broadcast is
+  // removed.
+  if (!ShapeUtil::SameDimensions(shape, operand->shape())) {
+    return Unimplemented(
+        "Implicit broadcasting is currently unsupported in HLO evaluator; "
+        "shape mismatch: %s vs %s",
+        ShapeUtil::HumanString(shape).c_str(),
+        ShapeUtil::HumanString(operand->shape()).c_str());
+  }
+
+  auto result = LiteralUtil::CreateFromShape(shape);
+
+  TF_RETURN_IF_ERROR(LiteralUtil::Populate<ReturnT>(
+      result.get(), [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
+        return unary_op(
+            LiteralUtil::Get<NativeT>(operand_literal, multi_index));
+      }));
+  return std::move(result);
+}
+
+}  // namespace
+
+template <typename ReturnT>
+class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
+ public:
+  explicit TypedVisitor(HloEvaluator* p) : parent_(p) {}
+
+  Status DefaultAction(HloInstruction* hlo_instruction) override {
+    return Unimplemented("unhandled HLO ops for HloEvaluator: %s.",
+                         HloOpcodeString(hlo_instruction->opcode()).c_str());
+  };
+
+  // TODO(b/35950897): many of the stl functions used in the handlers are not
+  // overloaded for every XLA primitive type.
+
+  template <typename NativeT,
+            typename std::enable_if<
+                std::is_unsigned<NativeT>::value>::type* = nullptr>
+  Status HandleAbs(HloInstruction* abs, HloInstruction* operand) {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[abs],
+                        ElementWiseUnaryOp(abs, [](NativeT elem_operand) {
+                          return elem_operand;
+                        }));
+    return Status::OK();
+  }
+
+  template <
+      typename NativeT,
+      typename std::enable_if<std::is_signed<NativeT>::value>::type* = nullptr>
+  Status HandleAbs(HloInstruction* abs, HloInstruction* operand) {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[abs],
+                        ElementWiseUnaryOp(abs, [](NativeT elem_operand) {
+                          return std::abs(elem_operand);
+                        }));
+    return Status::OK();
+  }
+
+  Status HandleAbs(HloInstruction* abs, HloInstruction* operand) override {
+    return HandleAbs<ReturnT>(abs, operand);
+  };
+
+  Status HandleCeil(HloInstruction* ceil, HloInstruction* operand) override {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[ceil],
+                        ElementWiseUnaryOp(ceil, [](ReturnT elem_operand) {
+                          return std::ceil(elem_operand);
+                        }));
+    return Status::OK();
+  };
+
+  Status HandleCopy(HloInstruction* copy, HloInstruction* operand) override {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[copy],
+                        ElementWiseUnaryOp(copy, [](ReturnT elem_operand) {
+                          return elem_operand;
+                        }));
+    return Status::OK();
+  };
+
+  template <PrimitiveType src_type, PrimitiveType dest_type>
+  std::unique_ptr<Literal> ConvertIfTypesMatch(const Literal& src_literal) {
+    DCHECK_EQ(src_type, src_literal.shape().element_type());
+    return LiteralUtil::Convert<
+        typename primitive_util::PrimitiveTypeToNative<src_type>::type,
+        typename primitive_util::PrimitiveTypeToNative<dest_type>::type>(
+        src_literal);
+  }
+
+  Status HandleConvert(HloInstruction* convert,
+                       HloInstruction* operand) override {
+    auto operand_literal = parent_->GetEvaluatedLiteralFor(operand);
+
+    switch (operand->shape().element_type()) {
+#define CONVERT_IF_TYPES_MATCH(src_type)                                \
+  case (src_type):                                                      \
+    parent_->evaluated_[convert] = LiteralUtil::Convert<                \
+        typename primitive_util::PrimitiveTypeToNative<src_type>::type, \
+        ReturnT>(operand_literal);                                      \
+    break;
+      CONVERT_IF_TYPES_MATCH(PRED)
+      CONVERT_IF_TYPES_MATCH(S8)
+      CONVERT_IF_TYPES_MATCH(S32)
+      CONVERT_IF_TYPES_MATCH(S64)
+      CONVERT_IF_TYPES_MATCH(U8)
+      CONVERT_IF_TYPES_MATCH(U32)
+      CONVERT_IF_TYPES_MATCH(U64)
+      CONVERT_IF_TYPES_MATCH(F32)
+      CONVERT_IF_TYPES_MATCH(F64)
+#undef CONVERT_IF_TYPES_MATCH
+      // Other types are not yet supported.
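+      // (Element types without a case above, e.g. F16, fall through to the
+      // LOG(FATAL) in the default case below.)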
+      default:
+        LOG(FATAL) << "unimplemented operand type for HandleConvert: "
+                   << PrimitiveType_Name(operand->shape().element_type());
+    }
+
+    return Status::OK();
+  }
+
+  Status HandleExp(HloInstruction* exp, HloInstruction* operand) override {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[exp],
+                        ElementWiseUnaryOp(exp, [](ReturnT elem_operand) {
+                          return std::exp(elem_operand);
+                        }));
+    return Status::OK();
+  };
+
+  Status HandleFloor(HloInstruction* floor, HloInstruction* operand) override {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[floor],
+                        ElementWiseUnaryOp(floor, [](ReturnT elem_operand) {
+                          return std::floor(elem_operand);
+                        }));
+    return Status::OK();
+  };
+
+  Status HandleLog(HloInstruction* log, HloInstruction* operand) override {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[log],
+                        ElementWiseUnaryOp(log, [](ReturnT elem_operand) {
+                          return std::log(elem_operand);
+                        }));
+    return Status::OK();
+  };
+
+  Status HandleLogicalNot(HloInstruction* logical_not,
+                          HloInstruction* operand) override {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[logical_not],
+        ElementWiseUnaryOp(logical_not,
+                           [](ReturnT elem_operand) { return !elem_operand; }));
+    return Status::OK();
+  };
+
+  Status HandleNegate(HloInstruction* negate,
+                      HloInstruction* operand) override {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[negate],
+                        ElementWiseUnaryOp(negate, [](ReturnT elem_operand) {
+                          return -elem_operand;
+                        }));
+    return Status::OK();
+  };
+
+  Status HandleSign(HloInstruction* sign, HloInstruction* operand) override {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[sign],
+                        ElementWiseUnaryOp(sign, [](ReturnT elem_operand) {
+                          return (ReturnT(0) < elem_operand) -
+                                 (elem_operand < ReturnT(0));
+                        }));
+    return Status::OK();
+  };
+
+  Status HandleTanh(HloInstruction* tanh, HloInstruction* operand) override {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[tanh],
+                        ElementWiseUnaryOp(tanh, [](ReturnT elem_operand) {
+                          return std::tanh(elem_operand);
+                        }));
+    return Status::OK();
+  };
+
+  Status HandleMultiply(HloInstruction* multiply, HloInstruction* lhs,
+                        HloInstruction* rhs) override {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[multiply],
+        ElementWiseBinaryOp(multiply, [](ReturnT lhs_elem, ReturnT rhs_elem) {
+          return lhs_elem * rhs_elem;
+        }));
+    return Status::OK();
+  };
+
+  Status HandleSubtract(HloInstruction* subtract, HloInstruction* lhs,
+                        HloInstruction* rhs) override {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[subtract],
+        ElementWiseBinaryOp(subtract, [](ReturnT lhs_elem, ReturnT rhs_elem) {
+          return lhs_elem - rhs_elem;
+        }));
+    return Status::OK();
+  };
+
+  Status HandleAdd(HloInstruction* add, HloInstruction* lhs,
+                   HloInstruction* rhs) override {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[add],
+        ElementWiseBinaryOp(add, [](ReturnT lhs_elem, ReturnT rhs_elem) {
+          return lhs_elem + rhs_elem;
+        }));
+    return Status::OK();
+  };
+
+  Status HandleDivide(HloInstruction* divide, HloInstruction* lhs,
+                      HloInstruction* rhs) override {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[divide],
+        ElementWiseBinaryOp(divide, [](ReturnT lhs_elem, ReturnT rhs_elem) {
+          return lhs_elem / rhs_elem;
+        }));
+    return Status::OK();
+  };
+
+  Status HandleMaximum(HloInstruction* maximum, HloInstruction* lhs,
+                       HloInstruction* rhs) override {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[maximum],
+        ElementWiseBinaryOp(maximum, [](ReturnT lhs_el, ReturnT rhs_el) {
+          return std::fmax(lhs_el, rhs_el);
+        }));
+    return Status::OK();
+  };
+
+  Status HandleMinimum(HloInstruction* minimum, HloInstruction* lhs,
+                       HloInstruction* rhs) override {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[minimum],
+        ElementWiseBinaryOp(minimum, [](ReturnT lhs_el, ReturnT rhs_el) {
+          return std::fmin(lhs_el, rhs_el);
+        }));
+    return Status::OK();
+  };
+
+  Status HandlePower(HloInstruction* power, HloInstruction* lhs,
+                     HloInstruction* rhs) override {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[power],
+        ElementWiseBinaryOp(power, [](ReturnT lhs_el, ReturnT rhs_el) {
+          return std::pow(lhs_el, rhs_el);
+        }));
+    return Status::OK();
+  };
+
+  Status HandleRemainder(HloInstruction* remainder, HloInstruction* lhs,
+                         HloInstruction* rhs) override {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[remainder],
+        ElementWiseBinaryOp(remainder, [](ReturnT lhs_el, ReturnT rhs_el) {
+          return std::fmod(lhs_el, rhs_el);
+        }));
+    return Status::OK();
+  };
+
+  Status HandleLogicalAnd(HloInstruction* logical_and, HloInstruction* lhs,
+                          HloInstruction* rhs) override {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[logical_and],
+        ElementWiseBinaryOp(logical_and, [](ReturnT lhs_el, ReturnT rhs_el) {
+          return lhs_el && rhs_el;
+        }));
+    return Status::OK();
+  };
+
+  Status HandleLogicalOr(HloInstruction* logical_or, HloInstruction* lhs,
+                         HloInstruction* rhs) override {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[logical_or],
+        ElementWiseBinaryOp(logical_or, [](ReturnT lhs_el, ReturnT rhs_el) {
+          return lhs_el || rhs_el;
+        }));
+    return Status::OK();
+  };
+
+  Status HandleClamp(HloInstruction* clamp, HloInstruction* min,
+                     HloInstruction* arg, HloInstruction* max) override {
+    std::function<ReturnT(ReturnT, ReturnT, ReturnT)> clamp_op =
+        [](ReturnT low, ReturnT high, ReturnT value) {
+          return std::fmax(low, std::fmin(value, high));
+        };
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[clamp],
+                        ElementWiseTernaryOp(clamp, std::move(clamp_op)));
+    return Status::OK();
+  };
+
+  Status HandleSelect(HloInstruction* select, HloInstruction* pred,
+                      HloInstruction* on_true,
+                      HloInstruction* on_false) override {
+    CHECK(!ShapeUtil::IsTuple(select->shape()));
+    std::function<ReturnT(bool, ReturnT, ReturnT)> select_op =
+        [](bool pred, ReturnT on_true, ReturnT on_false) {
+          if (pred) {
+            return on_true;
+          }
+          return on_false;
+        };
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[select],
+                        ElementWiseTernaryOp(select, std::move(select_op)));
+    return Status::OK();
+  };
+
+  Status Preprocess(HloInstruction* hlo) override {
+    VLOG(2) << hlo->ToString();
+    return Status::OK();
+  };
+
+ private:
+  StatusOr<std::unique_ptr<Literal>> ElementWiseUnaryOp(
+      HloInstruction* instruction,
+      const std::function<ReturnT(ReturnT)>& unary_op) {
+    const Literal& operand_literal =
+        parent_->GetEvaluatedLiteralFor(instruction->operand(0));
+    return ElementWiseUnaryOpImpl<ReturnT, ReturnT>(instruction, unary_op,
+                                                    operand_literal);
+  }
+
+  StatusOr<std::unique_ptr<Literal>> ElementWiseBinaryOp(
+      HloInstruction* instruction,
+      const std::function<ReturnT(ReturnT, ReturnT)>& binary_op) {
+    const auto shape = instruction->shape();
+    const auto* lhs = instruction->operand(0);
+    const auto* rhs = instruction->operand(1);
+
+    // TODO(b/35950897, b/27796129): add DCHECK back once implicit broadcast is
+    // removed.
+    if (!(ShapeUtil::SameDimensions(shape, rhs->shape()) &&
+          ShapeUtil::SameDimensions(lhs->shape(), rhs->shape()))) {
+      return Unimplemented(
+          "Implicit broadcasting is currently unsupported in HLO evaluator; "
+          "shape mismatch: %s vs %s vs %s",
+          ShapeUtil::HumanString(shape).c_str(),
+          ShapeUtil::HumanString(lhs->shape()).c_str(),
+          ShapeUtil::HumanString(rhs->shape()).c_str());
+    }
+
+    const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs);
+    const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs);
+
+    auto result = LiteralUtil::CreateFromShape(shape);
+
+    TF_RETURN_IF_ERROR(LiteralUtil::Populate<ReturnT>(
+        result.get(), [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
+          return binary_op(LiteralUtil::Get<ReturnT>(lhs_literal, multi_index),
+                           LiteralUtil::Get<ReturnT>(rhs_literal, multi_index));
+        }));
+    return std::move(result);
+  }
+
+  template <typename LhsType, typename RhsType, typename EhsType>
+  StatusOr<std::unique_ptr<Literal>> ElementWiseTernaryOp(
+      HloInstruction* instruction,
+      const std::function<ReturnT(LhsType, RhsType, EhsType)>& ternary_op) {
+    const auto shape = instruction->shape();
+    const auto* lhs = instruction->operand(0);
+    const auto* rhs = instruction->operand(1);
+    const auto* ehs = instruction->operand(2);
+
+    // TODO(b/35950897, b/27796129): add DCHECK back once implicit broadcast is
+    // removed.
+    if (!(ShapeUtil::SameDimensions(shape, lhs->shape()) &&
+          ShapeUtil::SameDimensions(lhs->shape(), rhs->shape()) &&
+          ShapeUtil::SameDimensions(rhs->shape(), ehs->shape()))) {
+      return Unimplemented(
+          "Implicit broadcasting is currently unsupported in HLO evaluator; "
+          "shape mismatch: %s vs %s vs %s vs %s",
+          ShapeUtil::HumanString(shape).c_str(),
+          ShapeUtil::HumanString(lhs->shape()).c_str(),
+          ShapeUtil::HumanString(rhs->shape()).c_str(),
+          ShapeUtil::HumanString(ehs->shape()).c_str());
+    }
+
+    const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs);
+    const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs);
+    const Literal& ehs_literal = parent_->GetEvaluatedLiteralFor(ehs);
+
+    auto result = LiteralUtil::CreateFromShape(shape);
+
+    TF_RETURN_IF_ERROR(LiteralUtil::Populate<ReturnT>(
+        result.get(), [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
+          return ternary_op(LiteralUtil::Get<LhsType>(lhs_literal, multi_index),
+                            LiteralUtil::Get<RhsType>(rhs_literal, multi_index),
+                            LiteralUtil::Get<EhsType>(ehs_literal,
+                                                      multi_index));
+        }));
+
+    return std::move(result);
+  }
+
+  HloEvaluator* parent_;
+};
+
+HloEvaluator::HloEvaluator() {
+  typed_visitors_[PRED] = MakeUnique<TypedVisitor<bool>>(this);
+  typed_visitors_[U8] = MakeUnique<TypedVisitor<uint8>>(this);
+  typed_visitors_[U16] = MakeUnique<FunctionVisitor>([](HloInstruction*) {
+    return Unimplemented("unhandled primitive type: U16.");
+  });
+  typed_visitors_[U32] = MakeUnique<TypedVisitor<uint32>>(this);
+  typed_visitors_[U64] = MakeUnique<TypedVisitor<uint64>>(this);
+  typed_visitors_[S8] = MakeUnique<TypedVisitor<int8>>(this);
+  typed_visitors_[S16] = MakeUnique<FunctionVisitor>([](HloInstruction*) {
+    return Unimplemented("unhandled primitive type: S16.");
+  });
+  typed_visitors_[S32] = MakeUnique<TypedVisitor<int32>>(this);
+  typed_visitors_[S64] = MakeUnique<TypedVisitor<int64>>(this);
+  typed_visitors_[F16] = MakeUnique<FunctionVisitor>([](HloInstruction*) {
+    return Unimplemented("unhandled primitive type: F16.");
+  });
+  typed_visitors_[F32] = MakeUnique<TypedVisitor<float>>(this);
+  typed_visitors_[F64] = MakeUnique<TypedVisitor<double>>(this);
+  typed_visitors_[TUPLE] = MakeUnique<FunctionVisitor>([](HloInstruction*) {
+    return Unimplemented("unhandled primitive type: TUPLE.");
+  });
+  typed_visitors_[OPAQUE] = MakeUnique<FunctionVisitor>([](HloInstruction*) {
+    return Unimplemented("unhandled primitive type: OPAQUE.");
+  });
+}
+
+StatusOr<std::unique_ptr<Literal>> HloEvaluator::Evaluate(
+    HloComputation* computation,
+    tensorflow::gtl::ArraySlice<const Literal*> args) {
+  arg_literals_ = args;
+  evaluated_.clear();
+
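+  // Accept() walks the computation in post order, so each instruction is
+  // visited only after all of its operands have been evaluated and cached in
+  // evaluated_.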
TF_RETURN_IF_ERROR(computation->Accept(this)); + return MakeUnique( + GetEvaluatedLiteralFor(computation->root_instruction())); +} + +StatusOr> HloEvaluator::Evaluate( + HloInstruction* instruction, + tensorflow::gtl::ArraySlice operands) { + TF_RET_CHECK(hlo_query::AllOperandsAreParametersOrConstants(*instruction)); + TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(instruction->shape())); + + arg_literals_ = operands; + evaluated_.clear(); + + // Evaluate operands of Parameter type against the input literals which + // caches the evaluated literal results. + for (const auto operand : instruction->operands()) { + if (operand->opcode() == HloOpcode::kParameter) { + const Literal* input_literal = arg_literals_[operand->parameter_number()]; + VLOG(2) << "Parameter operand evaluated to: " + << LiteralUtil::ToString(*input_literal); + TF_RET_CHECK(ShapeUtil::Equal(operand->shape(), input_literal->shape())); + + evaluated_[operand] = MakeUnique(*input_literal); + } + } + + TF_RETURN_IF_ERROR(instruction->Visit(this)); + return MakeUnique(GetEvaluatedLiteralFor(instruction)); +} + +StatusOr> HloEvaluator::Evaluate( + HloInstruction* instruction) { + TF_RET_CHECK(hlo_query::AllOperandsAreConstants(*instruction)); + TF_RET_CHECK(instruction->opcode() != HloOpcode::kParameter); + TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(instruction->shape())); + + arg_literals_.clear(); + evaluated_.clear(); + TF_RETURN_IF_ERROR(instruction->Visit(this)); + return MakeUnique(GetEvaluatedLiteralFor(instruction)); +} + +std::unique_ptr HloEvaluator::TryEvaluate( + HloInstruction* instruction) { + auto result_or = Evaluate(instruction); + if (!result_or.ok()) { + VLOG(1) << "TryEvaluate failed:" << result_or.status(); + return nullptr; + } + + return result_or.ConsumeValueOrDie(); +} + +Status HloEvaluator::HandleParameter(HloInstruction* parameter) { + VLOG(2) << "HandleParameter: " << parameter->ToString(); + const Literal* input_literal = arg_literals_[parameter->parameter_number()]; + VLOG(2) << "Parameter evaluated to: " + << LiteralUtil::ToString(*input_literal); + DCHECK(ShapeUtil::Equal(parameter->shape(), input_literal->shape())); + + evaluated_[parameter] = MakeUnique(*input_literal); + return Status::OK(); +} + +Status HloEvaluator::HandleConstant(HloInstruction* constant, + const Literal& literal) { + VLOG(2) << "HandleConstant: " << constant->ToString(); + return Status::OK(); +} + +Status HloEvaluator::HandleReshape(HloInstruction* reshape) { + TF_ASSIGN_OR_RETURN( + evaluated_[reshape], + LiteralUtil::Reshape(GetEvaluatedLiteralFor(reshape->operand(0)), + AsInt64Slice(reshape->shape().dimensions()))); + return Status::OK(); +} + +Status HloEvaluator::HandleTranspose(HloInstruction* transpose) { + evaluated_[transpose] = LiteralUtil::Transpose( + GetEvaluatedLiteralFor(transpose->operand(0)), transpose->dimensions()); + return Status::OK(); +} + +Status HloEvaluator::HandleConcatenate( + HloInstruction* concatenate, + tensorflow::gtl::ArraySlice operands) { + // The result concatenate dimension is going to be the sum of all concatenate + // dimensions of the operands taking part of the operation. 
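+  // For example, concatenating f32[2,3] with f32[5,3] along dimension 0
+  // yields f32[7,3]; all other dimensions must match across the operands.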
+ const Shape& reference_shape = operands[0]->shape(); + CHECK(!ShapeUtil::IsTuple(reference_shape)); + const int64 rank = ShapeUtil::Rank(reference_shape); + const int64 concat_dim = concatenate->dimensions()[0]; + CHECK_GE(concat_dim, 0); + CHECK_LT(concat_dim, rank); + + DimensionVector concat_dimensions(reference_shape.dimensions().begin(), + reference_shape.dimensions().end()); + + for (int64 i = 1; i < operands.size(); ++i) { + const Shape& operand_shape = operands[i]->shape(); + CHECK(!ShapeUtil::IsTuple(operand_shape)); + // Accumulate the concat dimension from all tensors taking part to the + // operation. + concat_dimensions[concat_dim] += + ShapeUtil::GetDimension(operand_shape, concat_dim); + } + + auto result_literal = LiteralUtil::CreateFromDimensions( + reference_shape.element_type(), concat_dimensions); + DimensionVector source_indices(rank, 0); + DimensionVector dest_indices(concat_dimensions.size(), 0); + + for (auto operand : operands) { + const Shape& operand_shape = operand->shape(); + TF_RETURN_IF_ERROR(LiteralUtil::Copy( + GetEvaluatedLiteralFor(operand), source_indices, result_literal.get(), + dest_indices, AsInt64Slice(operand_shape.dimensions()))); + dest_indices[concat_dim] += + ShapeUtil::GetDimension(operand_shape, concat_dim); + } + + evaluated_[concatenate] = std::move(result_literal); + return Status::OK(); +} + +Status HloEvaluator::HandleIsFinite(HloInstruction* is_finite, + HloInstruction* operand) { + if (!ShapeUtil::ElementIsFloating(operand->shape())) { + return InvalidArgument( + "expected element type in shape to be float for IsFinite op, got: %s", + PrimitiveType_Name(operand->shape().element_type()).c_str()); + } + + switch (operand->shape().element_type()) { + case F16: + return Unimplemented("unhandled primitive type: F16."); + case F32: { + auto result_or = ElementWiseUnaryOpImpl( + is_finite, + [](float elem_operand) { return std::isfinite(elem_operand); }, + GetEvaluatedLiteralFor(operand)); + TF_ASSIGN_OR_RETURN(evaluated_[is_finite], std::move(result_or)); + break; + } + case F64: { + auto result_or = ElementWiseUnaryOpImpl( + is_finite, + [](double elem_operand) { return std::isfinite(elem_operand); }, + GetEvaluatedLiteralFor(operand)); + TF_ASSIGN_OR_RETURN(evaluated_[is_finite], std::move(result_or)); + break; + } + default: + LOG(FATAL) << "unknown/unhandled primitive type."; + } + + return Status::OK(); +} + +Status HloEvaluator::HandleCompare(HloInstruction* compare, HloOpcode opcode, + HloInstruction* lhs, HloInstruction* rhs) { + // TODO(b/35950897, b/27796129): add DCHECK back once implicit broadcast is + // removed. + if (!(ShapeUtil::SameDimensions(compare->shape(), rhs->shape()) && + ShapeUtil::SameDimensions(lhs->shape(), rhs->shape()))) { + return Unimplemented( + "Implicit broadcasting is currently unsupported in HLO evaluator " + "Shape Mismatch: %s vs %s vs %s", + ShapeUtil::HumanString(compare->shape()).c_str(), + ShapeUtil::HumanString(lhs->shape()).c_str(), + ShapeUtil::HumanString(rhs->shape()).c_str()); + } + + TF_RET_CHECK(lhs->shape().element_type() == rhs->shape().element_type()); + + const Literal& lhs_literal = GetEvaluatedLiteralFor(lhs); + const Literal& rhs_literal = GetEvaluatedLiteralFor(rhs); + + // Note here we switch on the operand's type. 
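+  // The typed visitors are keyed on the instruction's *result* element type,
+  // which for a comparison is always PRED; dispatching on it would lose the
+  // operand type needed to read the input literals, hence the switch below.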
+  switch (lhs->shape().element_type()) {
+    case PRED: {
+      TF_ASSIGN_OR_RETURN(
+          evaluated_[compare],
+          Compare<bool>(compare->shape(), opcode, lhs_literal, rhs_literal));
+    } break;
+    case U8: {
+      TF_ASSIGN_OR_RETURN(
+          evaluated_[compare],
+          Compare<uint8>(compare->shape(), opcode, lhs_literal, rhs_literal));
+    } break;
+    case U16:
+      return Unimplemented("unhandled primitive type: U16.");
+    case U32: {
+      TF_ASSIGN_OR_RETURN(
+          evaluated_[compare],
+          Compare<uint32>(compare->shape(), opcode, lhs_literal, rhs_literal));
+    } break;
+    case U64: {
+      TF_ASSIGN_OR_RETURN(
+          evaluated_[compare],
+          Compare<uint64>(compare->shape(), opcode, lhs_literal, rhs_literal));
+    } break;
+    case S8: {
+      TF_ASSIGN_OR_RETURN(
+          evaluated_[compare],
+          Compare<int8>(compare->shape(), opcode, lhs_literal, rhs_literal));
+    } break;
+    case S16:
+      return Unimplemented("unhandled primitive type: S16.");
+    case S32: {
+      TF_ASSIGN_OR_RETURN(
+          evaluated_[compare],
+          Compare<int32>(compare->shape(), opcode, lhs_literal, rhs_literal));
+    } break;
+    case S64: {
+      TF_ASSIGN_OR_RETURN(
+          evaluated_[compare],
+          Compare<int64>(compare->shape(), opcode, lhs_literal, rhs_literal));
+    } break;
+    case F16:
+      return Unimplemented("unhandled primitive type: F16.");
+    case F32: {
+      TF_ASSIGN_OR_RETURN(
+          evaluated_[compare],
+          Compare<float>(compare->shape(), opcode, lhs_literal, rhs_literal));
+    } break;
+    case F64: {
+      TF_ASSIGN_OR_RETURN(
+          evaluated_[compare],
+          Compare<double>(compare->shape(), opcode, lhs_literal, rhs_literal));
+    } break;
+    default:
+      LOG(FATAL) << "unknown primitive type.";
+  }
+
+  return Status::OK();
+}
+
+Status HloEvaluator::HandleSlice(HloInstruction* slice,
+                                 HloInstruction* operand) {
+  const Shape& shape = slice->shape();
+  auto literal = LiteralUtil::CreateFromDimensions(
+      shape.element_type(), AsInt64Slice(shape.dimensions()));
+
+  DimensionVector dest_indices(slice->slice_starts().size(), 0);
+
+  TF_RETURN_IF_ERROR(LiteralUtil::Copy(
+      GetEvaluatedLiteralFor(operand), slice->slice_starts(), literal.get(),
+      dest_indices, AsInt64Slice(shape.dimensions())));
+
+  evaluated_[slice] = std::move(literal);
+  return Status::OK();
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.h b/tensorflow/compiler/xla/service/hlo_evaluator.h
new file mode 100644
index 00000000000..91fd56f54c5
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.h
@@ -0,0 +1,158 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EVALUATOR_H_
+#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EVALUATOR_H_
+
+#include <memory>
+
+#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace xla {
+
+// Evaluates HLO instructions and computations, producing literals as the
+// evaluation results.
+//
+// This class is not thread-safe.
+class HloEvaluator : public DfsHloVisitorWithDefault {
+ public:
+  HloEvaluator();
+  // Evaluates an HLO computation against an array of pointers to literals.
+  // Returns the evaluated result as a literal if successful.
+  // Precondition: the argument literals correspond to the input computation's
+  // parameters in their post-ordering. For example, consider the following
+  // graph:
+  //
+  //                *
+  //              /   \
+  //             +    Parameter1
+  //           /   \
+  //          /     \
+  //    Parameter0  Constant
+  //
+  // The input literals array will have its first literal map to Parameter0
+  // and the second map to Parameter1.
+  StatusOr<std::unique_ptr<Literal>> Evaluate(
+      HloComputation* computation,
+      tensorflow::gtl::ArraySlice<const Literal*> arg_literals);
+
+  // Evaluates a single HLO instruction against an array of pointers to
+  // literals. Returns the evaluated result as a literal if successful.
+  // Preconditions:
+  // 1. argument literals correspond to the input instruction's parameters in
+  //    their post-ordering.
+  // 2. the instruction's operands must be of either Parameter or Constant
+  //    type.
+  // TODO(b/35950897): implement more ops other than element-wise ops.
+  StatusOr<std::unique_ptr<Literal>> Evaluate(
+      HloInstruction* instruction,
+      tensorflow::gtl::ArraySlice<const Literal*> arg_literals);
+
+  // Evaluates a single HLO instruction with constant operands.
+  // Returns the evaluated result as a literal if successful.
+  // Preconditions:
+  // 1. all operands of the input instruction are constants.
+  // 2. the instruction is not a Parameter operation.
+  StatusOr<std::unique_ptr<Literal>> Evaluate(HloInstruction* instruction);
+
+  // Same as Evaluate, except it returns nullptr on error.
+  std::unique_ptr<Literal> TryEvaluate(HloInstruction* instruction);
+
+ protected:
+  // Templated DfsHloVisitor. Typically ReturnT here indicates the resulting
+  // literal type of each evaluated Handle* method of a TypedVisitor.
+  // There are, however, a few notable exceptions to this rule:
+  // - HandleCompare and HandleIsFinite: the resulting literal type is always
+  //   boolean.
+  // These operations are handled by the parent HloEvaluator directly, rather
+  // than from within TypedVisitor.
+  template <typename ReturnT>
+  class TypedVisitor;
+
+  // Wraps around instruction handling to infer types before dispatching to
+  // the corresponding typed visitor.
+  Status DefaultAction(HloInstruction* hlo) override {
+    return hlo->Visit(typed_visitors_.at(hlo->shape().element_type()).get());
+  }
+
+  // Operations that are type-agnostic.
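+  // These are handled directly by HloEvaluator rather than by the typed
+  // visitors, either because they only move or reshape data (no per-element
+  // arithmetic), or because, as with kCompare and kIsFinite, the result
+  // element type differs from the operand element type.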
+  //
+  Status HandleParameter(HloInstruction* parameter) override;
+
+  Status HandleConstant(HloInstruction* constant,
+                        const Literal& literal) override;
+
+  Status HandleConcatenate(
+      HloInstruction* concatenate,
+      tensorflow::gtl::ArraySlice<HloInstruction*> operands) override;
+
+  Status HandleReshape(HloInstruction* reshape) override;
+
+  Status HandleSlice(HloInstruction* slice, HloInstruction* operand) override;
+
+  Status HandleTranspose(HloInstruction* transpose) override;
+
+  Status HandleIsFinite(HloInstruction* is_finite,
+                        HloInstruction* operand) override;
+
+  Status HandleCompare(HloInstruction* compare, HloOpcode opcode,
+                       HloInstruction* lhs, HloInstruction* rhs) override;
+
+ private:
+  // Returns the already-evaluated literal result for the instruction.
+  // A Constant instruction is considered evaluated and its literal will be
+  // returned directly without looking up the cache.
+  // Crashes (with a logged message) if the given instruction has not been
+  // evaluated previously.
+  const Literal& GetEvaluatedLiteralFor(const HloInstruction* hlo) {
+    if (hlo->IsConstant()) {
+      return hlo->literal();
+    }
+    auto it = evaluated_.find(hlo);
+    CHECK(it != evaluated_.end())
+        << "could not find evaluated value for: " << hlo->ToString();
+    return *(it->second);
+  }
+
+  // Map from a primitive type to its associated (templated) DfsHloVisitor.
+  // Note: the hash function here is only needed because current gcc std::hash
+  // does not specialize for enum types. This should however be fixed in the
+  // future: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=60970#c5
+  tensorflow::gtl::FlatMap<PrimitiveType, std::unique_ptr<DfsHloVisitor>,
+                           std::hash<int>>
+      typed_visitors_;
+
+  // Tracks the HLO instruction and its evaluated literal result.
+  // TODO(b/35950897): have better memory management here to free evaluated
+  // results that no longer feed any other subsequent instruction in the
+  // post-ordering.
+  tensorflow::gtl::FlatMap<const HloInstruction*, std::unique_ptr<Literal>>
+      evaluated_;
+
+  // Stores input literals, assuming they are in post-order. Literals are not
+  // owned by this class, and they must outlive the lifetime of the instance
+  // of this class.
+  tensorflow::gtl::ArraySlice<const Literal*> arg_literals_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(HloEvaluator);
+};
+
+}  // namespace xla
+
+#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EVALUATOR_H_
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
new file mode 100644
index 00000000000..b26ece28b75
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
@@ -0,0 +1,245 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/ +#include "tensorflow/compiler/xla/service/hlo_evaluator.h" + +#include +#include +#include +#include + +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/tests/literal_test_util.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { +namespace { + +class HloEvaluatorTest : public ::testing::Test { + protected: + HloEvaluatorTest() { evaluator_ = MakeUnique(); } + + std::unique_ptr evaluator_; +}; + +// Verifies that HloEvaluator evaluates a HLO instruction that performs clamp +// with 3 operands. +TEST_F(HloEvaluatorTest, DoesClamp) { + auto low = LiteralUtil::CreateR2({{0.f, 2.f}, {2.f, 4.f}}); + auto high = LiteralUtil::CreateR2({{2.f, 4.f}, {4.f, 4.f}}); + auto value = LiteralUtil::CreateR2({{0.f, 5.f}, {0.f, 4.f}}); + + Shape shape = low->shape(); + auto c1 = HloInstruction::CreateConstant(std::move(low)); + auto c2 = HloInstruction::CreateConstant(std::move(high)); + auto c3 = HloInstruction::CreateConstant(std::move(value)); + auto instruction = HloInstruction::CreateTernary( + shape, HloOpcode::kClamp, c1.get(), c2.get(), c3.get()); + + std::unique_ptr result = + evaluator_->Evaluate(instruction.get(), {}).ConsumeValueOrDie(); + + auto expected = LiteralUtil::CreateR2({{0, 4}, {2, 4}}); + + EXPECT_TRUE(LiteralUtil::Equal(*result, *expected)); +} + +// Verifies that HloEvaluator evaluates a HLO instruction that performs select +// with 3 operands. +TEST_F(HloEvaluatorTest, DoesSelect) { + auto pred = LiteralUtil::CreateR2({{true, false}, {false, true}}); + auto on_true = LiteralUtil::CreateR2({{2.f, 4.f}, {4.f, 4.f}}); + auto on_false = LiteralUtil::CreateR2({{0.f, 5.f}, {0.f, 4.f}}); + + Shape shape = on_true->shape(); + auto c1 = HloInstruction::CreateConstant(std::move(pred)); + auto c2 = HloInstruction::CreateConstant(std::move(on_true)); + auto c3 = HloInstruction::CreateConstant(std::move(on_false)); + auto instruction = HloInstruction::CreateTernary( + shape, HloOpcode::kSelect, c1.get(), c2.get(), c3.get()); + + std::unique_ptr result = + evaluator_->Evaluate(instruction.get(), {}).ConsumeValueOrDie(); + + auto expected = LiteralUtil::CreateR2({{2, 5}, {0, 4}}); + + EXPECT_TRUE(LiteralUtil::Equal(*result, *expected)); +} + +// Verifies that HloEvaluator evaluates a HLO instruction that performs +// element-wise addition with 2 operands. 
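+// (As in the tests above, the argument list passed to Evaluate() is empty
+// because every operand is a constant; argument literals are only consulted
+// for Parameter operands.)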
+TEST_F(HloEvaluatorTest, DoesAdd) { + auto lhs = LiteralUtil::CreateR2({{1, 0}, {-100, 4}}); + auto rhs = LiteralUtil::CreateR2({{2, 4}, {4, 4}}); + + Shape shape = ShapeUtil::MakeShape(S64, {2, 2}); + auto c1 = HloInstruction::CreateConstant(std::move(lhs)); + auto c2 = HloInstruction::CreateConstant(std::move(rhs)); + auto instruction = + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, c1.get(), c2.get()); + + std::unique_ptr result = + evaluator_->Evaluate(instruction.get(), {}).ConsumeValueOrDie(); + + auto expected = LiteralUtil::CreateR2({{3, 4}, {-96, 8}}); + + EXPECT_TRUE(LiteralUtil::Equal(*result, *expected)); +} + +// Verifies that HloEvaluator evaluates a HLO instruction that performs +// element-wise divide with 2 operands. +TEST_F(HloEvaluatorTest, DoesDivide) { + auto lhs_s64 = LiteralUtil::CreateR2({{1, 0}, {-100, 4}}); + auto rhs_s64 = LiteralUtil::CreateR2({{2, 4}, {4, 4}}); + + Shape shape_s64 = ShapeUtil::MakeShape(S64, {2, 2}); + auto c1_s64 = HloInstruction::CreateConstant(std::move(lhs_s64)); + auto c2_s64 = HloInstruction::CreateConstant(std::move(rhs_s64)); + auto instruction = HloInstruction::CreateBinary(shape_s64, HloOpcode::kDivide, + c1_s64.get(), c2_s64.get()); + + std::unique_ptr result = + evaluator_->Evaluate(instruction.get(), {}).ConsumeValueOrDie(); + + auto expected = LiteralUtil::CreateR2({{0, 0}, {-25, 1}}); + + EXPECT_TRUE(LiteralUtil::Equal(*result, *expected)); + + auto lhs_f64 = LiteralUtil::CreateR2({{1.0, 0.0}, {-100.0, 4.0}}); + auto rhs_f64 = LiteralUtil::CreateR2({{2.2, 4.0}, {4.0, 4.0}}); + + Shape shape_f64 = ShapeUtil::MakeShape(F64, {2, 2}); + auto c1_f64 = HloInstruction::CreateConstant(std::move(lhs_f64)); + auto c2_f64 = HloInstruction::CreateConstant(std::move(rhs_f64)); + instruction = HloInstruction::CreateBinary(shape_f64, HloOpcode::kDivide, + c1_f64.get(), c2_f64.get()); + + result = evaluator_->Evaluate(instruction.get(), {}).ConsumeValueOrDie(); + + expected = + LiteralUtil::CreateR2({{0.45454545454545453, 0}, {-25, 1}}); + + EXPECT_TRUE(LiteralUtil::Equal(*result, *expected)); +} + +// Verifies that HloEvaluator evaluates a HLO instruction that performs +// element-wise abs op with 1 operand. +TEST_F(HloEvaluatorTest, DoesAbs) { + auto operand = LiteralUtil::CreateR2({{1, -20}, {-100, 4}}); + const Shape& shape = ShapeUtil::MakeShape(S64, {2, 2}); + auto c1 = HloInstruction::CreateConstant(std::move(operand)); + auto instruction = + HloInstruction::CreateUnary(shape, HloOpcode::kAbs, c1.get()); + + std::unique_ptr result = + evaluator_->Evaluate(instruction.get(), {}).ConsumeValueOrDie(); + + auto expected = LiteralUtil::CreateR2({{1, 20}, {100, 4}}); + + EXPECT_TRUE(LiteralUtil::Equal(*result, *expected)); + + // For R0 literal. + const Shape& r0 = ShapeUtil::MakeShape(F32, {}); + operand = LiteralUtil::CreateR0(-1.0f); + c1 = HloInstruction::CreateConstant(std::move(operand)); + instruction = HloInstruction::CreateUnary(r0, HloOpcode::kAbs, c1.get()); + result = evaluator_->Evaluate(instruction.get()).ConsumeValueOrDie(); + expected = LiteralUtil::CreateR0(1.0f); + + EXPECT_TRUE(LiteralUtil::Equal(*result, *expected)); + + // For R1 literal with dimension of size 0. 
+  Shape empty_r1 = ShapeUtil::MakeShape(F32, {0});
+  operand = LiteralUtil::CreateR1<float>({});
+  c1 = HloInstruction::CreateConstant(std::move(operand));
+  instruction =
+      HloInstruction::CreateUnary(empty_r1, HloOpcode::kAbs, c1.get());
+
+  result = evaluator_->Evaluate(instruction.get()).ConsumeValueOrDie();
+  expected = LiteralUtil::CreateR1<float>({});
+
+  EXPECT_TRUE(LiteralUtil::Equal(*result, *expected));
+}
+
+// Verifies that HloEvaluator evaluates an HLO computation whose operands are
+// neither parameters nor constants, i.e., it traverses the instructions.
+TEST_F(HloEvaluatorTest, DoesTraverseInstructions) {
+  HloComputation::Builder builder(
+      ::testing::UnitTest::GetInstance()->current_test_info()->name());
+
+  auto lhs = LiteralUtil::CreateR2<int64>({{1, 0}, {-100, 4}});
+  auto rhs = LiteralUtil::CreateR2<int64>({{2, 4}, {4, 4}});
+  auto rhs2 = LiteralUtil::CreateR2<int64>({{1, -20}, {-100, 4}});
+  std::vector<const Literal*> args = {lhs.get(), rhs.get(), rhs2.get()};
+
+  Shape shape = ShapeUtil::MakeShape(S64, {2, 2});
+
+  auto param_lhs = HloInstruction::CreateParameter(0, shape, "lhs");
+  auto param_rhs = HloInstruction::CreateParameter(1, shape, "rhs");
+  auto lhs_instruction = HloInstruction::CreateBinary(
+      shape, HloOpcode::kAdd, param_lhs.get(), param_rhs.get());
+
+  auto param_rhs2 = HloInstruction::CreateParameter(2, shape, "rhs2");
+  auto root_instruction = HloInstruction::CreateBinary(
+      shape, HloOpcode::kAdd, lhs_instruction.get(), param_rhs2.get());
+
+  builder.AddInstruction(std::move(root_instruction));
+  std::unique_ptr<Literal> result =
+      evaluator_->Evaluate(builder.Build().get(), args).ConsumeValueOrDie();
+
+  auto expected = LiteralUtil::CreateR2<int64>({{4, -16}, {-196, 12}});
+
+  EXPECT_TRUE(LiteralUtil::Equal(*result, *expected));
+}
+
+// Verifies that a Transpose of a constant is correctly evaluated.
+TEST_F(HloEvaluatorTest, DoesTranspose) {
+  HloComputation::Builder builder(
+      ::testing::UnitTest::GetInstance()->current_test_info()->name());
+
+  const int64 dimensions[] = {11, 8, 7, 5, 9};
+  TF_ASSIGN_OR_ASSERT_OK(auto literal,
+                         LiteralTestUtil::CreateRandomLiteral<F32>(
+                             ShapeUtil::MakeShape(F32, dimensions), 0.0, 1.0));
+  auto literal_clone = LiteralUtil::CloneToUnique(*literal);
+  HloInstruction* literal_instruction = builder.AddInstruction(
+      HloInstruction::CreateConstant(std::move(literal)));
+
+  Shape shape = ShapeUtil::MakeShape(F32, {8, 7, 11, 9, 5});
+  const int64 permutation[] = {1, 2, 0, 4, 3};
+  builder.AddInstruction(HloInstruction::CreateTranspose(
+      shape, literal_instruction, permutation));
+
+  std::unique_ptr<Literal> result =
+      evaluator_->Evaluate(builder.Build().get(), {}).ConsumeValueOrDie();
+
+  using NativeT = typename primitive_util::PrimitiveTypeToNative<F32>::type;
+  LiteralUtil::EachCell<NativeT>(
+      *result,
+      [&](tensorflow::gtl::ArraySlice<int64> indices, NativeT value) {
+        std::vector<int64> rindexes = Permute(permutation, indices);
+        EXPECT_TRUE(value ==
+                    LiteralUtil::Get<NativeT>(*literal_clone, rindexes));
+      });
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.cc b/tensorflow/compiler/xla/service/hlo_execution_profile.cc
index 0b87b04fc4b..9e25f1aceb1 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile.cc
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile.cc
@@ -21,6 +21,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/metric_table_report.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/strings/numbers.h" @@ -32,6 +33,7 @@ namespace xla { void HloExecutionProfile::AddProfileResult(const HloInstruction* hlo, uint64 cycles_taken) { hlo_to_cycles_taken_[hlo] = cycles_taken; + profiled_computations_.insert(hlo->parent()); } uint64 HloExecutionProfile::GetProfileResult(const HloInstruction& hlo) const { @@ -43,63 +45,104 @@ uint64 HloExecutionProfile::GetProfileResult(const HloInstruction& hlo) const { } string HloExecutionProfile::ToString( + const HloComputation& computation, const DeviceDescription& device_description, - const HloCostAnalysis& cost_analysis) const { + const HloCostAnalysis::ShapeSizeFunction& shape_size) const { + HloCostAnalysis cost_analysis(shape_size); + tensorflow::Status analysis_status = + computation.root_instruction()->Accept(&cost_analysis); + if (!analysis_status.ok()) { + return ""; + } + using Item = std::pair; - std::vector items(hlo_to_cycles_taken_.begin(), - hlo_to_cycles_taken_.end()); + std::vector items; + for (Item item : hlo_to_cycles_taken_) { + // Only include the HLOs which are part of the desired computation. + if (item.first->parent() == &computation) { + items.push_back(item); + } + } auto custom_less = [](const Item& lhs, const Item& rhs) { return lhs.second > rhs.second; }; std::sort(items.begin(), items.end(), custom_less); string result; - const int64 total_cycles = total_cycles_executed(); + const int64 total_cycles = total_cycles_executed(computation); double clock_rate_ghz = device_description.clock_rate_ghz(); + CHECK_GE(clock_rate_ghz, 1e-9); const auto cycles_to_microseconds = [&](double cycles) { return cycles / clock_rate_ghz / 1000.0; }; - auto append_item = [&](int64 cycles, int64 flops, const string& name) { + auto append_item = [&](int64 cycles, int64 flops, int64 bytes_accessed, + const string& name) { double nsecs = cycles / clock_rate_ghz; + string bytes_per_sec; + string bytes_per_cycle; + if (cycles <= 0 || bytes_accessed < 0) { + bytes_per_sec = ""; + bytes_per_cycle = ""; + } else { + bytes_per_sec = tensorflow::strings::HumanReadableNumBytes( + bytes_accessed / (nsecs / 1e9)); + bytes_per_cycle = + tensorflow::strings::HumanReadableNumBytes(bytes_accessed / cycles); + } + + double cycles_percent = 0; + if (total_cycles > 0) { + cycles_percent = cycles / static_cast(total_cycles) * 100; + } + tensorflow::strings::StrAppend( &result, tensorflow::strings::Printf( - "%15lld cycles (%6.2f%%) :: %12.1f usec @ f_nom :: %18s :: %s", - cycles, cycles / static_cast(total_cycles) * 100, - cycles_to_microseconds(cycles), + "%15lld cycles (%6.2f%%) :: %12.1f usec @ f_nom :: %18s :: %12s/s " + ":: " + "%12s/cycle :: " + "%s", + cycles, cycles_percent, cycles_to_microseconds(cycles), flops <= 0 ? 
"" : HumanReadableNumFlops(flops, nsecs).c_str(), - name.c_str())); + bytes_per_sec.c_str(), bytes_per_cycle.c_str(), name.c_str())); }; tensorflow::strings::StrAppend( - &result, - tensorflow::strings::Printf("HLO execution profile: (%s @ f_nom)\n\t", - tensorflow::strings::HumanReadableElapsedTime( - total_cycles / clock_rate_ghz / 1e9) - .c_str())); - append_item(total_cycles, -1, "[total]"); + &result, tensorflow::strings::Printf( + "HLO execution profile for %s: (%s @ f_nom)\n\t", + computation.name().c_str(), + tensorflow::strings::HumanReadableElapsedTime( + total_cycles / clock_rate_ghz / 1e9) + .c_str())); + + append_item(total_cycles, -1, -1, "[total]"); for (const auto& item : items) { + const HloInstruction* hlo = item.first; tensorflow::strings::StrAppend(&result, "\n\t"); - auto flops = item.first == nullptr - ? -1 - : cost_analysis.hlo_to_flop_count(*item.first); - string display = item.first == nullptr ? "" : item.first->ToString(); - append_item(item.second, flops, display); + const int64 flops = (hlo == nullptr) ? -1 : cost_analysis.flop_count(*hlo); + const int64 bytes_accessed = + (hlo == nullptr) ? -1 : cost_analysis.bytes_accessed(*hlo); + const string display = (hlo == nullptr) ? "" : hlo->ToString(); + append_item(item.second, flops, bytes_accessed, display); } - MetricTableReport table; - table.SetMetricName("microseconds"); - table.SetEntryName("ops"); - table.SetShowCategoryTable(); - for (const auto& item : items) { - MetricTableReport::Entry entry; - entry.text = item.first->ToString(); - entry.short_text = item.first->ToString(/*compact_operands=*/true); - entry.category_text = item.first->ToCategory(); - entry.metric = cycles_to_microseconds(item.second); - table.AddEntry(std::move(entry)); + if (total_cycles <= 0) { + result += "****** 0 total cycles ******\n"; + } else { + MetricTableReport table; + table.SetMetricName("microseconds"); + table.SetEntryName("ops"); + table.SetShowCategoryTable(); + for (const auto& item : items) { + MetricTableReport::Entry entry; + entry.text = item.first->ToString(); + entry.short_text = item.first->ToString(/*compact_operands=*/true); + entry.category_text = item.first->ToCategory(); + entry.metric = cycles_to_microseconds(item.second); + table.AddEntry(std::move(entry)); + } + result += table.MakeReport(cycles_to_microseconds(total_cycles)); } - result += table.MakeReport(cycles_to_microseconds(total_cycles)); return result; } diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.h b/tensorflow/compiler/xla/service/hlo_execution_profile.h index 6cc20798139..70b94a3f950 100644 --- a/tensorflow/compiler/xla/service/hlo_execution_profile.h +++ b/tensorflow/compiler/xla/service/hlo_execution_profile.h @@ -43,27 +43,45 @@ class HloExecutionProfile { uint64 GetProfileResult(const HloInstruction& hlo) const; // Return the number of cycles this computation took to execute. - uint64 total_cycles_executed() const { return total_cycles_executed_; } + uint64 total_cycles_executed(const HloComputation& computation) const { + auto it = total_cycles_executed_.find(&computation); + if (it != total_cycles_executed_.end()) { + return it->second; + } + return 0; + } - // Record how many cycles the entire computation took to execute. - void set_total_cycles_executed(uint64 total_cycles_executed) { - total_cycles_executed_ = total_cycles_executed; + // Record how many cycles a computation took to execute. 
+ void set_total_cycles_executed(const HloComputation& computation, + uint64 total_cycles_executed) { + total_cycles_executed_[&computation] = total_cycles_executed; } // Returns a version of the execution profile suitable for performance // debugging; e.g. emits cycle counts, execution time at the nominal device // frequency, and the effective throughput given the provided cost_analysis - // for the operations. - string ToString(const DeviceDescription& device_description, - const HloCostAnalysis& cost_analysis) const; + // for the operations in a given computation. + // Returns an empty string if it wasn't possible to generate a printable + // version. + string ToString(const HloComputation& computation, + const DeviceDescription& device_description, + const HloCostAnalysis::ShapeSizeFunction& shape_size) const; + + // Returns the computations we have profiled. + std::unordered_set<const HloComputation*> profiled_computations() const { + return profiled_computations_; + } private: // Contains a mapping from HLO to the number of cycles it took to execute it. std::unordered_map<const HloInstruction*, uint64> hlo_to_cycles_taken_; - // If non-empty, contains the total number of cycles this computation took to + // If non-empty, contains the total number of cycles a computation took to // execute. - uint64 total_cycles_executed_ = 0; + std::unordered_map<const HloComputation*, uint64> total_cycles_executed_; + + // The computations we have profiled. + std::unordered_set<const HloComputation*> profiled_computations_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc index 631e784755d..eb2e5dfb37f 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc @@ -15,14 +15,17 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h" +#include #include #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/legacy_flags/hlo_graph_dumper_flags.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_tfgraph_builder.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/window_util.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/lib/strings/numbers.h" @@ -45,6 +48,73 @@ namespace xla { namespace hlo_graph_dumper { namespace { +// Node color schemes, used by NodeColorAttributes. +enum ColorScheme { + kBlue, + kBrown, + kDarkBlue, + kDarkGreen, + kDarkRed, + kGray, + kGreen, + kOrange, + kPurple, + kRed, + kWhite, + kYellow, +};
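The append_item helper earlier in this file's diff is plain unit conversion: with the clock rate in GHz, one cycle is 1/ghz nanoseconds, so microseconds = cycles / ghz / 1000, and throughput divides bytes accessed by elapsed seconds. A small sketch with made-up numbers (the 2 GHz clock and the counts are illustrative only):

```c++
#include <cstdio>

int main() {
  const double clock_rate_ghz = 2.0;         // cycles per nanosecond
  const long long cycles = 4000000;          // profiled cycle count
  const long long bytes_accessed = 1 << 26;  // 64 MiB

  const double nsecs = cycles / clock_rate_ghz;           // 2e6 ns
  const double usecs = cycles / clock_rate_ghz / 1000.0;  // 2000 us
  const double bytes_per_sec = bytes_accessed / (nsecs / 1e9);
  const double bytes_per_cycle =
      static_cast<double>(bytes_accessed) / cycles;

  std::printf("%.1f us, %.3g bytes/s, %.3g bytes/cycle\n", usecs,
              bytes_per_sec, bytes_per_cycle);
  return 0;
}
```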
+ +// Given a ColorScheme, returns an attribute string for a node of that color. +// Sets the node's fill, stroke, and text colors. +// +// Colors are from https://material.io/color. +string NodeColorAttributes(ColorScheme color) { + using std::make_tuple; + + const char *fill_color, *stroke_color, *font_color; + std::tie(fill_color, stroke_color, font_color) = + [color]() -> std::tuple<const char*, const char*, const char*> { + switch (color) { + case kBlue: + return make_tuple("#bbdefb", "#8aacc8", "black"); + case kBrown: + return make_tuple("#bcaaa4", "#8c7b75", "black"); + case kDarkBlue: + return make_tuple("#1565c0", "#003c8f", "white"); + case kDarkGreen: + return make_tuple("#2e7d32", "#005005", "white"); + case kDarkRed: + return make_tuple("#b71c1c", "#7f0000", "white"); + case kGray: + return make_tuple("#cfd8dc", "#9ea7aa", "black"); + case kGreen: + return make_tuple("#c8e6c9", "#97b498", "black"); + case kOrange: + return make_tuple("#ffe0b2", "#cbae82", "black"); + case kPurple: + return make_tuple("#e1bee7", "#af8eb5", "black"); + case kRed: + return make_tuple("#ffcdd2", "#cb9ca1", "black"); + case kWhite: + return make_tuple("white", "black", "black"); + case kYellow: + return make_tuple("#fff9c4", "#cbc693", "black"); + } + }(); + + return Printf( + "style=filled, fontcolor=\"%s\", color=\"%s\", fillcolor=\"%s\"", + font_color, stroke_color, fill_color); +} + +// Replaces < and > with &lt; and &gt;, so that this string is safe(r) for use +// in a graphviz HTML-like string. +string HtmlLikeStringSanitize(tensorflow::StringPiece s) { + return tensorflow::str_util::StringReplace( + tensorflow::str_util::StringReplace(s, "<", "&lt;", /*replace_all=*/true), + ">", "&gt;", /*replace_all=*/true); +} + // Returns the dot graph identifier for the given instruction. string InstructionId(const HloInstruction* instruction) { return Printf("%lld", reinterpret_cast<uint64>(instruction)); } @@ -55,68 +125,6 @@ string ComputationId(const HloComputation* computation) { return Printf("%lld", reinterpret_cast<uint64>(computation)); } -// Returns a compact string that represents the convolution dimension numbers. -string ConvolutionDimensionNumbersToString( - const ConvolutionDimensionNumbers& dim_numbers) { - return Printf("B@%lld,Z@%lld,KIZ@%lld,KOZ@%lld", - dim_numbers.batch_dimension(), dim_numbers.feature_dimension(), - dim_numbers.kernel_input_feature_dimension(), - dim_numbers.kernel_output_feature_dimension()); -} - -// Returns a compact string that represents the non-trivial fields in the window -// description. If there are no non-trivial fields, the empty string is -// returned. -string WindowToString(const Window& window) { - bool display_padding = false; - bool display_window_dilation = false; - bool display_base_dilation = false; - bool display_stride = false; - for (const WindowDimension& dimension : window.dimensions()) { - display_padding |= - dimension.padding_low() != 0 || dimension.padding_high() != 0; - display_window_dilation |= dimension.window_dilation() != 1; - display_base_dilation |= dimension.base_dilation() != 1; - display_stride |= dimension.stride() != 1; - } - std::vector<string> pieces = {}; - if (display_padding) { - pieces.push_back("\\n"); - pieces.push_back("padding=["); - for (const WindowDimension& dimension : window.dimensions()) { - pieces.push_back(StrCat("(", dimension.padding_low(), ",", - dimension.padding_high(), ")")); - pieces.push_back(", "); - } - pieces.pop_back(); - pieces.push_back("]"); - } - // Make a convenient lambda that adds a simple int64 field in each - // WindowDimension.
- auto add_field = [&pieces, &window]( - const string& label, - tensorflow::protobuf_int64 (WindowDimension::*member)() const) { - pieces.push_back("\\n"); - pieces.push_back(label + "=["); - for (const WindowDimension& dimension : window.dimensions()) { - pieces.push_back(StrCat(((&dimension)->*member)())); - pieces.push_back(", "); - } - pieces.pop_back(); - pieces.push_back("]"); - }; - if (display_window_dilation) { - add_field("window_dilation", &WindowDimension::window_dilation); - } - if (display_base_dilation) { - add_field("base_dilation", &WindowDimension::base_dilation); - } - if (display_stride) { - add_field("stride", &WindowDimension::stride); - } - return Join(pieces, ""); -} - // Returns the dot graph edges and nodes for the given instruction sequence. // Edges which extend between computations are added to the vector // intercomputation_edges. This is necessary because graphviz does not render @@ -135,7 +143,8 @@ string InstructionSequenceGraph( std::vector param_instructions; for (auto& instruction : instructions) { if (instruction->opcode() == HloOpcode::kParameter) { - int64 param_number = instruction->parameter_number(); + size_t param_number = instruction->parameter_number(); + if (param_instructions.size() < param_number + 1) { param_instructions.resize(param_number + 1, nullptr); } @@ -160,25 +169,38 @@ string InstructionSequenceGraph( param_ports.push_back( Printf("<%s> %s", InstructionId(param).c_str(), label.c_str())); } - StrAppend(&graph_body, param_node_name, - " [shape=record,style=filled,fillcolor=\"lightblue1\",", - "label=\"{parameters | {", Join(param_ports, "|"), "}}\"];\n"); + // (If we wanted the word "parameters" to be bold like the other op names, + // we'd have to make this into an HTML-like table. It is possible but + // complicated; see http://www.graphviz.org/doc/info/shapes.html#html.) + StrAppend(&graph_body, param_node_name, " [shape=record ", + NodeColorAttributes(kOrange), "label=\"{parameters | {", + Join(param_ports, "|"), "}}\"];\n"); } for (auto& instruction : instructions) { - string color = "peachpuff"; - string shape = "ellipse"; - string name = HloOpcodeString(instruction->opcode()); - if (HloOpcode::kFusion == instruction->opcode()) { - name += ": " + FusionKindString(instruction->fusion_kind()); - } + ColorScheme color = kYellow; + string shape = "box"; + string name = + StrCat("", HtmlLikeStringSanitize(instruction->ExtendedOpcodeStr()), + " ", HtmlLikeStringSanitize(instruction->name())); if (HloOpcode::kConvolution == instruction->opcode()) { - name += ":\\n" + ConvolutionDimensionNumbersToString( - instruction->convolution_dimension_numbers()) + - WindowToString(instruction->window()); + StrAppend( + &name, "
", + HtmlLikeStringSanitize( + instruction->ConvolutionDimensionNumbersToString()), + "
", + HtmlLikeStringSanitize(window_util::ToString(instruction->window()))); + } + + if (!instruction->metadata().op_name().empty()) { + StrAppend(&name, "
", + HtmlLikeStringSanitize(instruction->metadata().op_name())); + } + if (!instruction->metadata().source_file().empty() && + instruction->metadata().source_line() != 0) { + StrAppend(&name, "
", instruction->metadata().source_file(), ":", + instruction->metadata().source_line()); } - name += "\\n" + instruction->name(); - std::vector called_computations; // Pick different colors or shapes for instructions which are particularly // expensive (eg, dot) and those which are unusual in some way or unique @@ -191,17 +213,15 @@ string InstructionSequenceGraph( case HloOpcode::kAdd: case HloOpcode::kCeil: case HloOpcode::kClamp: - case HloOpcode::kConcatenate: case HloOpcode::kConvert: case HloOpcode::kDivide: - case HloOpcode::kDynamicSlice: - case HloOpcode::kDynamicUpdateSlice: case HloOpcode::kEq: case HloOpcode::kExp: case HloOpcode::kFloor: case HloOpcode::kGe: case HloOpcode::kGt: case HloOpcode::kIndex: + case HloOpcode::kIsFinite: case HloOpcode::kLe: case HloOpcode::kLog: case HloOpcode::kLogicalAnd: @@ -213,64 +233,49 @@ string InstructionSequenceGraph( case HloOpcode::kMultiply: case HloOpcode::kNe: case HloOpcode::kNegate: - case HloOpcode::kPad: case HloOpcode::kPower: case HloOpcode::kRemainder: - case HloOpcode::kReshape: - case HloOpcode::kReverse: case HloOpcode::kSelect: case HloOpcode::kSign: case HloOpcode::kSlice: case HloOpcode::kSort: case HloOpcode::kSubtract: case HloOpcode::kTanh: - case HloOpcode::kTuple: - case HloOpcode::kUpdate: - break; - - case HloOpcode::kBroadcast: - case HloOpcode::kTranspose: - StrAppend(&name, "\\n", "dims={", Join(instruction->dimensions(), ","), - "}"); - break; - case HloOpcode::kGetTupleElement: - StrAppend(&name, "\\nindex=", instruction->tuple_index()); break; case HloOpcode::kRng: - StrAppend(&name, "\\n", + StrAppend(&name, "
", RandomDistribution_Name(instruction->random_distribution())); break; - case HloOpcode::kConstant: - shape = "boxed"; - color = "palegreen"; - if (ShapeUtil::IsScalar(instruction->shape())) { - StrAppend(&name, "\\n", "value=", LiteralUtil::GetAsString( - instruction->literal(), {})); - } + case HloOpcode::kBroadcast: + case HloOpcode::kTranspose: + StrAppend(&name, "
", "dims={", + Join(instruction->dimensions(), ","), "}"); break; case HloOpcode::kBitcast: + case HloOpcode::kTuple: + case HloOpcode::kTrace: + color = kWhite; + break; + case HloOpcode::kGetTupleElement: + color = kWhite; + StrAppend(&name, "
index=", instruction->tuple_index()); + break; + case HloOpcode::kConcatenate: case HloOpcode::kCopy: - color = "white"; + case HloOpcode::kDynamicSlice: + case HloOpcode::kDynamicUpdateSlice: + case HloOpcode::kPad: + case HloOpcode::kReshape: + case HloOpcode::kReverse: + case HloOpcode::kUpdate: + color = kGreen; break; - case HloOpcode::kCall: - color = "tomato"; - break; - case HloOpcode::kCustomCall: - color = "tomato4"; - StrAppend(&name, "\\n", - "custom_call_target=", instruction->custom_call_target()); + case HloOpcode::kConstant: + color = kBlue; break; + case HloOpcode::kConvolution: case HloOpcode::kDot: - color = "slateblue"; - break; - case HloOpcode::kSend: - color = "purple"; - break; - case HloOpcode::kRecv: - color = "orange"; - break; - case HloOpcode::kMap: - color = "palevioletred"; + color = kDarkBlue; break; case HloOpcode::kParameter: // A single record node is created for all the parameter nodes with a @@ -279,38 +284,54 @@ string InstructionSequenceGraph( continue; case HloOpcode::kReduce: StrAppend(&name, " dims=", Join(instruction->dimensions(), ",")); - color = "lightsalmon"; + color = kPurple; break; case HloOpcode::kSelectAndScatter: case HloOpcode::kReduceWindow: - color = "lightsalmon"; - break; - case HloOpcode::kTrace: - color = "white"; + color = kPurple; break; case HloOpcode::kWhile: - color = "forestgreen"; + shape = "ellipse"; + color = kDarkGreen; break; + case HloOpcode::kMap: case HloOpcode::kFusion: - color = "gray"; - break; - case HloOpcode::kConvolution: - color = "red"; - break; - case HloOpcode::kCrossReplicaSum: - color = "turquoise"; + color = kGray; break; + case HloOpcode::kSend: + case HloOpcode::kRecv: case HloOpcode::kInfeed: case HloOpcode::kOutfeed: - color = "blue"; + case HloOpcode::kCrossReplicaSum: + color = kBrown; + break; + case HloOpcode::kCall: + color = kDarkGreen; + break; + case HloOpcode::kCustomCall: + color = kDarkGreen; + StrAppend(&name, "
", + "custom_call_target=", instruction->custom_call_target()); break; } // Create instruction node with appropriate label, shape, and color. + // label is interpreted as an HTML-like string, so newlines must be + // delimited with
, rather than \n. string label = - StrCat(name, "\\n", ShapeUtil::HumanString(instruction->shape())); + StrCat(name, "<br/>
", ShapeUtil::HumanString(instruction->shape())); + + if (instruction->opcode() == HloOpcode::kConstant && + ShapeUtil::IsEffectiveScalar(instruction->shape())) { + auto elem_idx = IndexUtil::LinearIndexToMultidimensionalIndex( + instruction->shape(), /*linear_index=*/0); + StrAppend(&label, " = {", + LiteralUtil::GetAsString(instruction->literal(), elem_idx), + "}"); + } + if (show_addresses) { - Appendf(&label, "\\n[%p]", instruction.get()); + Appendf(&label, "
[%p]", instruction.get()); } if (show_layouts && LayoutUtil::HasLayout(instruction->shape())) { string layout_string; @@ -322,24 +343,24 @@ string InstructionSequenceGraph( layout_string = Join(instruction->shape().layout().minor_to_major(), ","); } - StrAppend(&label, "\\nlayout={", layout_string, "}"); + StrAppend(&label, "
layout={", layout_string, "}"); } if (hlo_execution_profile != nullptr) { auto hlo_cycles_executed = hlo_execution_profile->GetProfileResult(*instruction); auto total_cycles_executed = - hlo_execution_profile->total_cycles_executed(); + hlo_execution_profile->total_cycles_executed(*instruction->parent()); if (hlo_cycles_executed > 0 && total_cycles_executed > 0) { - Appendf(&label, "\\n%% of cycles executed=%.2f", + Appendf(&label, "
%% of cycles executed=%.2f", (static_cast(hlo_cycles_executed) / static_cast(total_cycles_executed)) * 100); } } - Appendf(&graph_body, - "%s [label=\"%s\", shape=%s, style=filled, fillcolor=%s];\n", + + Appendf(&graph_body, "%s [label=<%s>, shape=%s, %s];\n", InstructionId(instruction.get()).c_str(), label.c_str(), - shape.c_str(), color.c_str()); + shape.c_str(), NodeColorAttributes(color).c_str()); // Create edges from the instruction's operands to the instruction. int64 operand_number = 0; @@ -369,7 +390,7 @@ string InstructionSequenceGraph( StrCat("cluster_", InstructionId(instruction.get())); StrAppend(&graph_body, "subgraph ", cluster_name, " {\n"); StrAppend(&graph_body, - "label=\"fused expression\";\nstyle=filled;\n" + "label=<fused expression>;\nstyle=\"rounded,filled\";\n" "color=lightgrey;\n"); StrAppend(&graph_body, InstructionSequenceGraph( instruction->fused_instructions(), @@ -385,7 +406,8 @@ string InstructionSequenceGraph( } else { // Add a dotted edge between the instruction and any computations that the // instruction calls. - for (auto* computation : instruction->MakeCalledComputationsSet()) { + for (const HloComputation* computation : + instruction->called_computations()) { string cluster_name = StrCat("cluster_", ComputationId(computation)); string call_edge = Printf( "%s -> %s [ style=dashed; ltail=%s ];\n", @@ -398,19 +420,39 @@ string InstructionSequenceGraph( return graph_body; } +// DOT graphs accept a stylesheet as a URL. So naturally, an inline stylesheet +// is a data URI! +// +// We don't perform any escaping on this string, so be careful not to use double +// quotes inside. +static const char* dot_stylesheet = R"( +data:text/css, +@import url(https://fonts.googleapis.com/css?family=Roboto:400,700); +svg text { + font-family: 'Roboto'; + font-size: 12px; +} +)"; + string ComputationToDotGraph(const HloComputation& computation, const string& label, bool show_addresses, bool show_layouts, const HloExecutionProfile* hlo_execution_profile) { - string graph_label = StrCat(label, "\\n", computation.name()); + string graph_label = StrCat(label, "
", computation.name()); if (hlo_execution_profile != nullptr) { - auto cycles = hlo_execution_profile->total_cycles_executed(); - Appendf(&graph_label, "\\ntotal cycles = %lld (%s)", cycles, + auto cycles = hlo_execution_profile->total_cycles_executed(computation); + Appendf(&graph_label, "
total cycles = %lld (%s)", cycles, tensorflow::strings::HumanReadableNum(cycles).c_str()); } - string graph = - Printf("digraph G {\nrankdir=TB;\ncompound=true;\nlabel=\"%s\"\n", - graph_label.c_str()); + string graph = Printf( + R"(digraph G { +rankdir=TB; +compound=true; +label=<%s>; +labelloc=t; +stylesheet="%s" +)", + graph_label.c_str(), dot_stylesheet); // Emit embedded computations as subgraph clusters. std::vector intercomputation_edges; @@ -418,7 +460,9 @@ string ComputationToDotGraph(const HloComputation& computation, string graph_body = InstructionSequenceGraph( embedded->instructions(), show_addresses, show_layouts, &intercomputation_edges, hlo_execution_profile); - Appendf(&graph, "subgraph cluster_%s {\nlabel=\"%s\";\n%s}\n", + Appendf(&graph, + "subgraph cluster_%s " + "{\nstyle=rounded;label=<%s>;labelloc=t;\n%s}\n", ComputationId(embedded).c_str(), embedded->name().c_str(), graph_body.c_str()); } @@ -464,14 +508,34 @@ namespace { class FileGraphRenderer : public GraphRendererInterface { public: - string RenderGraph(const string& graph) override { + string RenderGraph(const string& graph, GraphKind graph_kind) override { static std::atomic output_num(0); legacy_flags::HloGraphDumperFlags* flags = legacy_flags::GetHloGraphDumperFlags(); - string path = StrCat(flags->xla_hlo_dump_graph_path, "hlo_graph_", - output_num++, ".dot"); - tensorflow::Status status = - tensorflow::WriteStringToFile(tensorflow::Env::Default(), path, graph); + string file_extension; + switch (graph_kind) { + case DOT_GRAPH: + file_extension = ".dot"; + break; + case TF_GRAPHDEF: + file_extension = ".pbtxt"; + break; + } + string path = + JoinPath(flags->xla_hlo_dump_graph_path, + StrCat("hlo_graph_", output_num++, ".XXXXXX", file_extension)); + auto status = Status::OK(); + int fd = mkstemps(&path[0], file_extension.length()); + if (fd < 0) { + status = + Status(tensorflow::error::Code::UNKNOWN, + StrCat("Failed to create temporary file to dump HLO graph: ", + strerror(errno))); + } else { + status = tensorflow::WriteStringToFile(tensorflow::Env::Default(), path, + graph); + close(fd); + } if (!status.ok()) { LOG(WARNING) << "Saving HLO graph failed: " << status; } @@ -486,10 +550,26 @@ XLA_REGISTER_GRAPH_RENDERER(FileGraphRenderer, 0); string DumpGraph(const HloComputation& computation, const string& label, bool show_addresses, bool show_layouts, const HloExecutionProfile* hlo_execution_profile) { - string graph = ComputationToDotGraph(computation, label, show_addresses, - show_layouts, hlo_execution_profile); - - string graph_url = GetGraphRenderer()->RenderGraph(graph); + string graph; + string graph_url; + legacy_flags::HloGraphDumperFlags* flags = + legacy_flags::GetHloGraphDumperFlags(); + if (flags->xla_hlo_dump_as_graphdef) { + HloTfGraphBuilder builder; + TF_CHECK_OK(builder.AddComputation(computation)); + CHECK(tensorflow::protobuf::TextFormat::PrintToString(builder.GetGraphDef(), + &graph)); + // TODO(b/37198616): Use the default registered renderers when all + // renderers support rendering GraphDefs. Always dump GraphDefs to files + // for now. 
+ graph_url = FileGraphRenderer().RenderGraph( + graph, GraphRendererInterface::TF_GRAPHDEF); + } else { + graph = ComputationToDotGraph(computation, label, show_addresses, + show_layouts, hlo_execution_profile); + graph_url = GetGraphRenderer()->RenderGraph( + graph, GraphRendererInterface::DOT_GRAPH); + } LOG(INFO) << "computation " << computation.name() << " [" << label << "]: " << graph_url; return graph_url; diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.h b/tensorflow/compiler/xla/service/hlo_graph_dumper.h index 5f841da1f35..8ed50c38473 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper.h +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.h @@ -25,8 +25,25 @@ limitations under the License. namespace xla { namespace hlo_graph_dumper { -// Dumps a graph of the computation to the GraphViz server and returns -// a description of the rendered graph (e.g., a URL). +// Abstract interface for classes that render HLO graphs (e.g. DOT graph, +// tensorflow GraphDef). +class GraphRendererInterface { + public: + enum GraphKind { + DOT_GRAPH, + TF_GRAPHDEF, + }; + + virtual ~GraphRendererInterface() = default; + + // Renders a DOT graph, returning a description of the rendered output + // (e.g., a URL) + virtual string RenderGraph(const string& graph, GraphKind graph_kind) = 0; +}; + +// Dumps a graph of the computation and returns a description of the rendered +// graph (e.g., a URL) based on the renderer. The "best" renderer in the +// registry is used. string DumpGraph(const HloComputation& computation, const string& label, bool show_addresses, bool show_layouts, const HloExecutionProfile* hlo_execution_profile = nullptr); @@ -40,16 +57,6 @@ string DumpGraph(const HloComputation& computation, const string& label, void DumpText(const HloModule& module, const string& label, const string& directory_path, bool do_prefix = true); -// Abstract interface for classes that render DOT graphs. -class GraphRendererInterface { - public: - virtual ~GraphRendererInterface() = default; - - // Renders a DOT graph, returning a description of the rendered output - // (e.g., a URL) - virtual string RenderGraph(const string& graph) = 0; -}; - // Graph renderers may be added using a registration mechanism, e.g.: // XLA_REGISTER_GRAPH_RENDERER(AGraphRendererClass, 100) // The renderer with the highest numeric priority value is used. diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index cd67757bb2c..ea813c98743 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -17,17 +17,19 @@ limitations under the License. #include #include +#include #include #include #include #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/protobuf_util.h" #include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/name_uniquer.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" @@ -42,6 +44,11 @@ limitations under the License. 
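The XLA_REGISTER_GRAPH_RENDERER mechanism referenced above picks the registered renderer with the highest numeric priority. A simplified sketch of that pattern (all names here are hypothetical, not the actual XLA registry):

```c++
#include <map>
#include <memory>
#include <string>

class Renderer {
 public:
  virtual ~Renderer() = default;
  virtual std::string Render(const std::string& graph) = 0;
};

// Keyed by priority; ownership lives in the registry.
std::map<int, std::unique_ptr<Renderer>>& Registry() {
  static auto* registry = new std::map<int, std::unique_ptr<Renderer>>();
  return *registry;
}

bool RegisterRenderer(int priority, std::unique_ptr<Renderer> renderer) {
  Registry()[priority] = std::move(renderer);
  return true;  // the bool return lets registration run at static-init time
}

Renderer* BestRenderer() {
  auto& registry = Registry();
  // std::map iterates keys in ascending order, so rbegin() holds the
  // highest-priority renderer.
  return registry.empty() ? nullptr : registry.rbegin()->second.get();
}
```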
namespace xla { +using ::tensorflow::str_util::Join; +using ::tensorflow::strings::Printf; +using ::tensorflow::strings::StrAppend; +using ::tensorflow::strings::StrCat; + /* static */ std::unique_ptr HloInstruction::CreateParameter( int64 parameter_number, const Shape& shape, const string& name) { auto instruction = @@ -58,7 +65,7 @@ namespace xla { WrapUnique(new HloInstruction(HloOpcode::kTrace, ShapeUtil::MakeNil())); instruction->operands_.push_back(operand); instruction->literal_.reset(new Literal); - *instruction->literal_->mutable_u8s() += tag; + instruction->literal_->append_u8s(tag); return instruction; } @@ -117,6 +124,7 @@ HloInstruction::CreateGetTupleElement(const Shape& shape, case HloOpcode::kCopy: case HloOpcode::kExp: case HloOpcode::kFloor: + case HloOpcode::kIsFinite: case HloOpcode::kLog: case HloOpcode::kLogicalNot: case HloOpcode::kNegate: @@ -194,7 +202,7 @@ HloInstruction::CreateGetTupleElement(const Shape& shape, for (auto operand : operands) { instruction->AppendOperand(operand); } - instruction->to_apply_ = map_computation; + instruction->called_computations_.push_back(map_computation); return instruction; } @@ -205,10 +213,10 @@ HloInstruction::CreateGetTupleElement(const Shape& shape, auto instruction = WrapUnique(new HloInstruction(HloOpcode::kConvolution, shape)); if (window_util::HasBaseDilation(window)) { - instruction->set_name(instruction->name() + "-base-dilated"); + instruction->name_ = instruction->name() + "-base-dilated"; } if (window_util::HasWindowDilation(window)) { - instruction->set_name(instruction->name() + "-window-dilated"); + instruction->name_ = instruction->name() + "-window-dilated"; } instruction->AppendOperand(lhs); instruction->AppendOperand(rhs); @@ -235,11 +243,13 @@ HloInstruction::CreateCrossReplicaSum(const Shape& shape, } /* static */ std::unique_ptr HloInstruction::CreateOutfeed( - HloInstruction* operand, tensorflow::StringPiece outfeed_config) { + const Shape& shape, HloInstruction* operand, + tensorflow::StringPiece outfeed_config) { std::unique_ptr instruction = WrapUnique(new HloInstruction(HloOpcode::kOutfeed, ShapeUtil::MakeNil())); instruction->AppendOperand(operand); instruction->outfeed_config_ = outfeed_config.ToString(); + instruction->outfeed_shape_ = shape; return instruction; } @@ -273,19 +283,22 @@ HloInstruction::CreateCrossReplicaSum(const Shape& shape, HloInstruction* init) { auto instruction = WrapUnique(new HloInstruction(HloOpcode::kWhile, shape)); instruction->AppendOperand(init); - instruction->condition_ = condition; - instruction->body_ = body; + // Body comes before condition computation in the vector. 
+ instruction->called_computations_.push_back(body); + instruction->called_computations_.push_back(condition); return instruction; } /* static */ std::unique_ptr HloInstruction::CreateSlice( const Shape& shape, HloInstruction* operand, tensorflow::gtl::ArraySlice start_indices, - tensorflow::gtl::ArraySlice limit_indices) { + tensorflow::gtl::ArraySlice limit_indices, + tensorflow::gtl::ArraySlice strides) { auto instruction = WrapUnique(new HloInstruction(HloOpcode::kSlice, shape)); instruction->AppendOperand(operand); instruction->slice_starts_.assign(start_indices.begin(), start_indices.end()); instruction->slice_limits_.assign(limit_indices.begin(), limit_indices.end()); + instruction->slice_strides_.assign(strides.begin(), strides.end()); return instruction; } @@ -342,7 +355,7 @@ HloInstruction::CreateDynamicUpdateSlice(const Shape& shape, instruction->AppendOperand(init_value); instruction->dimensions_.assign(dimensions_to_reduce.begin(), dimensions_to_reduce.end()); - instruction->to_apply_ = reduce_computation; + instruction->called_computations_.push_back(reduce_computation); return instruction; } @@ -353,7 +366,7 @@ HloInstruction::CreateDynamicUpdateSlice(const Shape& shape, WrapUnique(new HloInstruction(HloOpcode::kReduceWindow, shape)); instruction->AppendOperand(operand); instruction->AppendOperand(init_value); - instruction->to_apply_ = reduce_computation; + instruction->called_computations_.push_back(reduce_computation); instruction->window_ = MakeUnique(window); return instruction; } @@ -368,8 +381,9 @@ HloInstruction::CreateSelectAndScatter( instruction->AppendOperand(operand); instruction->AppendOperand(source); instruction->AppendOperand(init_value); - instruction->select_ = select; - instruction->scatter_ = scatter; + // Select comes before scatter in the vector. + instruction->called_computations_.push_back(select); + instruction->called_computations_.push_back(scatter); instruction->window_ = MakeUnique(window); return instruction; } @@ -398,7 +412,9 @@ HloInstruction::CreateSelectAndScatter( /* static */ std::unique_ptr HloInstruction::CreateReshape( const Shape& shape, HloInstruction* operand) { CHECK_EQ(ShapeUtil::ElementsIn(shape), - ShapeUtil::ElementsIn(operand->shape())); + ShapeUtil::ElementsIn(operand->shape())) + << "shape: " << ShapeUtil::HumanString(shape) + << " operand: " << ShapeUtil::HumanString(operand->shape()); auto instruction = WrapUnique(new HloInstruction(HloOpcode::kReshape, shape)); instruction->AppendOperand(operand); return instruction; @@ -423,6 +439,8 @@ HloInstruction::CreateSelectAndScatter( const Shape& shape, FusionKind fusion_kind, HloInstruction* fused_root) { auto instruction = WrapUnique(new HloInstruction(HloOpcode::kFusion, shape)); instruction->fusion_kind_ = fusion_kind; + instruction->set_parent(fused_root->parent()); + instruction->set_metadata(fused_root->metadata()); instruction->CloneAndFuseInternal(fused_root); instruction->CheckFusionInstruction(); return instruction; @@ -477,7 +495,7 @@ HloInstruction* HloInstruction::FuseInstruction( CHECK_EQ(opcode_, HloOpcode::kFusion); // This fusion instruction must be a user of instruction_to_fuse. 
- CHECK_NE(0, instruction_to_fuse->users().count(this)); + CHECK(IsUserOf(instruction_to_fuse)); HloInstruction* fused_instruction = CloneAndFuseInternal(instruction_to_fuse); CheckFusionInstruction(); return fused_instruction; @@ -488,14 +506,20 @@ HloInstruction* HloInstruction::CloneAndFuseInternal( CHECK_EQ(opcode_, HloOpcode::kFusion); CHECK(instruction_to_fuse->IsFusable()); - bool new_fusion_instruction = fused_instructions_.empty(); - fused_instructions_.emplace_back(instruction_to_fuse->Clone()); - HloInstruction* clone = fused_instructions_.back().get(); - clone->parent_fusion_instruction_ = this; - - if (new_fusion_instruction) { - fused_root_ = clone; + HloInstruction* clone = nullptr; + if (fused_instructions_computation_ == nullptr) { + // New fusion instruction. + auto builder = HloComputation::Builder("fused_computation", true); + builder.AddInstruction(instruction_to_fuse->Clone(/*suffix=*/"")); + fused_instructions_computation_ = builder.Build(); + clone = fused_expression_root(); + clone->parent_fusion_instruction_ = this; } else { + CHECK(fused_instructions_computation_ != nullptr && + fused_instructions_computation_->IsFusionComputation()); + clone = fused_instructions_computation_->AddInstruction( + instruction_to_fuse->Clone(/*suffix=*/"")); + clone->parent_fusion_instruction_ = this; // instruction_to_fuse is necessarily an operand of the fusion instruction. // After fusion this will no longer be the case. Remove the operand from the // operand list and remove its corresponding fused parameter @@ -503,6 +527,8 @@ HloInstruction* HloInstruction::CloneAndFuseInternal( // consistent with their index in the fused_parameter_ vector. CHECK(std::find(operands_.begin(), operands_.end(), instruction_to_fuse) != operands_.end()); + const std::vector& fused_parameters_ = + fused_instructions_computation_->parameter_instructions(); for (int64 operand_num = 0; operand_num < operand_count(); ++operand_num) { if (instruction_to_fuse == operands_[operand_num]) { // replace the fused parameter instruction's uses with the clone. @@ -511,22 +537,9 @@ HloInstruction* HloInstruction::CloneAndFuseInternal( // Remove the corresponding fused parameter and operand from their // respective vectors. - fused_parameters_.erase(fused_parameters_.begin() + operand_num); + TF_CHECK_OK( + fused_instructions_computation_->RemoveParameter(operand_num)); operands_.erase(operands_.begin() + operand_num); - - // Renumber fused parameter numbers to match the vector index. - while (operand_num < fused_parameters_.size()) { - fused_parameters_[operand_num]->parameter_number_ = operand_num; - operand_num++; - } - // Throw removed fused parameter instruction away. - auto inst_it = - std::find_if(fused_instructions_.begin(), fused_instructions_.end(), - [=](const std::unique_ptr& inst) { - return inst.get() == fused_parameter; - }); - CHECK(inst_it != fused_instructions_.end()); - fused_instructions_.erase(inst_it); break; } } @@ -535,6 +548,10 @@ HloInstruction* HloInstruction::CloneAndFuseInternal( instruction_to_fuse->RemoveUser(this); } + // Reread the parameters in the computation. + const std::vector& fused_parameters_ = + fused_instructions_computation_->parameter_instructions(); + // Add each operand of the clone as an operand of the fusion instruction. A // complication is that some clone operands may already be operands of the // fusion instruction. @@ -557,19 +574,30 @@ HloInstruction* HloInstruction::CloneAndFuseInternal( // instruction. 
Add it as an operand and add a corresponding fused // parameter instruction. int64 param_no = fused_parameters_.size(); + // Name the parameter after the instruction it represents in the outer + // (non-fusion) computation. Strip the leading "%" from the operand name + // to avoid a double %%. + string param_name = + StrCat(operand->name().substr(1), ".param_", param_no); std::unique_ptr param_instruction = - CreateParameter(param_no, operand->shape(), "fusion_param"); + CreateParameter(param_no, operand->shape(), param_name); param_instruction->parent_fusion_instruction_ = this; - fused_parameters_.push_back(param_instruction.get()); - fused_instructions_.push_back(std::move(param_instruction)); + fused_param = fused_instructions_computation_->AddParameter( + std::move(param_instruction)); AppendOperand(operand); - - fused_param = fused_instructions_.back().get(); } TF_CHECK_OK(clone->ReplaceOperandWith(operand_num, fused_param)); } + for (HloComputation* computation : + instruction_to_fuse->called_computations()) { + if (std::find(called_computations_.begin(), called_computations_.end(), + computation) == called_computations_.end()) { + called_computations_.push_back(computation); + } + } + return clone; } @@ -578,58 +606,27 @@ RandomDistribution HloInstruction::random_distribution() const { return distribution_; } -namespace { - -// Adds any HloComputations this instruction calls directly to the given set. -void CalledComputationsInternal( - const HloInstruction& instruction, - std::set* called_computations) { - switch (instruction.opcode()) { - case HloOpcode::kCall: - case HloOpcode::kMap: - case HloOpcode::kReduce: - case HloOpcode::kReduceWindow: - called_computations->insert(instruction.to_apply()); - break; - case HloOpcode::kSelectAndScatter: - called_computations->insert(instruction.select()); - called_computations->insert(instruction.scatter()); - break; - case HloOpcode::kWhile: - called_computations->insert(instruction.while_condition()); - called_computations->insert(instruction.while_body()); - break; - case HloOpcode::kFusion: - for (const auto& fused_instruction : instruction.fused_instructions()) { - CalledComputationsInternal(*fused_instruction, called_computations); - } - break; - default: - break; - } -} - -} // namespace - -std::set HloInstruction::MakeCalledComputationsSet() const { - std::set called_computations; - CalledComputationsInternal(*this, &called_computations); - return called_computations; -} - void HloInstruction::CheckFusionInstruction() const { CHECK_EQ(opcode_, HloOpcode::kFusion); + CHECK(fused_instructions_computation_ != nullptr && + fused_instructions_computation_->IsFusionComputation()); + const std::list>& fused_instructions_ = + fused_instructions_computation_->instructions(); // All instructions owned by this fusion instruction must be fused, and the // parent fusion instruction of the fused instructions must be 'this'. for (auto& instruction : fused_instructions_) { CHECK(instruction->IsFused()); CHECK_EQ(this, instruction->fusion_instruction()); + CHECK_EQ(fused_instructions_computation_.get(), instruction->parent()) + << instruction->ToString(); } // Fused root instruction and fused parameters must all be owned by the fusion // instruction. 
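CloneAndFuseInternal above merges the fused instruction's called computations into called_computations_ with a find-then-push_back, keeping a vector rather than a set so iteration order stays deterministic. The idiom in isolation, as a generic helper sketch:

```c++
#include <algorithm>
#include <vector>

// Appends value only if it is not already present, preserving insertion
// order; a linear search is fine for the short lists involved here.
template <typename T>
void AppendUnique(std::vector<T>& vec, const T& value) {
  if (std::find(vec.begin(), vec.end(), value) == vec.end()) {
    vec.push_back(value);
  }
}
```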
bool root_owned = false; + const std::vector& fused_parameters_ = fused_parameters(); + const HloInstruction* fused_root_ = fused_expression_root(); std::vector parameter_owned(fused_parameters_.size(), false); for (auto& instruction : fused_instructions_) { if (fused_root_ == instruction.get()) { @@ -695,7 +692,7 @@ void HloInstruction::CheckFusionInstruction() const { for (auto operand : operands) { instruction->AppendOperand(operand); } - instruction->to_apply_ = computation; + instruction->called_computations_.push_back(computation); return instruction; } @@ -722,7 +719,8 @@ void HloInstruction::CheckFusionInstruction() const { } std::unique_ptr HloInstruction::CloneWithNewOperands( - const Shape& shape, tensorflow::gtl::ArraySlice operands) { + const Shape& shape, + tensorflow::gtl::ArraySlice new_operands) { // Explicitly call the factory for the instruction type. This is more robust // in the face of code changes than copying fields explicitly. This also // properly sets the user fields of the operands. @@ -733,6 +731,7 @@ std::unique_ptr HloInstruction::CloneWithNewOperands( case HloOpcode::kCeil: case HloOpcode::kCopy: case HloOpcode::kExp: + case HloOpcode::kIsFinite: case HloOpcode::kFloor: case HloOpcode::kLog: case HloOpcode::kLogicalNot: @@ -740,8 +739,8 @@ std::unique_ptr HloInstruction::CloneWithNewOperands( case HloOpcode::kSign: case HloOpcode::kSort: case HloOpcode::kTanh: - CHECK_EQ(operands.size(), 1); - return CreateUnary(shape, opcode_, operands[0]); + CHECK_EQ(new_operands.size(), 1); + return CreateUnary(shape, opcode_, new_operands[0]); // Binary ops. case HloOpcode::kAdd: case HloOpcode::kDivide: @@ -760,93 +759,93 @@ std::unique_ptr HloInstruction::CloneWithNewOperands( case HloOpcode::kRemainder: case HloOpcode::kLogicalAnd: case HloOpcode::kLogicalOr: - CHECK_EQ(operands.size(), 2); - return CreateBinary(shape, opcode_, operands[0], operands[1]); + CHECK_EQ(new_operands.size(), 2); + return CreateBinary(shape, opcode_, new_operands[0], new_operands[1]); // Ternary ops. case HloOpcode::kClamp: case HloOpcode::kSelect: - CHECK_EQ(operands.size(), 3); - return CreateTernary(shape, opcode_, operands[0], operands[1], - operands[2]); + CHECK_EQ(new_operands.size(), 3); + return CreateTernary(shape, opcode_, new_operands[0], new_operands[1], + new_operands[2]); // Other supported ops. 
case HloOpcode::kBroadcast: - CHECK_EQ(operands.size(), 1); - return CreateBroadcast(shape, operands[0], dimensions_); + CHECK_EQ(new_operands.size(), 1); + return CreateBroadcast(shape, new_operands[0], dimensions_); case HloOpcode::kCall: - return CreateCall(shape, operands, to_apply_); + return CreateCall(shape, new_operands, to_apply()); case HloOpcode::kCustomCall: - return CreateCustomCall(shape, operands, custom_call_target_); + return CreateCustomCall(shape, new_operands, custom_call_target_); case HloOpcode::kConcatenate: - return CreateConcatenate(shape, operands, dimensions(0)); + return CreateConcatenate(shape, new_operands, dimensions(0)); case HloOpcode::kConvert: - CHECK_EQ(operands.size(), 1); - return CreateConvert(shape, operands[0]); + CHECK_EQ(new_operands.size(), 1); + return CreateConvert(shape, new_operands[0]); case HloOpcode::kConvolution: - CHECK_EQ(operands.size(), 2); - return CreateConvolve(shape, operands[0], operands[1], *window_, + CHECK_EQ(new_operands.size(), 2); + return CreateConvolve(shape, new_operands[0], new_operands[1], *window_, *convolution_dimension_numbers_); case HloOpcode::kCrossReplicaSum: - CHECK_EQ(operands.size(), 1); - return CreateCrossReplicaSum(shape, operands[0]); + CHECK_EQ(new_operands.size(), 1); + return CreateCrossReplicaSum(shape, new_operands[0]); case HloOpcode::kGetTupleElement: - CHECK_EQ(operands.size(), 1); - return CreateGetTupleElement(shape, operands[0], tuple_index()); + CHECK_EQ(new_operands.size(), 1); + return CreateGetTupleElement(shape, new_operands[0], tuple_index()); case HloOpcode::kMap: - return CreateMap(shape, operands, to_apply_); + return CreateMap(shape, new_operands, to_apply()); case HloOpcode::kPad: - CHECK_EQ(operands.size(), 2); - return CreatePad(shape, operands[0], operands[1], *padding_config_); + CHECK_EQ(new_operands.size(), 2); + return CreatePad(shape, new_operands[0], new_operands[1], + *padding_config_); case HloOpcode::kReduce: - CHECK_EQ(operands.size(), 2); - return CreateReduce(shape, operands[0], operands[1], dimensions_, - to_apply_); + CHECK_EQ(new_operands.size(), 2); + return CreateReduce(shape, new_operands[0], new_operands[1], dimensions_, + to_apply()); case HloOpcode::kReduceWindow: - CHECK_EQ(operands.size(), 2); - return CreateReduceWindow(shape, operands[0], operands[1], *window_, - to_apply_); + CHECK_EQ(new_operands.size(), 2); + return CreateReduceWindow(shape, new_operands[0], new_operands[1], + *window_, to_apply()); case HloOpcode::kSelectAndScatter: - CHECK_EQ(operands.size(), 3); - return CreateSelectAndScatter(shape, operands[0], select_, *window_, - operands[1], operands[2], scatter_); - case HloOpcode::kRecv: - CHECK_EQ(operands.size(), 0); - return CreateRecv(shape, channel_id_); + CHECK_EQ(new_operands.size(), 3); + return CreateSelectAndScatter(shape, new_operands[0], select(), *window_, + new_operands[1], new_operands[2], + scatter()); case HloOpcode::kReverse: - CHECK_EQ(operands.size(), 1); - return CreateReverse(shape, operands[0], dimensions_); + CHECK_EQ(new_operands.size(), 1); + return CreateReverse(shape, new_operands[0], dimensions_); case HloOpcode::kRng: - return CreateRng(shape, distribution_, operands); + return CreateRng(shape, distribution_, new_operands); case HloOpcode::kReshape: - CHECK_EQ(operands.size(), 1); - return CreateReshape(shape, operands[0]); - case HloOpcode::kSend: - CHECK_EQ(operands.size(), 1); - return CreateSend(operands[0], channel_id_); + CHECK_EQ(new_operands.size(), 1); + return CreateReshape(shape, new_operands[0]); case 
HloOpcode::kSlice: - CHECK_EQ(operands.size(), 1); - return CreateSlice(shape, operands[0], slice_starts_, slice_limits_); + CHECK_EQ(new_operands.size(), 1); + return CreateSlice(shape, new_operands[0], slice_starts_, slice_limits_, + slice_strides_); case HloOpcode::kDynamicSlice: - return CreateDynamicSlice(shape, operands[0], operands[1], + return CreateDynamicSlice(shape, new_operands[0], new_operands[1], dynamic_slice_sizes_); case HloOpcode::kDynamicUpdateSlice: - CHECK_EQ(operands.size(), 3); - return CreateDynamicUpdateSlice(shape, operands[0], operands[1], - operands[2]); + CHECK_EQ(new_operands.size(), 3); + return CreateDynamicUpdateSlice(shape, new_operands[0], new_operands[1], + new_operands[2]); case HloOpcode::kTranspose: - CHECK_EQ(operands.size(), 1); - return CreateTranspose(shape, operands[0], dimensions_); + CHECK_EQ(new_operands.size(), 1); + return CreateTranspose(shape, new_operands[0], dimensions_); case HloOpcode::kTuple: - return CreateTuple(operands_); + return CreateTuple(new_operands); case HloOpcode::kWhile: - CHECK_EQ(operands.size(), 1); - return CreateWhile(shape, condition_, body_, operands[0]); + CHECK_EQ(new_operands.size(), 1); + return CreateWhile(shape, while_condition(), while_body(), + new_operands[0]); case HloOpcode::kConstant: return CreateConstant(LiteralUtil::CloneToUnique(*literal_)); case HloOpcode::kFusion: - return CloneFusionWithNewOperands(shape, operands); + return CloneFusionWithNewOperands(shape, new_operands); case HloOpcode::kParameter: return CreateParameter(parameter_number_, shape, parameter_name_); // Unsupported ops for cloning. + case HloOpcode::kRecv: + case HloOpcode::kSend: case HloOpcode::kUpdate: case HloOpcode::kIndex: case HloOpcode::kInfeed: @@ -856,16 +855,55 @@ std::unique_ptr HloInstruction::CloneWithNewOperands( } } -std::unique_ptr HloInstruction::Clone() { +HloInstruction::~HloInstruction() {} + +std::unique_ptr HloInstruction::Clone(const string& suffix) { std::unique_ptr clone = CloneWithNewOperands(shape_, operands_); - clone->name_ = name() + ".clone"; + if (suffix.empty()) { + clone->name_ = name(); + } else { + // If an instruction is cloned multiple times avoid names like + // foo.suffix.suffix.suffix. Instead of repeating the suffix add a numeric + // suffix. Specifically, the clone of foo.suffix is named foo.suffix2, the + // clone of foo.suffix2 is named foo.suffix3 and so on. + const string dot_suffix = "." + suffix; + size_t index = name().rfind(dot_suffix); + if (index == string::npos) { + // Existing name does not include ".suffix". + clone->name_ = name() + dot_suffix; + } else { + // Existing name includes ".suffix". Determine if substring after + // ".suffix" is numeric and should be replaced with an incremented number. + string after_suffix = name().substr(index + dot_suffix.size()); + if (after_suffix.empty()) { + // Existing name ends in ".suffix". New name should end in ".suffix2". + clone->name_ = name() + "2"; + } else { + // If names ends with .suffix[0-9]+ then replace with a suffix with the + // numeric value incremented. + int64 numeric_suffix; + if (tensorflow::strings::safe_strto64(after_suffix, &numeric_suffix)) { + clone->name_ = + StrCat(name().substr(0, index), dot_suffix, numeric_suffix + 1); + } else { + // Substring after ".suffix" is non-numeric. 
+ clone->name_ = name() + dot_suffix; + } + } + } + } + clone->set_parent(parent()); + clone->set_metadata(metadata_); return clone; } std::unique_ptr HloInstruction::CloneFusionWithNewOperands( const Shape& shape, tensorflow::gtl::ArraySlice operands) { CHECK_EQ(opcode_, HloOpcode::kFusion); + CHECK(parent() != nullptr); + CHECK(fused_instructions_computation_ != nullptr && + fused_instructions_computation_->IsFusionComputation()); auto new_instruction = WrapUnique(new HloInstruction(HloOpcode::kFusion, shape)); @@ -879,6 +917,11 @@ std::unique_ptr HloInstruction::CloneFusionWithNewOperands( // Create the list of fused parameters by mapping through the cloned, // fused instructions. std::vector new_fused_parameters; + const std::vector& fused_parameters_ = + fused_instructions_computation_->parameter_instructions(); + const std::list>& fused_instructions_ = + fused_instructions_computation_->instructions(); + for (HloInstruction* old_fused_parameter : fused_parameters_) { new_fused_instructions.push_back(old_fused_parameter->Clone()); HloInstruction* new_fusion_parameter = new_fused_instructions.back().get(); @@ -905,16 +948,24 @@ std::unique_ptr HloInstruction::CloneFusionWithNewOperands( old_fused_instruction->CloneWithNewOperands( old_fused_instruction->shape(), new_operands)); HloInstruction* new_fused_instruction = new_fused_instructions.back().get(); + new_fused_instruction->set_parent(parent()); new_fused_instruction->parent_fusion_instruction_ = new_instruction.get(); InsertOrDie(&old_to_new, old_fused_instruction, new_fused_instruction); } + new_instruction->fusion_kind_ = fusion_kind_; + auto computation_builder = HloComputation::Builder( + fused_instructions_computation_->name() + ".clone", true); // We iterated the fusion instructions in reverse post order which means // that we must reverse our new list of fusion instructions. 
- std::reverse(new_fused_instructions.begin(), new_fused_instructions.end()); - new_instruction->fusion_kind_ = fusion_kind_; - new_instruction->fused_instructions_ = std::move(new_fused_instructions); - new_instruction->fused_parameters_ = std::move(new_fused_parameters); - new_instruction->fused_root_ = FindOrDie(old_to_new, fused_root_); + for (auto new_fused_instruction_iter = new_fused_instructions.rbegin(); + new_fused_instruction_iter != new_fused_instructions.rend(); + ++new_fused_instruction_iter) { + computation_builder.AddInstruction(std::move(*new_fused_instruction_iter)); + } + auto fused_root_ = fused_expression_root(); + new_instruction->fused_instructions_computation_ = + computation_builder.Build(FindOrDie(old_to_new, fused_root_)); + new_instruction->set_parent(parent()); new_instruction->CheckFusionInstruction(); return new_instruction; } @@ -969,12 +1020,43 @@ int64 HloInstruction::operand_index(const HloInstruction* target) const { LOG(FATAL) << "target was not an operand"; } +Status HloInstruction::AddControlDependencyTo(HloInstruction* instruction) { + TF_RET_CHECK(instruction->parent() == parent()); + if (std::find(control_successors_.begin(), control_successors_.end(), + instruction) == control_successors_.end()) { + control_successors_.push_back(instruction); + TF_RET_CHECK(std::find(instruction->control_predecessors_.begin(), + instruction->control_predecessors_.end(), + this) == instruction->control_predecessors_.end()); + instruction->control_predecessors_.push_back(this); + } + return Status::OK(); +} + +Status HloInstruction::RemoveControlDependencyTo(HloInstruction* instruction) { + auto succ_it = std::find(control_successors_.begin(), + control_successors_.end(), instruction); + TF_RET_CHECK(succ_it != control_successors_.end()); + control_successors_.erase(succ_it); + auto pred_it = std::find(instruction->control_predecessors_.begin(), + instruction->control_predecessors_.end(), this); + TF_RET_CHECK(pred_it != instruction->control_predecessors_.end()); + instruction->control_predecessors_.erase(pred_it); + + return Status::OK(); +} + void HloInstruction::AppendOperand(HloInstruction* operand) { operands_.push_back(operand); operand->AddUser(this); } -void HloInstruction::AddUser(HloInstruction* user) { users_.insert(user); } +void HloInstruction::AddUser(HloInstruction* user) { + if (!ContainsKey(user_set_, user)) { + user_set_.insert(user); + users_.push_back(user); + } +} bool HloInstruction::IsConstant() const { return opcode_ == HloOpcode::kConstant; @@ -989,14 +1071,6 @@ bool HloInstruction::HasConstantOperand() const { return false; } -void HloInstruction::AddControlPredecessor(HloInstruction* instruction) { - control_predecessors_.insert(instruction); -} - -void HloInstruction::AddControlSuccessor(HloInstruction* instruction) { - control_successors_.insert(instruction); -} - bool HloInstruction::Identical( const HloInstruction& other, std::function<bool(const HloInstruction*, const HloInstruction*)> @@ -1012,7 +1086,7 @@ bool HloInstruction::Identical( // general, there is no need to check shape because shape is inferred from the // shape of the operands.
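AddControlDependencyTo and RemoveControlDependencyTo above maintain a control edge on both endpoints, so removal must erase from each vector using an iterator obtained from that same vector. A toy version of that bookkeeping:

```c++
#include <algorithm>
#include <cassert>
#include <vector>

struct Node {
  std::vector<Node*> control_successors;
  std::vector<Node*> control_predecessors;
};

void AddControlEdge(Node* from, Node* to) {
  // Record the edge on both endpoints, skipping duplicates.
  if (std::find(from->control_successors.begin(),
                from->control_successors.end(),
                to) == from->control_successors.end()) {
    from->control_successors.push_back(to);
    to->control_predecessors.push_back(from);
  }
}

void RemoveControlEdge(Node* from, Node* to) {
  // Each erase uses an iterator from the vector it mutates.
  auto succ_it = std::find(from->control_successors.begin(),
                           from->control_successors.end(), to);
  assert(succ_it != from->control_successors.end());
  from->control_successors.erase(succ_it);
  auto pred_it = std::find(to->control_predecessors.begin(),
                           to->control_predecessors.end(), from);
  assert(pred_it != to->control_predecessors.end());
  to->control_predecessors.erase(pred_it);
}

int main() {
  Node a, b;
  AddControlEdge(&a, &b);
  assert(a.control_successors.size() == 1);
  assert(b.control_predecessors.size() == 1);
  RemoveControlEdge(&a, &b);
  assert(a.control_successors.empty());
  assert(b.control_predecessors.empty());
  return 0;
}
```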
if (opcode() != other.opcode() || - !ContainersEqual(operands(), other.operands(), eq_operands)) { + !ContainersEqual(operands(), other.operands(), std::move(eq_operands))) { return false; } @@ -1033,6 +1107,7 @@ bool HloInstruction::Identical( case HloOpcode::kFloor: case HloOpcode::kGe: case HloOpcode::kGt: + case HloOpcode::kIsFinite: case HloOpcode::kLe: case HloOpcode::kLog: case HloOpcode::kLogicalAnd: @@ -1156,9 +1231,14 @@ bool HloInstruction::IsRank2Transpose() const { } void HloInstruction::RemoveUser(HloInstruction* user) { - auto user_it = users_.find(user); - CHECK(user_it != users_.end()); - users_.erase(user_it); + auto set_it = user_set_.find(user); + CHECK(set_it != user_set_.end()); + user_set_.erase(set_it); + // This is linear in the number of the users, but a vector provides a stable + // iteration order and much faster traversal. + auto vec_it = std::find(users_.begin(), users_.end(), user); + CHECK(vec_it != users_.end()); + users_.erase(vec_it); } Status HloInstruction::ReplaceUseWith(HloInstruction* user, @@ -1167,15 +1247,12 @@ Status HloInstruction::ReplaceUseWith(HloInstruction* user, << "this shape: " << ShapeUtil::HumanString(shape()) << ", replacement shape: " << ShapeUtil::HumanString(new_producer->shape()); - auto user_it = std::find(users_.begin(), users_.end(), user); - TF_RET_CHECK(user_it != users_.end()) - << "Instruction " << user->name() << " not a use of instruction " - << name(); - users_.erase(user_it); VLOG(3) << "Replacing uses of " << name() << " in " << user->name() << " with " << new_producer->name(); + RemoveUser(user); + TF_RET_CHECK( std::count(user->operands_.begin(), user->operands_.end(), this) >= 0); std::replace(user->operands_.begin(), user->operands_.end(), this, @@ -1207,30 +1284,37 @@ Status HloInstruction::ReplaceOperandWith(int64 operand_num, } Status HloInstruction::ReplaceAllUsesWith(HloInstruction* new_producer) { - // We can't use range-based loop because the iterator is invalidated by call - // to ReplaceUseWith. - for (auto user = users_.begin(); user != users_.end();) { - auto this_user = user; - user++; - // It's possible that new_producer is a user of this instruction as might - // be the case when replacing an instruction with a kCopy of itself. In - // this case, don't do the replacement to avoid creating a cycle in the - // graph. - if (*this_user != new_producer) { - TF_RETURN_IF_ERROR(ReplaceUseWith(*this_user, new_producer)); + bool new_producer_is_user = false; + for (HloInstruction* user : users()) { + if (user == new_producer) { + // It's possible that new_producer is a user of this instruction as might + // be the case when replacing an instruction with a kCopy of itself. In + // this case, don't do the replacement to avoid creating a cycle in the + // graph. new_producer remains the only user of this instruction. + new_producer_is_user = true; + } else { + std::replace(user->operands_.begin(), user->operands_.end(), this, + new_producer); + new_producer->AddUser(user); } } + users_.clear(); + user_set_.clear(); + if (new_producer_is_user) { + AddUser(new_producer); + } + return Status::OK(); } void HloInstruction::DetachFromOperands() { CHECK_EQ(0, user_count()); - // An intruction may be repeated as an operand. To avoid calling RemoveUser + // An instruction may be repeated as an operand. To avoid calling RemoveUser // twice on the same operand, keep a set of already detached operands. 
std::set detached_operands; for (int64 operand_num = 0; operand_num < operand_count(); ++operand_num) { HloInstruction* operand = operands_[operand_num]; - if (detached_operands.count(operand) == 0) { + if (!ContainsKey(detached_operands, operand)) { operand->RemoveUser(this); detached_operands.insert(operand); } @@ -1244,22 +1328,29 @@ HloComputation* HloInstruction::to_apply() const { case HloOpcode::kMap: case HloOpcode::kReduceWindow: case HloOpcode::kReduce: - return to_apply_; + CHECK_EQ(called_computations_.size(), 1); + return called_computations_[0]; default: - LOG(FATAL) << "Invalid instruction for to_apply(): " << ToString(); + LOG(FATAL) << "Invalid opcode for to_apply(): " + << HloOpcodeString(opcode()); } } void HloInstruction::set_to_apply(HloComputation* computation) { + // Don't allow changing the computation for fused instructions so we don't + // have to recompute called_instructions for the entire fusion instruction. + CHECK(!IsFused()); switch (opcode_) { case HloOpcode::kCall: case HloOpcode::kMap: case HloOpcode::kReduceWindow: case HloOpcode::kReduce: - to_apply_ = computation; + CHECK_EQ(called_computations_.size(), 1); + called_computations_[0] = computation; break; default: - LOG(FATAL) << "Invalid instruction for to_apply(): " << ToString(); + LOG(FATAL) << "Invalid opcode for to_apply(): " + << HloOpcodeString(opcode()); } } @@ -1275,55 +1366,75 @@ const string& HloInstruction::outfeed_config() const { HloComputation* HloInstruction::while_condition() const { CHECK_EQ(HloOpcode::kWhile, opcode_); - return condition_; + return called_computations_[kConditionComputationIndex]; } HloComputation* HloInstruction::while_body() const { CHECK_EQ(HloOpcode::kWhile, opcode_); - return body_; + return called_computations_[kBodyComputationIndex]; } void HloInstruction::set_while_condition(HloComputation* computation) { + // Don't allow changing the computation for fused instructions so we don't + // have to recompute called_instructions for the entire fusion instruction. + CHECK(!IsFused()); CHECK_EQ(HloOpcode::kWhile, opcode_); - condition_ = computation; + called_computations_[kConditionComputationIndex] = computation; } void HloInstruction::set_while_body(HloComputation* computation) { + // Don't allow changing the computation for fused instructions so we don't + // have to recompute called_instructions for the entire fusion instruction. + CHECK(!IsFused()); CHECK_EQ(HloOpcode::kWhile, opcode_); - body_ = computation; + called_computations_[kBodyComputationIndex] = computation; } HloComputation* HloInstruction::select() const { CHECK_EQ(HloOpcode::kSelectAndScatter, opcode_); - return select_; + return called_computations_[kSelectComputationIndex]; } HloComputation* HloInstruction::scatter() const { CHECK_EQ(HloOpcode::kSelectAndScatter, opcode_); - return scatter_; + return called_computations_[kScatterComputationIndex]; } void HloInstruction::set_select(HloComputation* computation) { + // Don't allow changing the computation for fused instructions so we don't + // have to recompute called_instructions for the entire fusion instruction. + CHECK(!IsFused()); CHECK_EQ(HloOpcode::kSelectAndScatter, opcode_); - select_ = computation; + called_computations_[kSelectComputationIndex] = computation; } void HloInstruction::set_scatter(HloComputation* computation) { + // Don't allow changing the computation for fused instructions so we don't + // have to recompute called_instructions for the entire fusion instruction. 
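After this hunk, `to_apply`, the while condition and body, and the select/scatter computations are all slots in a single `called_computations_` vector rather than separate pointer fields, with per-opcode index constants choosing the slot. A small sketch of that storage scheme, using hypothetical names and `assert` in place of `CHECK_EQ`:

```cpp
#include <cassert>
#include <vector>

struct Computation {};  // Stand-in for HloComputation.

enum Opcode { kWhile, kSelectAndScatter };

// Slot indices into called_computations, matching the enum in the patch.
enum {
  kBodyIndex = 0,       // kWhile
  kConditionIndex = 1,  // kWhile
  kSelectIndex = 0,     // kSelectAndScatter
  kScatterIndex = 1,    // kSelectAndScatter
};

struct Instruction {
  Opcode opcode;
  std::vector<Computation*> called_computations;

  Computation* while_body() const {
    assert(opcode == kWhile);
    return called_computations[kBodyIndex];
  }
  Computation* while_condition() const {
    assert(opcode == kWhile);
    return called_computations[kConditionIndex];
  }
};

int main() {
  Computation body, cond;
  Instruction loop{kWhile, {&body, &cond}};  // Body in slot 0, condition in 1.
  assert(loop.while_body() == &body);
  assert(loop.while_condition() == &cond);
}
```

The single vector is what makes the generic `called_computations()` accessor and the `calls=` branch of `ToString` possible without opcode-specific plumbing.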
+ CHECK(!IsFused()); CHECK_EQ(HloOpcode::kSelectAndScatter, opcode_); - scatter_ = computation; + called_computations_[kScatterComputationIndex] = computation; } string HloInstruction::SignatureString() const { - string operands = tensorflow::str_util::Join( - operands_, ", ", [](string* out, HloInstruction* operand) { - tensorflow::strings::StrAppend( - out, ShapeUtil::HumanString(operand->shape())); + string operands = + Join(operands_, ", ", [](string* out, HloInstruction* operand) { + StrAppend(out, ShapeUtil::HumanString(operand->shape())); }); - return tensorflow::strings::StrCat("(", operands, ") -> ", - ShapeUtil::HumanString(shape())); + return StrCat("(", operands, ") -> ", ShapeUtil::HumanString(shape())); } -string HloInstruction::ToString(bool compact_operands) const { +string HloInstruction::ExtendedOpcodeStr() const { + string opc_name = HloOpcodeString(opcode()); + HloOpcode opc = opcode(); + if (HloOpcode::kFusion == opc) { + opc_name += ":" + xla::ToString(fusion_kind()); + } + return opc_name; +} + +string HloInstruction::ToString(bool compact_operands, + bool include_metadata) const { string operands; if (opcode() == HloOpcode::kConstant) { // For constants, show the actual value in place of an empty operand list. @@ -1337,120 +1448,140 @@ string HloInstruction::ToString(bool compact_operands) const { // Concatenate elements in "v" with spaces separating them, but ignoring // empty entries. for (const auto& s : v) { - if (s.empty()) continue; - tensorflow::strings::StrAppend(&operands, (first ? "" : " "), s); + if (s.empty()) { + continue; + } + StrAppend(&operands, (first ? "" : " "), s); first = false; } } else { // Do not show large constants. operands = "{...}"; } + } else if (opcode() == HloOpcode::kParameter) { + operands = Printf("%lld", parameter_number_); } else { tensorflow::gtl::ArraySlice slice(operands_); const int64 kMaxOperandsToShowIfCompact = 4; if (compact_operands && slice.size() > kMaxOperandsToShowIfCompact) { slice.remove_suffix(slice.size() - kMaxOperandsToShowIfCompact); } - operands = tensorflow::str_util::Join( - slice, ", ", [&](string* out, HloInstruction* operand) { - *out += ShapeUtil::HumanStringWithLayout(operand->shape()); - if (!compact_operands) { - tensorflow::strings::StrAppend(out, " ", operand->name()); - } - }); + operands = Join(slice, ", ", [&](string* out, HloInstruction* operand) { + *out += ShapeUtil::HumanStringWithLayout(operand->shape()); + if (!compact_operands) { + StrAppend(out, " ", operand->name()); + } + }); const int64 remaining = operands_.size() - slice.size(); if (slice.size() != operands_.size()) { - tensorflow::strings::StrAppend(&operands, ", ...(+", remaining, ")"); + StrAppend(&operands, ", ...(+", remaining, ")"); } } string extra; if (CanHaveDimensionsField()) { - tensorflow::strings::StrAppend( - &extra, ", dimensions={", tensorflow::str_util::Join(dimensions(), ","), - "}"); + StrAppend(&extra, ", dimensions={", Join(dimensions(), ","), "}"); } if (window_ != nullptr) { - tensorflow::strings::StrAppend(&extra, ", ", - window_util::ToString(*window_)); + StrAppend(&extra, ", ", window_util::ToString(*window_)); } if (padding_config_ != nullptr) { - tensorflow::strings::StrAppend( - &extra, ", padding=", padding_config_->ShortDebugString()); + StrAppend(&extra, ", padding=", padding_config_->ShortDebugString()); } if (!slice_starts_.empty() && !slice_limits_.empty()) { std::vector bounds; + bounds.reserve(slice_starts_.size()); for (int i = 0; i < slice_starts_.size(); ++i) { - 
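The compact `ToString` path above caps the printed operand list at `kMaxOperandsToShowIfCompact` entries and appends a `...(+N)` marker for the remainder. The same truncation logic in isolation, with a plain vector in place of `ArraySlice`:

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

// Prints at most max_shown operands, then a ", ...(+N)" remainder marker,
// mirroring the compact-operands branch in the hunk above.
std::string FormatOperands(const std::vector<std::string>& operands,
                           std::size_t max_shown) {
  const std::size_t shown = std::min(max_shown, operands.size());
  std::string out;
  for (std::size_t i = 0; i < shown; ++i) {
    if (i > 0) out += ", ";
    out += operands[i];
  }
  if (shown < operands.size()) {
    out += ", ...(+" + std::to_string(operands.size() - shown) + ")";
  }
  return out;
}

int main() {
  std::cout << FormatOperands({"a", "b", "c", "d", "e", "f"}, 4) << "\n";
  // Prints: a, b, c, d, ...(+2)
}
```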
bounds.push_back(tensorflow::strings::StrCat("[", slice_starts_[i], ":", - slice_limits_[i], "]")); + bounds.push_back( + StrCat("[", slice_starts_[i], ":", slice_limits_[i], "]")); } - tensorflow::strings::StrAppend( - &extra, ", slice={", tensorflow::str_util::Join(bounds, ", "), "}"); + StrAppend(&extra, ", slice={", Join(bounds, ", "), "}"); } + if (convolution_dimension_numbers_ != nullptr) { - const auto& dnums = *convolution_dimension_numbers_; - - // Show the given dimension labels in order of major to minor based on the - // shape's layout. - const auto append_dims = [&](const std::vector& dims, - const Shape& shape) { - CHECK_EQ(dims.size(), ShapeUtil::Rank(shape)); - for (int64 logical = 0; logical < dims.size(); ++logical) { - int64 physical = logical; - if (!shape.layout().minor_to_major().empty()) { - physical = LayoutUtil::Major(shape.layout(), logical); - } - extra += dims[physical]; - } - }; - - // lhs_dims[i] is the symbol of the logical dimension i for the lhs - // operand. E.g. if batch has dimension number 2, then lhs_dims[2] == "b". - std::vector lhs_dims(2 + dnums.spatial_dimensions().size()); - lhs_dims[dnums.batch_dimension()] = 'b'; - lhs_dims[dnums.feature_dimension()] = 'f'; - for (int64 i = 0; i < dnums.spatial_dimensions().size(); ++i) { - lhs_dims[dnums.spatial_dimensions(i)] = tensorflow::strings::StrCat(i); - } - - std::vector rhs_dims(2 + dnums.kernel_spatial_dimensions().size()); - rhs_dims[dnums.kernel_input_feature_dimension()] = "i"; - rhs_dims[dnums.kernel_output_feature_dimension()] = "o"; - for (int64 i = 0; i < dnums.spatial_dimensions().size(); ++i) { - rhs_dims[dnums.kernel_spatial_dimensions(i)] = - tensorflow::strings::StrCat(i); - } - - extra += " dims: "; - append_dims(lhs_dims, operands_.at(0)->shape()); - extra += "_"; - append_dims(rhs_dims, operands_.at(1)->shape()); - extra += "->"; - append_dims(lhs_dims, shape()); - } - if (to_apply_ != nullptr) { - tensorflow::strings::StrAppend(&extra, ", computation=", to_apply_->name()); + StrAppend(&extra, ", ", ConvolutionDimensionNumbersToString()); } + if (opcode() == HloOpcode::kWhile) { - tensorflow::strings::StrAppend(&extra, - ", condition=", while_condition()->name()); - tensorflow::strings::StrAppend(&extra, ", body=", while_body()->name()); + StrAppend(&extra, ", condition=", while_condition()->name()); + StrAppend(&extra, ", body=", while_body()->name()); + } else if (opcode() == HloOpcode::kSelectAndScatter) { + StrAppend(&extra, ", select=", select()->name()); + StrAppend(&extra, ", scatter=", scatter()->name()); + } else if (!called_computations().empty()) { + StrAppend(&extra, ", calls=", + Join(called_computations(), ", ", + [](string* out, const HloComputation* computation) { + StrAppend(out, computation->name()); + })); } + if (opcode() == HloOpcode::kGetTupleElement) { - tensorflow::strings::StrAppend(&extra, ", index=", tuple_index()); + StrAppend(&extra, ", index=", tuple_index()); } - return tensorflow::strings::Printf( - "%s = %s %s(%s)%s", name().c_str(), - ShapeUtil::HumanStringWithLayout(shape()).c_str(), - HloOpcodeString(opcode()).c_str(), operands.c_str(), extra.c_str()); + if (include_metadata && + (!metadata_.op_type().empty() || !metadata_.op_name().empty() || + !metadata_.source_file().empty())) { + StrAppend(&extra, " # metadata=", metadata_.ShortDebugString()); + } + + return Printf("%s = %s %s(%s)%s", name().c_str(), + ShapeUtil::HumanStringWithLayout(shape()).c_str(), + ExtendedOpcodeStr().c_str(), operands.c_str(), extra.c_str()); } string 
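The slice branch of `ToString` renders the parallel `slice_starts_`/`slice_limits_` arrays as `slice={[s:l], ...}`. A standalone approximation using only the standard library (the real code builds the string with `StrCat` and `Join` from `tensorflow::strings`):

```cpp
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

// Renders parallel start/limit arrays as "slice={[s0:l0], [s1:l1], ...}",
// like the slice branch of ToString in the hunk above.
std::string SliceBoundsString(const std::vector<long>& starts,
                              const std::vector<long>& limits) {
  std::string out = "slice={";
  for (std::size_t i = 0; i < starts.size(); ++i) {
    if (i > 0) out += ", ";
    out += "[" + std::to_string(starts[i]) + ":" +
           std::to_string(limits[i]) + "]";
  }
  return out + "}";
}

int main() {
  std::cout << SliceBoundsString({0, 4}, {2, 8}) << "\n";
  // Prints: slice={[0:2], [4:8]}
}
```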
HloInstruction::ToShortString() const { - return tensorflow::strings::Printf( - "%s = %s(%s)", name().c_str(), HloOpcodeString(opcode()).c_str(), - tensorflow::str_util::Join(operands_, ", ", [](string* out, - HloInstruction* operand) { - tensorflow::strings::StrAppend(out, operand->name()); - }).c_str()); + return Printf("%s = %s(%s)", name().c_str(), + HloOpcodeString(opcode()).c_str(), + Join(operands_, ", ", + [](string* out, HloInstruction* operand) { + StrAppend(out, operand->name()); + }) + .c_str()); +} + +HloInstructionProto HloInstruction::ToProto() const { + HloInstructionProto proto; + proto.set_name(name_); + proto.set_opcode(HloOpcodeString(opcode_)); + *proto.mutable_shape() = shape_; + for (const HloInstruction* operand : operands_) { + *proto.add_operand_names() = operand->name(); + } + for (const HloInstruction* control : control_predecessors_) { + *proto.add_control_predecessor_names() = control->name(); + } + for (const HloComputation* computation : called_computations_) { + *proto.add_called_computation_names() = computation->name(); + } + *proto.mutable_metadata() = metadata_; + switch (opcode_) { + case HloOpcode::kConstant: + *proto.mutable_literal() = literal_->ToProto(); + break; + case HloOpcode::kParameter: + proto.set_parameter_number(parameter_number_); + proto.set_parameter_name(parameter_name_); + break; + case HloOpcode::kFusion: { + HloComputationProto* proto_fused_computation = + proto.mutable_fused_instructions_computation(); + proto_fused_computation->set_name(FullyQualifiedName()); + + // Fill in fused instructions. Note that fused_instructions() returns in + // reverse post-order (i.e. root first), so we reverse to get post-order. + for (auto fused_it = fused_instructions().rbegin(); + fused_it != fused_instructions().rend(); ++fused_it) { + HloInstructionProto fused_proto = (*fused_it)->ToProto(); + proto_fused_computation->add_instructions()->Swap(&fused_proto); + } + break; + } + case HloOpcode::kGetTupleElement: + proto.set_tuple_index(tuple_index_); + break; + default: {} // Nothing to do + } + return proto; } string HloInstruction::ToCategory() const { @@ -1482,10 +1613,22 @@ string HloInstruction::ToCategory() const { return "rank-1-broadcast binary fusion"; } } - if (IsElementwise()) { - return "elementwise fusion"; - } else { - return "non-elementwise fusion"; + switch (fusion_kind()) { + case FusionKind::kLoop: + if (IsElementwise()) { + return "elementwise fusion"; + } else { + return "non-elementwise fusion"; + } + case FusionKind::kInput: + return "input fusion"; + case FusionKind::kOutput: + return "output fusion"; + case FusionKind::kTransposeDot: + return "dot fusion"; + case FusionKind::kConvBackwardFilter: + case FusionKind::kConvBackwardInput: + return "convolution fusion"; } } @@ -1496,16 +1639,24 @@ string HloInstruction::ToCategory() const { return HloOpcodeString(opcode()); } +string HloInstruction::FullyQualifiedName() const { + if (IsFused()) { + return StrCat(fusion_instruction()->parent()->name(), + "::", fusion_instruction()->name(), "::", name_); + } + return StrCat(parent_->name(), "::", name_); +} + HloInstruction* HloInstruction::tracing() const { return trace_instruction_; } void HloInstruction::set_tracing(HloInstruction* trace_instruction) { trace_instruction_ = trace_instruction; } -const string& HloInstruction::tracing_tag() const { +string HloInstruction::TracingTag() const { CHECK_EQ(HloOpcode::kTrace, opcode()); CHECK(literal_ != nullptr); - return literal_->u8s(); + return literal_->u8s_string(); } bool 
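`ToProto` above iterates the fused instructions with `rbegin()/rend()` because the list is stored root-first (reverse post-order) while the serialized form wants producers before consumers (post-order). The inversion in miniature:

```cpp
#include <iostream>
#include <list>
#include <string>

int main() {
  // Stored root-first, i.e. reverse post-order, like fused_instructions().
  std::list<std::string> reverse_postorder = {"add", "exp", "param0"};

  // Walking it backwards yields post-order: operands before their users,
  // which is the order ToProto serializes the fused computation in.
  for (auto it = reverse_postorder.rbegin(); it != reverse_postorder.rend();
       ++it) {
    std::cout << *it << "\n";  // param0, then exp, then add.
  }
}
```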
HloInstruction::IsFused() const { @@ -1520,7 +1671,6 @@ bool HloInstruction::IsFusable() const { // Some kinds of instructions don't make sense to fuse. switch (opcode_) { - case HloOpcode::kFusion: case HloOpcode::kInfeed: case HloOpcode::kOutfeed: case HloOpcode::kParameter: @@ -1528,11 +1678,20 @@ bool HloInstruction::IsFusable() const { case HloOpcode::kSend: case HloOpcode::kRecv: return false; + // Only fuse Rng if it is used once, otherwise the random numbers generated + // will be different in each fusion. + case HloOpcode::kRng: + return users_.size() == 1; default: return true; } } +HloComputation* HloInstruction::fused_instructions_computation() const { + CHECK_EQ(opcode_, HloOpcode::kFusion); + return fused_instructions_computation_.get(); +} + HloInstruction* HloInstruction::fusion_instruction() const { CHECK(IsFused()); return parent_fusion_instruction_; @@ -1540,20 +1699,32 @@ HloInstruction* HloInstruction::fusion_instruction() const { HloInstruction* HloInstruction::fused_expression_root() const { CHECK_EQ(opcode_, HloOpcode::kFusion); - return fused_root_; + CHECK(fused_instructions_computation_ != nullptr && + fused_instructions_computation_->IsFusionComputation()); + return fused_instructions_computation_->root_instruction(); } HloInstruction* HloInstruction::fused_parameter(int64 parameter_number) const { CHECK_EQ(opcode_, HloOpcode::kFusion); - CHECK_GE(parameter_number, 0); - CHECK_LT(parameter_number, fused_parameters_.size()); - return fused_parameters_[parameter_number]; + CHECK(fused_instructions_computation_ != nullptr && + fused_instructions_computation_->IsFusionComputation()); + return fused_instructions_computation_->parameter_instruction( + parameter_number); +} + +const std::vector& HloInstruction::fused_parameters() const { + CHECK_EQ(opcode_, HloOpcode::kFusion); + CHECK(fused_instructions_computation_ != nullptr && + fused_instructions_computation_->IsFusionComputation()); + return fused_instructions_computation_->parameter_instructions(); } const std::list>& HloInstruction::fused_instructions() const { CHECK_EQ(opcode_, HloOpcode::kFusion); - return fused_instructions_; + CHECK(fused_instructions_computation_ != nullptr && + fused_instructions_computation_->IsFusionComputation()); + return fused_instructions_computation_->instructions(); } HloInstruction::HloInstruction(HloOpcode opcode, const Shape& shape) @@ -1619,16 +1790,16 @@ Status HloInstruction::Visit(DfsHloVisitor* visitor) { case HloOpcode::kTuple: return visitor->HandleTuple(this, operands_); case HloOpcode::kMap: - return visitor->HandleMap(this, operands_, to_apply_, {}); + return visitor->HandleMap(this, operands_, to_apply(), {}); case HloOpcode::kClamp: return visitor->HandleClamp(this, operands_[0], operands_[1], operands_[2]); case HloOpcode::kReduce: return visitor->HandleReduce(this, operands_[0], operands_[1], - dimensions_, to_apply_); + dimensions_, to_apply()); case HloOpcode::kReduceWindow: return visitor->HandleReduceWindow(this, operands_[0], window(), - to_apply_); + to_apply()); case HloOpcode::kSelectAndScatter: return visitor->HandleSelectAndScatter(this); case HloOpcode::kNegate: @@ -1643,6 +1814,8 @@ Status HloInstruction::Visit(DfsHloVisitor* visitor) { return visitor->HandleLog(this, operands_[0]); case HloOpcode::kTanh: return visitor->HandleTanh(this, operands_[0]); + case HloOpcode::kIsFinite: + return visitor->HandleIsFinite(this, operands_[0]); case HloOpcode::kLogicalNot: return visitor->HandleLogicalNot(this, operands_[0]); case HloOpcode::kBitcast: @@ -1660,7 
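With this change the fusion accessors (`fused_expression_root`, `fused_parameter`, `fused_parameters`, `fused_instructions`) no longer read cached fields on the instruction; they delegate to the embedded computation. A sketch of that delegation, with hypothetical `Fusion`/`Computation` stand-ins:

```cpp
#include <cassert>
#include <cstddef>
#include <memory>
#include <vector>

struct Instr {};  // Stand-in for HloInstruction.

// Stand-in for the fusion's embedded HloComputation.
struct Computation {
  Instr* root = nullptr;
  std::vector<Instr*> parameters;
};

// The fusion accessors delegate to the computation instead of caching
// fused_root_ / fused_parameters_ on the instruction, as in the hunk above.
struct Fusion {
  std::unique_ptr<Computation> fused_computation;

  Instr* fused_expression_root() const {
    assert(fused_computation != nullptr);
    return fused_computation->root;
  }
  Instr* fused_parameter(std::size_t i) const {
    assert(fused_computation != nullptr);
    assert(i < fused_computation->parameters.size());
    return fused_computation->parameters[i];
  }
};

int main() {
  Instr root, p0;
  Fusion f;
  f.fused_computation = std::make_unique<Computation>();
  f.fused_computation->root = &root;
  f.fused_computation->parameters = {&p0};
  assert(f.fused_expression_root() == &root);
  assert(f.fused_parameter(0) == &p0);
}
```

A single source of truth in the computation removes the risk of the cached root and parameter vector drifting out of sync as instructions are fused in.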
+1833,7 @@ Status HloInstruction::Visit(DfsHloVisitor* visitor) { case HloOpcode::kSlice: return visitor->HandleSlice(this, operands_[0]); case HloOpcode::kDynamicSlice: - return visitor->HandleDynamicSlice(this, operands_); + return visitor->HandleDynamicSlice(this, operands_[0], operands_[1]); case HloOpcode::kDynamicUpdateSlice: return visitor->HandleDynamicUpdateSlice(this, operands_[0], operands_[1], operands_[2]); @@ -1673,11 +1846,11 @@ Status HloInstruction::Visit(DfsHloVisitor* visitor) { case HloOpcode::kRng: return visitor->HandleRng(this, distribution_); case HloOpcode::kWhile: - return visitor->HandleWhile(this, operands_[0], condition_, body_); + return visitor->HandleWhile(this); case HloOpcode::kFusion: return visitor->HandleFusion(this); case HloOpcode::kCall: - return visitor->HandleCall(this, operands_, to_apply_); + return visitor->HandleCall(this); case HloOpcode::kCustomCall: return visitor->HandleCustomCall(this, operands_, custom_call_target_); case HloOpcode::kSend: @@ -1695,7 +1868,9 @@ Status HloInstruction::Visit(DfsHloVisitor* visitor) { HloOpcodeString(opcode_).c_str()); } -Status HloInstruction::AcceptInternal(DfsHloVisitor* visitor) { +Status HloInstruction::AcceptInternal(DfsHloVisitor* visitor, + const CompareFunction* operand_order, + bool ignore_control_predecessors) { // Do not visit this HLO node again if it is already visited. if (visitor->DidVisit(*this)) { VLOG(3) << "Not visiting HLO " << name() << " as it was already visited."; @@ -1710,16 +1885,41 @@ Status HloInstruction::AcceptInternal(DfsHloVisitor* visitor) { } visitor->SetVisiting(*this); - for (auto operand : operands_) { + // Sort operands, if an ordering was provided. 'temp_sorted_operands' must + // live at this scope, since 'operands' will point to it if the operands are + // sorted. The purpose of the 'operands' pointer is to avoid copying the + // operands in the common case where the operands are not sorted. + std::vector* operands = &operands_; + std::vector temp_sorted_operands; + if (operand_order != nullptr) { + temp_sorted_operands = operands_; + std::sort(temp_sorted_operands.begin(), temp_sorted_operands.end(), + *operand_order); + operands = &temp_sorted_operands; + } + for (HloInstruction* operand : *operands) { VLOG(3) << "Going to visit HLO " << operand->name() << " as operand of HLO " << name(); - TF_RETURN_IF_ERROR(operand->AcceptInternal(visitor)); + TF_RETURN_IF_ERROR(operand->AcceptInternal(visitor, operand_order, + ignore_control_predecessors)); } - for (auto control_predecessor : control_predecessors_) { - VLOG(3) << "Going to visit HLO " << control_predecessor->name() - << " as a control predecessor of HLO " << name(); - TF_RETURN_IF_ERROR(control_predecessor->AcceptInternal(visitor)); + if (!ignore_control_predecessors) { + // This uses the same pointer/vector sorting to avoid extra copies as above. 
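`AcceptInternal` above avoids copying `operands_` in the common unordered case by pointing at the member vector and only materializing a sorted temporary when a comparator is supplied. The same pointer-or-copy pattern in a runnable form:

```cpp
#include <algorithm>
#include <functional>
#include <iostream>
#include <vector>

using Compare = std::function<bool(int, int)>;

// Visits items in comparator order if cmp is non-null; otherwise visits them
// as stored, without paying for a copy (the common case).
void Visit(const std::vector<int>& items, const Compare* cmp) {
  const std::vector<int>* to_visit = &items;
  std::vector<int> sorted;  // Must outlive the loop below if used.
  if (cmp != nullptr) {
    sorted = items;
    std::sort(sorted.begin(), sorted.end(), *cmp);
    to_visit = &sorted;
  }
  for (int v : *to_visit) std::cout << v << " ";
  std::cout << "\n";
}

int main() {
  std::vector<int> items = {3, 1, 2};
  Visit(items, nullptr);  // 3 1 2 (no copy made)
  Compare desc = [](int a, int b) { return a > b; };
  Visit(items, &desc);    // 3 2 1 (sorted copy)
}
```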
+ std::vector* predecessors = &control_predecessors_; + std::vector temp_sorted_predecessors; + if (operand_order != nullptr) { + temp_sorted_predecessors = control_predecessors_; + std::sort(temp_sorted_predecessors.begin(), + temp_sorted_predecessors.end(), *operand_order); + predecessors = &temp_sorted_predecessors; + } + for (HloInstruction* control_predecessor : *predecessors) { + VLOG(3) << "Going to visit HLO " << control_predecessor->name() + << " as a control predecessor of HLO " << name(); + TF_RETURN_IF_ERROR(control_predecessor->AcceptInternal( + visitor, operand_order, ignore_control_predecessors)); + } } TF_RETURN_IF_ERROR(visitor->Preprocess(this)); @@ -1729,18 +1929,27 @@ Status HloInstruction::AcceptInternal(DfsHloVisitor* visitor) { return visitor->Postprocess(this); } -Status HloInstruction::Accept(DfsHloVisitor* visitor, bool call_finish_visit) { +Status HloInstruction::Accept(DfsHloVisitor* visitor, bool call_finish_visit, + bool ignore_control_predecessors) { VLOG(2) << "HloInstruction::Accept(" << name() << ")"; - auto status = AcceptInternal(visitor); - if (!status.ok()) { - return status; - } - + TF_RETURN_IF_ERROR( + AcceptInternal(visitor, nullptr, ignore_control_predecessors)); if (call_finish_visit) { - return visitor->FinishVisit(this); - } else { - return Status::OK(); + TF_RETURN_IF_ERROR(visitor->FinishVisit(this)); } + return Status::OK(); +} + +Status HloInstruction::AcceptWithOperandOrder( + DfsHloVisitor* visitor, const CompareFunction& operand_order, + bool call_finish_visit) { + VLOG(2) << "HloInstruction::AcceptWithOperandOrder(" << name() << ")"; + TF_RETURN_IF_ERROR(AcceptInternal(visitor, &operand_order, + /*ignore_control_predecessors=*/false)); + if (call_finish_visit) { + TF_RETURN_IF_ERROR(visitor->FinishVisit(this)); + } + return Status::OK(); } namespace { @@ -1761,7 +1970,7 @@ bool OrderIsTopologicalSort(const std::vector& order) { // ops). for (auto* instruction : order) { for (auto* operand : instruction->operands()) { - if (order_position.count(operand) == 0 || + if (!ContainsKey(order_position, operand) || order_position.at(operand) >= order_position.at(instruction)) { return false; } @@ -1773,7 +1982,8 @@ bool OrderIsTopologicalSort(const std::vector& order) { } // namespace -Status HloInstruction::Accept(FunctionVisitor::VisitorFunction visitor_func) { +Status HloInstruction::Accept( + const FunctionVisitor::VisitorFunction& visitor_func) { FunctionVisitor visitor(visitor_func); return this->Accept(&visitor); } @@ -1791,7 +2001,7 @@ Status HloInstruction::AcceptOrdered( })); for (auto* const_instruction : order) { - if (predecessors.count(const_instruction) == 0) { + if (!ContainsKey(predecessors, const_instruction)) { // Instruction is not a predecessors of 'this'. continue; } @@ -1817,6 +2027,12 @@ Status HloInstruction::AcceptOrdered( return visitor->FinishVisit(this); } +const Shape& HloInstruction::outfeed_shape() const { + DCHECK_EQ(opcode_, HloOpcode::kOutfeed); + TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(shape_)); + return outfeed_shape_; +} + const Shape& HloInstruction::shape() const { TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(shape_)); return shape_; @@ -1846,6 +2062,7 @@ bool HloInstruction::IsElementwise() const { case HloOpcode::kCopy: case HloOpcode::kExp: case HloOpcode::kFloor: + case HloOpcode::kIsFinite: case HloOpcode::kLog: case HloOpcode::kLogicalNot: case HloOpcode::kNegate: @@ -1879,6 +2096,7 @@ bool HloInstruction::IsElementwise() const { return true; // Other operations. 
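`OrderIsTopologicalSort` above builds a position map and then verifies that every operand appears strictly before each of its users. A self-contained version over a toy dependency graph:

```cpp
#include <cassert>
#include <cstddef>
#include <unordered_map>
#include <vector>

struct Node {
  std::vector<const Node*> operands;
};

// True iff every node's operands appear before the node itself in `order`.
bool IsTopologicalSort(const std::vector<const Node*>& order) {
  std::unordered_map<const Node*, std::size_t> position;
  for (std::size_t i = 0; i < order.size(); ++i) position[order[i]] = i;
  for (const Node* node : order) {
    for (const Node* operand : node->operands) {
      auto it = position.find(operand);
      if (it == position.end() || it->second >= position.at(node)) {
        return false;
      }
    }
  }
  return true;
}

int main() {
  Node a;
  Node b{{&a}};       // b depends on a.
  Node c{{&a, &b}};   // c depends on a and b.
  assert(IsTopologicalSort({&a, &b, &c}));
  assert(!IsTopologicalSort({&b, &a, &c}));
}
```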
+ case HloOpcode::kRng: case HloOpcode::kMap: return true; case HloOpcode::kFusion: @@ -1932,7 +2150,7 @@ bool HloInstruction::IsElementwiseOnOperand(int64 operand_idx) const { HloInstruction* operand = worklist.front(); worklist.pop_front(); for (HloInstruction* user : operand->users()) { - if (visited.count(user)) { + if (ContainsKey(visited, user)) { continue; } if (user->IsElementwise() || @@ -1947,6 +2165,70 @@ bool HloInstruction::IsElementwiseOnOperand(int64 operand_idx) const { return true; } +// A helper class for memoized, recursive computation of HloOpcode::kFusion +// in HloInstruction::OperandElementUse below. +class HloInstruction::FusionReusesParamElements { + public: + using UseKind = HloInstruction::UseKind; + + // We could rather iterate backwards thru fused_instructions_ here, as it is + // in reverse postorder, and compute whether each fused instruction reuses the + // value of this parameter, which would save stack space but not allow us to + // finish early if we find a reuse. + static UseKind Compute(int64 i, const HloInstruction& hlo) { + tensorflow::gtl::FlatMap memoization_cache; + return ComputeInternal(i, hlo, &memoization_cache); + } + + private: + static UseKind ComputeInternal( + int64 i, const HloInstruction& hlo, + tensorflow::gtl::FlatMap* cache) { + if (hlo.opcode_ == HloOpcode::kParameter && hlo.parameter_number_ == i) { + return UseKind::kUse; + } + + auto p = cache->emplace(&hlo, UseKind{}); + auto value_it = p.first; + const bool key_is_new = p.second; + + if (key_is_new) { + for (int64 j = 0; j < hlo.operands_.size(); ++j) { + UseKind old_val = value_it->second; + + // The next operation invalidates iterators. + UseKind new_val = + Plus(old_val, std::min(hlo.OperandElementUse(j), + ComputeInternal(i, *hlo.operand(j), cache))); + + // Re-acquire the iterator. We could work harder to do this only if + // absolutely necessary, but this code is not hot enough to warrant + // that. + value_it = cache->find(&hlo); + value_it->second = new_val; + } + } + return value_it->second; + } + + // Fold operation for UseKinds. + static UseKind Plus(UseKind a, UseKind b) { + if (a == UseKind::kNoUse) { + return b; + } else if (b == UseKind::kNoUse) { + return a; + } else if (a == UseKind::kReuse || b == UseKind::kReuse) { + return UseKind::kReuse; + } else if (a == UseKind::kUsePermutingElements || + b == UseKind::kUsePermutingElements) { + return UseKind::kReuse; + } else { + CHECK(a == UseKind::kUse && b == UseKind::kUse); + return UseKind::kUse; + } + } +}; + HloInstruction::UseKind HloInstruction::OperandElementUse(int64 i) const { switch (opcode_) { case HloOpcode::kBitcast: @@ -1961,69 +2243,14 @@ HloInstruction::UseKind HloInstruction::OperandElementUse(int64 i) const { // Pad reuses the padding value but not the padded array elements. // Reduce reuses the init value but not the operand array elements. return i > 0 ? UseKind::kReuse : UseKind::kUsePermutingElements; - case HloOpcode::kFusion: { - tensorflow::gtl::FlatMap cache; - // We could rather iterate backwards thru fused_instructions_ here, as it - // is in reverse postorder, and compute whether each fused instruction - // reuses the value of this parameter, which would save stack space but - // not allow us to finish early if we find a reuse. 
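`FusionReusesParamElements::ComputeInternal` above memoizes per-instruction results but deliberately re-finds its cache iterator after each recursive call, since inserting into the map during recursion can rehash and invalidate iterators held across the call. The hazard and the fix, reduced to a memoized reachability count over `std::unordered_map`:

```cpp
#include <cassert>
#include <unordered_map>
#include <vector>

struct Node {
  int id;
  std::vector<const Node*> children;
};

// Counts nodes reachable from n (per path), memoized. After each recursive
// call we re-find the iterator: the recursion may have inserted into `cache`
// and triggered a rehash, invalidating any iterator held across the call.
int CountReachable(const Node* n,
                   std::unordered_map<const Node*, int>* cache) {
  auto emplaced = cache->emplace(n, 1);  // Count n itself.
  auto it = emplaced.first;
  if (emplaced.second) {  // Key was new: fill in the children.
    for (const Node* child : n->children) {
      int subtotal = CountReachable(child, cache);  // May invalidate `it`.
      it = cache->find(n);                          // Re-acquire.
      it->second += subtotal;
    }
  }
  return it->second;
}

int main() {
  Node leaf{0, {}};
  Node mid{1, {&leaf}};
  Node root{2, {&mid, &leaf}};  // leaf is reachable along two paths.
  std::unordered_map<const Node*, int> cache;
  assert(CountReachable(&root, &cache) == 4);  // root + mid + leaf + leaf.
}
```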
- std::function reuses_parameter_elements = - [i, &cache, &reuses_parameter_elements](const HloInstruction& hlo) { - auto plus = [](const UseKind& a, const UseKind& b) { - if (a == UseKind::kNoUse) return b; - if (b == UseKind::kNoUse) return a; - if (a == UseKind::kReuse || b == UseKind::kReuse) { - return UseKind::kReuse; - } - if (a == UseKind::kUsePermutingElements || - b == UseKind::kUsePermutingElements) { - return UseKind::kReuse; - } - CHECK(UseKind::kUse == a && UseKind::kUse == b); - return UseKind::kUse; - }; - - if (hlo.opcode_ == HloOpcode::kParameter && - hlo.parameter_number_ == i) { - return UseKind::kUse; - } - if (cache.count(&hlo) == 0) { - for (int64 j = 0; j < hlo.operands_.size(); ++j) { - UseKind old = cache[&hlo]; - UseKind updated = plus( - old, std::min(hlo.OperandElementUse(j), - reuses_parameter_elements(*hlo.operand(j)))); - cache[&hlo] = updated; - } - } - return cache[&hlo]; - }; - return reuses_parameter_elements(*fused_root_); - } + case HloOpcode::kFusion: + // Uses the memoizing, recursive computation defined above. + return FusionReusesParamElements::Compute(i, *fused_expression_root()); default: return IsElementwise() ? UseKind::kUse : UseKind::kReuse; } } -namespace { - -// Prereq: `order` is a permutation of {0, 1, ..., `dims.size()-1`} -void Strip1SizedDimensions(tensorflow::protobuf::RepeatedField* dims, - std::vector* order) { - // We can't merely call StripDegenerateDimensions here as we must also delete - // the dimension indices. - for (size_t i = 0; i < dims->size(); ++i) { - if (1 == dims->Get(i)) { - dims->erase(dims->begin() + i); - // We must find this, as order must be a permutation of operand - // dimensions. - order->erase(std::find(order->begin(), order->end(), i)); - } - } -} - -} // namespace - std::tuple, std::vector> HloInstruction::ReshapeMerelyInsertsOrDeletes1SizedDimensions() const { if (HloOpcode::kReshape != opcode_) { @@ -2033,21 +2260,72 @@ HloInstruction::ReshapeMerelyInsertsOrDeletes1SizedDimensions() const { shape_); } -string FusionKindString(HloInstruction::FusionKind kind) { +string ToString(HloInstruction::FusionKind kind) { switch (kind) { case HloInstruction::FusionKind::kLoop: - return "Loop"; + return "kLoop"; case HloInstruction::FusionKind::kInput: - return "Input"; + return "kInput"; + case HloInstruction::FusionKind::kOutput: + return "kOutput"; case HloInstruction::FusionKind::kTransposeDot: - return "TransposeDot"; + return "kTransposeDot"; case HloInstruction::FusionKind::kConvBackwardFilter: - return "ConvBackwardFilter"; + return "kConvBackwardFilter"; case HloInstruction::FusionKind::kConvBackwardInput: - return "ConvBackwardInput"; + return "kConvBackwardInput"; } } +std::ostream& operator<<(std::ostream& os, HloInstruction::FusionKind kind) { + return os << ToString(kind); +} + +string HloInstruction::ConvolutionDimensionNumbersToString() const { + string result; + if (convolution_dimension_numbers_ == nullptr) { + return result; + } + const ConvolutionDimensionNumbers& dnums = *convolution_dimension_numbers_; + // Show the given dimension labels in order of major to minor based on the + // shape's layout. 
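The inline `plus` closure removed above (now the static `Plus` in `FusionReusesParamElements`) is a join on a small lattice: `kNoUse` is the identity, a permuting use combined with any other real use collapses to `kReuse`, and two plain uses stay `kUse`. Transcribed with a plain enum:

```cpp
#include <cassert>

enum class UseKind { kNoUse, kReuse, kUsePermutingElements, kUse };

// Join of two use kinds, mirroring the Plus fold in the patch: kNoUse is the
// identity; any reuse (or a pair involving a permuting use) yields kReuse;
// only two plain uses remain a plain use.
UseKind Plus(UseKind a, UseKind b) {
  if (a == UseKind::kNoUse) return b;
  if (b == UseKind::kNoUse) return a;
  if (a == UseKind::kReuse || b == UseKind::kReuse) return UseKind::kReuse;
  if (a == UseKind::kUsePermutingElements ||
      b == UseKind::kUsePermutingElements) {
    return UseKind::kReuse;
  }
  return UseKind::kUse;  // Both are plain kUse.
}

int main() {
  assert(Plus(UseKind::kNoUse, UseKind::kUse) == UseKind::kUse);
  assert(Plus(UseKind::kUse, UseKind::kUsePermutingElements) ==
         UseKind::kReuse);
  assert(Plus(UseKind::kUse, UseKind::kUse) == UseKind::kUse);
}
```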
+ const auto append_dims = [&](const std::vector& dims, + const Shape& shape) { + CHECK_EQ(dims.size(), ShapeUtil::Rank(shape)); + for (int64 logical = 0; logical < dims.size(); ++logical) { + int64 physical = logical; + if (!shape.layout().minor_to_major().empty()) { + physical = LayoutUtil::Major(shape.layout(), logical); + } + result += dims[physical]; + } + }; + + // lhs_dims[i] is the symbol of the logical dimension i for the lhs + // operand. E.g. if batch has dimension number 2, then lhs_dims[2] == "b". + std::vector lhs_dims(2 + dnums.spatial_dimensions().size()); + lhs_dims[dnums.batch_dimension()] = 'b'; + lhs_dims[dnums.feature_dimension()] = 'f'; + for (int64 i = 0; i < dnums.spatial_dimensions().size(); ++i) { + lhs_dims[dnums.spatial_dimensions(i)] = StrCat(i); + } + + std::vector rhs_dims(2 + dnums.kernel_spatial_dimensions().size()); + rhs_dims[dnums.kernel_input_feature_dimension()] = "i"; + rhs_dims[dnums.kernel_output_feature_dimension()] = "o"; + for (int64 i = 0; i < dnums.spatial_dimensions().size(); ++i) { + rhs_dims[dnums.kernel_spatial_dimensions(i)] = StrCat(i); + } + + result += "dim_labels="; + append_dims(lhs_dims, operand(0)->shape()); + result += "_"; + append_dims(rhs_dims, operand(1)->shape()); + result += "->"; + append_dims(lhs_dims, shape()); + return result; +} + bool HloInstruction::CouldBeBitcast() const { switch (opcode_) { case HloOpcode::kTranspose: @@ -2059,4 +2337,15 @@ bool HloInstruction::CouldBeBitcast() const { } } +HloModule* HloInstruction::GetModule() const { + if (parent_) { + return parent_->parent(); + } + return nullptr; +} + +void HloInstruction::UniquifyName(NameUniquer* name_uniquer) { + name_ = name_uniquer->GetUniqueName(name_); +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index ff52900a2b8..c7cd729934b 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -22,16 +22,21 @@ limitations under the License. #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_INSTRUCTION_H_ #include +#include #include #include -#include #include #include +#include #include +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" +#include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/name_uniquer.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status.h" @@ -44,18 +49,23 @@ limitations under the License. namespace xla { class HloComputation; +class HloModule; // HLO instructions are the IR used by the high-level compiler. class HloInstruction { public: enum class FusionKind { kLoop, // Fused into a loop. - kInput, // Fused into a reduction kernel. + kInput, // Op's input is fused into the op itself. + kOutput, // Op's output is fused into the op itself. + // REQUIRES: At least one operand buffer must be able + // to alias the output buffer. kTransposeDot, // Fused into a dot with transposed operands. kConvBackwardFilter, // Fused into a backward filter convolution. kConvBackwardInput, // Fused into a backward input convolution. }; + ~HloInstruction(); // Creates a parameter-retrieving instruction. 
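`ConvolutionDimensionNumbersToString` prints each operand's dimension labels in physical (major-to-minor) order by mapping logical indices through the layout. A sketch of that mapping, assuming XLA's convention that `LayoutUtil::Major(layout, i)` resolves to `minor_to_major[rank - 1 - i]`:

```cpp
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

// Prints `labels` (indexed by logical dimension) in major-to-minor physical
// order; minor_to_major[k] names the logical dimension that is k-th minor.
std::string AppendDims(const std::vector<char>& labels,
                       const std::vector<int>& minor_to_major) {
  std::string out;
  for (std::size_t i = 0; i < labels.size(); ++i) {
    // Assumed convention: Major(layout, i) == minor_to_major[rank - 1 - i].
    out += labels[minor_to_major[labels.size() - 1 - i]];
  }
  return out;
}

int main() {
  // Logical dims: 0 = batch 'b', 1 = feature 'f', 2 and 3 = spatial '0', '1'.
  std::vector<char> labels = {'b', 'f', '0', '1'};
  std::cout << AppendDims(labels, {3, 2, 1, 0}) << "\n";  // bf01
  std::cout << AppendDims(labels, {2, 3, 0, 1}) << "\n";  // fb10
}
```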
  static std::unique_ptr<HloInstruction> CreateParameter(int64 parameter_number,
                                                         const Shape& shape,
@@ -137,7 +147,8 @@ class HloInstruction {
 
   // Creates an outfeed instruction, which outputs data.
   static std::unique_ptr<HloInstruction> CreateOutfeed(
-      HloInstruction* operand, tensorflow::StringPiece outfeed_config);
+      const Shape& shape, HloInstruction* operand,
+      tensorflow::StringPiece outfeed_config);
 
   // Creates a send instruction with the given channel id, which sends the
   // operand data to a unique receive instruction in another computation that
@@ -156,7 +167,8 @@ class HloInstruction {
   static std::unique_ptr<HloInstruction> CreateSlice(
       const Shape& shape, HloInstruction* operand,
       tensorflow::gtl::ArraySlice<int64> start_indices,
-      tensorflow::gtl::ArraySlice<int64> limit_indices);
+      tensorflow::gtl::ArraySlice<int64> limit_indices,
+      tensorflow::gtl::ArraySlice<int64> strides);
 
   // Creates a slice instruction, where the first operand is sliced by
   // start indices specified in the second operand, and by size specified in
@@ -302,28 +314,38 @@ class HloInstruction {
   int64 user_count() const { return users_.size(); }
 
   // Returns the users of this instruction.
-  const std::set<HloInstruction*>& users() const { return users_; }
+  const std::vector<HloInstruction*>& users() const { return users_; }
 
-  // Returns the set of control predecessors of this instruction. Control
-  // predecessors are the instructions that must be scheduled before the current
-  // instruction.
-  const std::set<HloInstruction*>& control_predecessors() const {
+  // Returns true if this instruction is a user of 'instruction'.
+  bool IsUserOf(const HloInstruction* instruction) const {
+    return ContainsKey(instruction->user_set_, this);
+  }
+
+  // Adds a control dependency from this instruction to the given
+  // instruction. This instruction becomes a control predecessor of
+  // 'instruction', and 'instruction' becomes a control successor of this
+  // instruction. Returns an error status if the two instructions do not
+  // belong to the same computation.
+  //
+  // This is used to enforce an additional ordering requirement that is not
+  // captured by normal data dependencies, such as ordering among Send or Recv
+  // operations to avoid deadlock.
+  Status AddControlDependencyTo(HloInstruction* instruction);
+
+  // Removes a previously added control dependency from this instruction to
+  // 'instruction'.
+  Status RemoveControlDependencyTo(HloInstruction* instruction);
+
+  // Returns the set of control predecessors (successors) of this
+  // instruction. Control predecessors (successors) must execute before (after)
+  // the current instruction.
+  const std::vector<HloInstruction*>& control_predecessors() const {
     return control_predecessors_;
   }
-
-  // Adds the given instruction to the set of control predecessors.
-  void AddControlPredecessor(HloInstruction* instruction);
-
-  // Returns the set of control successors of this instruction. Control
-  // successors are the instructions that must be scheduled after the current
-  // instruction.
-  const std::set<HloInstruction*>& control_successors() const {
+  const std::vector<HloInstruction*>& control_successors() const {
    return control_successors_;
   }
 
-  // Adds the given instruction to the set of control successors.
-  void AddControlSuccessor(HloInstruction* instruction);
-
   // Returns true if "other" performs the same computation as this instruction.
   // Layout of the instructions' output array is not considered.
   bool Identical(
@@ -359,12 +381,25 @@ class HloInstruction {
 
   // Performs a postorder DFS visit using this node as the root. If
   // call_finish_visit is true, then DfsHloVisitor::FinishVisit is called when
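`CreateSlice` now takes a `strides` argument alongside the start and limit indices. For a positive stride, the resulting dimension size follows the usual ceiling division; the formula below is the standard strided-slice bound, not quoted from the XLA implementation:

```cpp
#include <cassert>

// Output size of dimension d for the slice [start:limit:stride] with a
// positive stride: the number of k >= 0 with start + k * stride < limit,
// i.e. ceil((limit - start) / stride).
long SliceDimSize(long start, long limit, long stride) {
  assert(stride > 0 && limit >= start);
  return (limit - start + stride - 1) / stride;
}

int main() {
  assert(SliceDimSize(0, 10, 1) == 10);
  assert(SliceDimSize(0, 10, 3) == 4);  // Elements 0, 3, 6, 9.
  assert(SliceDimSize(2, 10, 4) == 2);  // Elements 2, 6.
}
```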
- Status Accept(DfsHloVisitor* visitor, bool call_finish_visit = true); + // complete. If ignore_control_predecessors is true, instructions only + // reachable via control dependencies will not be visited, and the postorder + // will not take control dependencies into account. It is as if the control + // dependencies didn't exist in the graph at all. + Status Accept(DfsHloVisitor* visitor, bool call_finish_visit = true, + bool ignore_control_predecessors = false); + + // Same as Accept() above, but the order of operand and control predecessor + // visitation is determined by the given operand order; if compare(A, B) == + // true, A is visited before B. + using CompareFunction = + std::function; + Status AcceptWithOperandOrder(DfsHloVisitor* visitor, + const CompareFunction& operand_order, + bool call_finish_visit = true); // Performs a postorder DFS visit using this node as the root. Calls the given // visitor function at each instruction. - Status Accept(FunctionVisitor::VisitorFunction visitor_func); + Status Accept(const FunctionVisitor::VisitorFunction& visitor_func); // Visits all instructions rooted at this instruction using the given visitor // in the given order. 'order' must contain at least the set of instructions @@ -397,6 +432,11 @@ class HloInstruction { return parameter_name_; } + void set_parameter_name(const string& str) { + CHECK_EQ(HloOpcode::kParameter, opcode_); + parameter_name_ = str; + } + // Returns the dimension sizes or numbers associated with this instruction. // // Precondition: opcode() is one of: concatenate, reduce, broadcast, reshape, @@ -428,6 +468,10 @@ class HloInstruction { // Precondition: opcode() == HloOpcode::kOutfeed const string& outfeed_config() const; + // Returns the shape for the Outfeed instruction. + // Precondition: opcode() == HloOpcode::kOutfeed + const Shape& outfeed_shape() const; + // Gets/sets the while_condition or while_body HloComputation for While. The // setters should only be called by HloModule or HloComputation methods. // @@ -451,15 +495,26 @@ class HloInstruction { string SignatureString() const; // Returns a debugging string that represents this instruction. - string ToString(bool compact_operands = false) const; + string ToString(bool compact_operands = false, + bool include_metadata = true) const; + + string ToStringNoMetadata() const { return ToString(false, false); } // As ToString, but returns a shorter string. string ToShortString() const; + // Returns a serialized representation of this instruction. + HloInstructionProto ToProto() const; + // Returns a category for the HLO. This could be something like "convolution" // or "elementwise". string ToCategory() const; + // Returns the string concatenation of parent name and this instructions + // name. This name is guaranteed to be unique among all instructions in the + // HloModule. + string FullyQualifiedName() const; + // Returns a logging instruction, if the output of this instruction is logged. // // Postcondition: retval == nullptr || retval->opcode() == HloOpcode::kTrace @@ -482,7 +537,7 @@ class HloInstruction { // Returns a tag to be used in tracing. // // Precondition: opcode() == HloOpcode::kTrace - const string& tracing_tag() const; + string TracingTag() const; // Returns whether the instruction is a constant. bool IsConstant() const; @@ -506,10 +561,18 @@ class HloInstruction { // Precondition: opcode() == HloOpcode::kFusion HloInstruction* fused_expression_root() const; + // Returns the computation for this fused instruction. 
+ // + // Precondition: opcode() == HloOpcode::kFusion + HloComputation* fused_instructions_computation() const; + // Returns the vector of fused instructions inside this fusion // instruction. The order is a reverse postorder of the fused expression (root // is first in the order). // + // Note: although the list itself is const, the instructions contained in the + // list returned here are mutable. + // // Precondition: opcode() == HloOpcode::kFusion const std::list>& fused_instructions() const; @@ -519,6 +582,18 @@ class HloInstruction { // Precondition: opcode() == HloOpcode::kFusion HloInstruction* fused_parameter(int64 parameter_number) const; + // Returns the vector of fused parameters inside this fusion instruction. + // + // Precondition: opcode() == HloOpcode::kFusion + const std::vector& fused_parameters() const; + + // Returns true if this instruction is a fusion instruction that generates + // multiple outputs. + const bool IsMultiOutputFusion() const { + return (opcode() == HloOpcode::kFusion && + fused_expression_root()->opcode() == HloOpcode::kTuple); + } + FusionKind fusion_kind() const { CHECK_EQ(HloOpcode::kFusion, opcode_); return fusion_kind_; @@ -564,6 +639,15 @@ class HloInstruction { return slice_limits_; } + // Returns the stride in the given dimension for a slice node. + // + // Precondition: opcode() == HloOpcode::kSlice + int64 slice_stride(int64 dimension) const { + CHECK_EQ(HloOpcode::kSlice, opcode_); + return slice_strides_[dimension]; + } + const std::vector& slice_strides() const { return slice_strides_; } + // Returns the size of the slice in the given dimension for a dynamic // slice node. // @@ -599,6 +683,9 @@ class HloInstruction { return *convolution_dimension_numbers_; } + // Returns the dump string of the convolution dimension numbers. + string ConvolutionDimensionNumbersToString() const; + // Returns the random distribution for this rng node. // // Precondition: opcode() == HloOpcode::kRng @@ -606,18 +693,20 @@ class HloInstruction { // Clones the HLO instruction. The clone will have the same opcode, shape, and // operands. After creation the clone has no uses. "this" (the instruction - // cloned from) is not changed. - std::unique_ptr Clone(); + // cloned from) is not changed. Suffix is the string to append to the name of + // the instruction to form the name of the cloned instruction. + std::unique_ptr Clone(const string& suffix = "clone"); // Clones the HLO instruction as above but with new shape and operands. std::unique_ptr CloneWithNewOperands( const Shape& shape, tensorflow::gtl::ArraySlice operands); - // Computes and returns the computations this instruction calls (if any). This - // includes computations called by fused instructions inside of a fusion - // instruction. - std::set MakeCalledComputationsSet() const; + // Returns the computations this instruction calls (if any). This includes + // computations called by fused instructions inside of a fusion instruction. + const std::vector& called_computations() const { + return called_computations_; + } // Returns true if this instruction performs an elementwise operation on // `operand_idx`-th operand. An instruction is elementwise on an operand iff, @@ -653,13 +742,23 @@ class HloInstruction { std::tuple, std::vector> ReshapeMerelyInsertsOrDeletes1SizedDimensions() const; + // Returns the opcode string for this instruction. Compared with + // HloOpcodeString method, this wrapper dumps additional information + // such as fusion kind. 
+ string ExtendedOpcodeStr() const; + // Returns a string identifier for this instruction. If no string identifier // has been explicitly set, then the identifier is the serialized pointer to // this instruction. const string& name() const { return name_; } - // Sets the string identifier for this instruction. - void set_name(const string& name) { name_ = name; } + // Use the given NameUniquer to select a unique name for the instruction based + // on the instruction's existing name. + void UniquifyName(NameUniquer* name_uniquer); + + // Sets the debug metadata for this instruction. + void set_metadata(const OpMetadata& metadata) { metadata_ = metadata; } + const OpMetadata& metadata() const { return metadata_; } // Set/get the computation containing this instruction. set_parent should only // be called by HloComputation methods which add/remove instructions to @@ -668,13 +767,27 @@ class HloInstruction { const HloComputation* parent() const { return parent_; } HloComputation* parent() { return parent_; } + // Returns the module for this instruction. + HloModule* GetModule() const; + // Returns whether we could assign input and output layouts to this // instruction to make it a bitcast. bool CouldBeBitcast() const; + // Sets the parent fusion instruction for this instruction. + // + // Precondition: opcode() == HloOpcode::kFusion + void SetParentFusion(HloInstruction* fusion_instruction) { + CHECK_EQ(HloOpcode::kFusion, fusion_instruction->opcode()); + parent_fusion_instruction_ = fusion_instruction; + } + private: enum class UseKind { kNoUse, kReuse, kUsePermutingElements, kUse }; + // Helper class for computing OperandElementUse for kFusion. + class FusionReusesParamElements; + // Creates an n-ary elementwise operation. static std::unique_ptr CreateNary( const Shape& shape, HloOpcode opcode, @@ -707,7 +820,9 @@ class HloInstruction { // Inner DFS traversal function -- this function being called (rather than // Accept above) allows us to distinguish the root of the traversal. - Status AcceptInternal(DfsHloVisitor* visitor); + Status AcceptInternal(DfsHloVisitor* visitor, + const CompareFunction* operand_order, + bool ignore_control_predecessors); // CHECKs various invariants of a fusion instruction. void CheckFusionInstruction() const; @@ -719,6 +834,9 @@ class HloInstruction { // Returns how this instruction uses elements of its `i`th operand. UseKind OperandElementUse(int64 i) const; + // Shape of outfeed request. + Shape outfeed_shape_; + // Result shape of this instruction. Shape shape_; @@ -744,6 +862,7 @@ class HloInstruction { // Describes the [begin, end) index range for a slice. std::vector slice_starts_; std::vector slice_limits_; + std::vector slice_strides_; // Describes the [start, start + size) range size for a dynamic slice // ('start' is specified dynamically in the second operand of the operation). @@ -753,22 +872,14 @@ class HloInstruction { // padding of this pad instruction. Only set for pad instructions. std::unique_ptr padding_config_; - // The set of instruction fused into this fusion instruction. Only set for - // fusion instructions. - std::list> fused_instructions_; + // The computation that stores of instructions fused into this fusion + // instruction. Only set for fusion instructions. + std::unique_ptr fused_instructions_computation_; // If this instruction is fused into a fusion instruction, this field points // to the fusion instruction. 
HloInstruction* parent_fusion_instruction_ = nullptr; - // The vector of parameter instructions inside this fusion instruction. The - // index of the vector is the parameter_number of the parameter instruction. - // This vector is non-empty only for fusion instructions. - std::vector fused_parameters_; - - // The root of the expression fused into this fusion instruction. - HloInstruction* fused_root_ = nullptr; - // The type of the fusion. Used by kFusion only. FusionKind fusion_kind_; @@ -776,21 +887,23 @@ class HloInstruction { int64 parameter_number_ = 0; string parameter_name_; - // Computation to apply, only present for kCall, kMap, kReduce and - // kReduceWindow. - HloComputation* to_apply_ = nullptr; - // Name of a global symbol to call, only present for kCustomCall. string custom_call_target_; - // Computation for condition and body of kWhile, only present for kWhile. - HloComputation* condition_ = nullptr; - HloComputation* body_ = nullptr; + // Computations called by this instruction. + std::vector called_computations_; - // Computation for select and scatter, only present for - // kSelectAndScatter. - HloComputation* select_ = nullptr; - HloComputation* scatter_ = nullptr; + // Indices of computations in called_computations_ for instructions which call + // multiple computations. + enum { + // kWhile computations. + kBodyComputationIndex = 0, + kConditionComputationIndex = 1, + + // kSelectAndScatter computations. + kSelectComputationIndex = 0, + kScatterComputationIndex = 1, + }; // Outfeed configuration information, only present for kOutfeed. string outfeed_config_; @@ -799,14 +912,17 @@ class HloInstruction { std::vector operands_; // The users of this instruction. Users are HLOs where this instruction is an - // operand. - std::set users_; + // operand. The vector users_ and the set user_set_ contain identical + // members. The set enables fast membership testing and the vector enables + // fast, stable iteration. + std::vector users_; + std::unordered_set user_set_; // The set of control predecessors of this instruction. - std::set control_predecessors_; + std::vector control_predecessors_; // The set of control successors of this instruction. - std::set control_successors_; + std::vector control_successors_; // A trace instruction that consumes this instruction. // @@ -831,10 +947,15 @@ class HloInstruction { // The computation in which this instruction is contained. HloComputation* parent_ = nullptr; + // Metadata for debugging. + OpMetadata metadata_; + TF_DISALLOW_COPY_AND_ASSIGN(HloInstruction); }; -string FusionKindString(HloInstruction::FusionKind kind); +string ToString(HloInstruction::FusionKind kind); + +std::ostream& operator<<(std::ostream& os, HloInstruction::FusionKind kind); } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc index 48711b605f2..bcf81cd8ddf 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc @@ -21,19 +21,22 @@ limitations under the License. 
#include #include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/protobuf_util.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/util.h" namespace xla { namespace { -#define EXPECT_ISET(A, E...) EXPECT_EQ(A, (std::set{E})) -#define EXPECT_IVEC(A, E...) EXPECT_EQ(A, (std::vector{E})) +using ::testing::ElementsAre; +using ::testing::UnorderedElementsAre; -class HloInstructionTest : public ::testing::Test { +class HloInstructionTest : public HloTestBase { protected: HloInstructionTest() {} @@ -149,10 +152,10 @@ TEST_F(HloInstructionTest, UserWithTwoOperands) { auto bar = HloInstruction::CreateParameter(1, r0f32_, "bar"); auto add = HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, foo.get(), bar.get()); - EXPECT_MATCH(add->operands(), testing::UnorderedMatcher( - foo.get(), bar.get())); - EXPECT_ISET(foo->users(), add.get()); - EXPECT_ISET(bar->users(), add.get()); + + EXPECT_THAT(add->operands(), UnorderedElementsAre(foo.get(), bar.get())); + EXPECT_THAT(foo->users(), UnorderedElementsAre(add.get())); + EXPECT_THAT(bar->users(), UnorderedElementsAre(add.get())); OpAndUserCollectingVisitor visitor; ASSERT_IS_OK(add->Accept(&visitor)); @@ -385,12 +388,13 @@ TEST_F(HloInstructionTest, ReplaceUseInBinaryOps) { EXPECT_EQ(1, foo->user_count()); EXPECT_EQ(2, bar->user_count()); - EXPECT_ISET(foo->users(), add_foobar.get()); - EXPECT_IVEC(add_foobar->operands(), foo.get(), bar.get()); + EXPECT_THAT(foo->users(), UnorderedElementsAre(add_foobar.get())); + EXPECT_THAT(add_foobar->operands(), ElementsAre(foo.get(), bar.get())); - EXPECT_ISET(bar->users(), add_foobar.get(), add_foofoo.get()); - EXPECT_IVEC(add_foobar->operands(), foo.get(), bar.get()); - EXPECT_IVEC(add_foofoo->operands(), bar.get(), bar.get()); + EXPECT_THAT(bar->users(), + UnorderedElementsAre(add_foobar.get(), add_foofoo.get())); + EXPECT_THAT(add_foobar->operands(), ElementsAre(foo.get(), bar.get())); + EXPECT_THAT(add_foofoo->operands(), ElementsAre(bar.get(), bar.get())); } TEST_F(HloInstructionTest, ReplaceUseInVariadicOp) { @@ -406,15 +410,17 @@ TEST_F(HloInstructionTest, ReplaceUseInVariadicOp) { foo.get(), bar.get()); EXPECT_EQ(2, foo->user_count()); - EXPECT_ISET(foo->users(), tuple.get(), add_foobar.get()); + EXPECT_THAT(foo->users(), + UnorderedElementsAre(tuple.get(), add_foobar.get())); // Replace the use of foo in tuple with bar. ASSERT_IS_OK(foo->ReplaceUseWith(tuple.get(), bar.get())); - EXPECT_ISET(foo->users(), add_foobar.get()); + EXPECT_THAT(foo->users(), UnorderedElementsAre(add_foobar.get())); // Both uses of foo in tuple should have been replaced with bar. - EXPECT_IVEC(tuple->operands(), bar.get(), bar.get(), baz.get(), bar.get()); + EXPECT_THAT(tuple->operands(), + ElementsAre(bar.get(), bar.get(), baz.get(), bar.get())); } TEST_F(HloInstructionTest, ReplaceUseInUnaryOp) { @@ -427,7 +433,7 @@ TEST_F(HloInstructionTest, ReplaceUseInUnaryOp) { auto log = HloInstruction::CreateUnary(r0f32_, HloOpcode::kLog, foo.get()); EXPECT_EQ(2, foo->user_count()); - EXPECT_ISET(foo->users(), exp.get(), log.get()); + EXPECT_THAT(foo->users(), UnorderedElementsAre(exp.get(), log.get())); EXPECT_EQ(0, bar->user_count()); // Replace the use of foo in exp with bar. 
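The tests drop the local `EXPECT_ISET`/`EXPECT_IVEC` macros in favor of gmock's `UnorderedElementsAre`/`ElementsAre`, which print element-level diffs on failure. A minimal example of the two matchers (assumes linking against `gtest_main` and gmock):

```cpp
#include <vector>

#include "gmock/gmock.h"
#include "gtest/gtest.h"

using ::testing::ElementsAre;
using ::testing::UnorderedElementsAre;

TEST(MatcherDemo, OrderedVsUnordered) {
  std::vector<int> users = {3, 1, 2};
  // ElementsAre checks order, which is now meaningful for the users_ vector.
  EXPECT_THAT(users, ElementsAre(3, 1, 2));
  // UnorderedElementsAre ignores order, like the old set-based EXPECT_ISET.
  EXPECT_THAT(users, UnorderedElementsAre(1, 2, 3));
}
```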
@@ -435,8 +441,8 @@ TEST_F(HloInstructionTest, ReplaceUseInUnaryOp) { // The use of foo in log should not have been affected. EXPECT_EQ(1, foo->user_count()); - EXPECT_ISET(foo->users(), log.get()); - EXPECT_IVEC(log->operands(), foo.get()); + EXPECT_THAT(foo->users(), UnorderedElementsAre(log.get())); + EXPECT_THAT(log->operands(), ElementsAre(foo.get())); // Bar should now be used in exp. EXPECT_EQ(1, bar->user_count()); @@ -467,7 +473,8 @@ TEST_F(HloInstructionTest, ReplaceAllUsesWithInBinaryOps) { EXPECT_EQ(0, foo->user_count()); EXPECT_EQ(2, bar->user_count()); - EXPECT_ISET(bar->users(), add_foobar.get(), add_foofoo.get()); + EXPECT_THAT(bar->users(), + UnorderedElementsAre(add_foobar.get(), add_foofoo.get())); } TEST_F(HloInstructionTest, ReplaceAllUsesInMultipleOps) { @@ -491,7 +498,8 @@ TEST_F(HloInstructionTest, ReplaceAllUsesInMultipleOps) { EXPECT_EQ(0, foo->user_count()); EXPECT_EQ(3, bar->user_count()); - EXPECT_ISET(bar->users(), add_foobar.get(), exp.get(), tuple.get()); + EXPECT_THAT(bar->users(), + UnorderedElementsAre(add_foobar.get(), exp.get(), tuple.get())); } // Simple visitor that collects and post-processes each node in the graph. @@ -559,8 +567,8 @@ TEST_F(HloInstructionTest, SingletonFusionOp) { auto fusion = HloInstruction::CreateFusion( r0f32_, HloInstruction::FusionKind::kLoop, exp.get()); - EXPECT_IVEC(fusion->operands(), constant.get()); - EXPECT_ISET(constant->users(), fusion.get(), exp.get()); + EXPECT_THAT(fusion->operands(), ElementsAre(constant.get())); + EXPECT_THAT(constant->users(), UnorderedElementsAre(fusion.get(), exp.get())); } TEST_F(HloInstructionTest, BinaryFusionOp) { @@ -575,9 +583,12 @@ TEST_F(HloInstructionTest, BinaryFusionOp) { auto fusion = HloInstruction::CreateFusion( r0f32_, HloInstruction::FusionKind::kLoop, add.get()); - EXPECT_IVEC(fusion->operands(), constant1.get(), constant2.get()); - EXPECT_ISET(constant1->users(), fusion.get(), add.get()); - EXPECT_ISET(constant2->users(), fusion.get(), add.get()); + EXPECT_THAT(fusion->operands(), + ElementsAre(constant1.get(), constant2.get())); + EXPECT_THAT(constant1->users(), + UnorderedElementsAre(fusion.get(), add.get())); + EXPECT_THAT(constant2->users(), + UnorderedElementsAre(fusion.get(), add.get())); } TEST_F(HloInstructionTest, ChainFusionOp) { @@ -594,8 +605,68 @@ TEST_F(HloInstructionTest, ChainFusionOp) { fusion->FuseInstruction(exp2.get()); fusion->FuseInstruction(exp1.get()); - EXPECT_IVEC(fusion->operands(), constant.get()); - EXPECT_ISET(constant->users(), fusion.get(), exp1.get()); + EXPECT_THAT(fusion->operands(), ElementsAre(constant.get())); + EXPECT_THAT(constant->users(), + UnorderedElementsAre(fusion.get(), exp1.get())); +} + +TEST_F(HloInstructionTest, PreserveMetadataInFusionAndClone) { + // Create a chain of fused unary ops. 
+ auto constant = + HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.1f)); + auto exp1 = + HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, constant.get()); + auto exp2 = HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, exp1.get()); + OpMetadata metadata; + metadata.set_op_name("tf_op"); + exp1->set_metadata(metadata); + exp2->set_metadata(metadata); + + auto fusion = HloInstruction::CreateFusion( + r0f32_, HloInstruction::FusionKind::kLoop, exp2.get()); + auto* fused = fusion->FuseInstruction(exp1.get()); + EXPECT_TRUE(protobuf_util::ProtobufEquals(metadata, fusion->metadata())); + EXPECT_TRUE(protobuf_util::ProtobufEquals(metadata, fused->metadata())); +} + +TEST_F(HloInstructionTest, FusionOpWithCalledComputations) { + // Create a fusion instruction containing a chain of map operations. + const Shape scalar_shape = ShapeUtil::MakeShape(F32, {}); + + auto make_map_computation = [&]() { + auto builder = HloComputation::Builder("FusionMap"); + builder.AddInstruction( + HloInstruction::CreateParameter(0, scalar_shape, "param")); + return builder.Build(); + }; + + std::unique_ptr<HloComputation> computation_x = make_map_computation(); + std::unique_ptr<HloComputation> computation_y = make_map_computation(); + + auto constant = + HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.1f)); + auto map_1_x = + HloInstruction::CreateMap(scalar_shape, {constant.get()}, + computation_x.get(), /*static_operands=*/{}); + auto map_2_x = + HloInstruction::CreateMap(scalar_shape, {map_1_x.get()}, + computation_x.get(), /*static_operands=*/{}); + auto map_3_y = + HloInstruction::CreateMap(scalar_shape, {map_2_x.get()}, + computation_y.get(), /*static_operands=*/{}); + + auto fusion = HloInstruction::CreateFusion( + scalar_shape, HloInstruction::FusionKind::kLoop, map_3_y.get()); + + EXPECT_THAT(fusion->called_computations(), ElementsAre(computation_y.get())); + + fusion->FuseInstruction(map_2_x.get()); + EXPECT_THAT(fusion->called_computations(), + ElementsAre(computation_y.get(), computation_x.get())); + + fusion->FuseInstruction(map_1_x.get()); + EXPECT_THAT(fusion->called_computations(), + ElementsAre(computation_y.get(), computation_x.get())); } TEST_F(HloInstructionTest, ComplexFusionOp) { @@ -636,8 +707,9 @@ TEST_F(HloInstructionTest, ComplexFusionOp) { // Operands in the fusion instruction's operands() vector should be in the // order in which their users were fused. - EXPECT_IVEC(fusion->operands(), c1.get(), c3.get(), c2.get()); - EXPECT_ISET(c1->users(), add.get(), tuple.get(), fusion.get()); + EXPECT_THAT(fusion->operands(), ElementsAre(c1.get(), c3.get(), c2.get())); + EXPECT_THAT(c1->users(), + UnorderedElementsAre(add.get(), tuple.get(), fusion.get())); } // Convenience function for comparing two HloInstructions inside of @@ -890,5 +962,48 @@ TEST_F(HloInstructionTest, CloneOfFusionPreservesShape) { root2->operand(1)->operand(0)->shape())); } +TEST_F(HloInstructionTest, CloneSuffixNames) { + // Test that the suffix string added to cloned instructions is not + // duplicated. Rather, an incrementing numeric value should be appended. That + // is, we want "foo.clone2", not "foo.clone.clone". + + // Test cloning the same instruction multiple times. + auto foo = + HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "foo"); + EXPECT_EQ(foo->Clone()->name(), "%foo.clone"); + EXPECT_EQ(foo->Clone()->Clone()->name(), "%foo.clone2"); + EXPECT_EQ(foo->Clone()->Clone()->Clone()->name(), "%foo.clone3"); + + // Test custom suffixes. 
+ EXPECT_EQ(foo->Clone("bar")->name(), "%foo.bar"); + EXPECT_EQ(foo->Clone("bar")->Clone("bar")->name(), "%foo.bar2"); + EXPECT_EQ(foo->Clone("bar")->Clone("bar")->Clone()->name(), + "%foo.bar2.clone"); + + // Test instruction name with a dot. + auto foo_baz = HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShape(F32, {}), "foo.baz"); + EXPECT_EQ(foo_baz->Clone()->name(), "%foo.baz.clone"); + + // Test incrementing a large number after the suffix. + auto foo_clone234 = HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShape(F32, {}), "foo.clone234"); + EXPECT_EQ(foo_clone234->Clone()->name(), "%foo.clone235"); + + // Test a non-numeric string after the cloning suffix. + auto foo_clonexyz = HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShape(F32, {}), "foo.clonexyz"); + EXPECT_EQ(foo_clonexyz->Clone()->name(), "%foo.clonexyz.clone"); + + // Test a name with multiple appearances of the suffix. + auto foo_clone_clone3 = HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShape(F32, {}), "foo.clone.clone3"); + EXPECT_EQ(foo_clone_clone3->Clone()->name(), "%foo.clone.clone4"); +} + } // namespace } // namespace xla + +int main(int argc, char** argv) { + return xla::ParseDebugOptionsFlagsAndRunTests(argc, argv); +} diff --git a/tensorflow/compiler/xla/service/hlo_matchers.cc b/tensorflow/compiler/xla/service/hlo_matchers.cc new file mode 100644 index 00000000000..e022c4836d8 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_matchers.cc @@ -0,0 +1,77 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_matchers.h" + +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/test.h" + +namespace xla { +namespace testing { + +bool HloMatcher::MatchAndExplain( + const HloInstruction* instruction, + ::testing::MatchResultListener* listener) const { + // These cases are self-explanatory from the printed value. + if (!instruction || instruction->opcode() != opcode_) { + return false; + } + // Special case: no operand matchers means don't verify. + if (operands_.empty()) { + return true; + } + const auto& operands = instruction->operands(); + if (operands.size() != operands_.size()) { + *listener << "has too " << (operands.size() > operands_.size() ? "many" : "few") << " operands (got " << operands.size() << ", want " << operands_.size() << ")"; + return false; + } + for (int index = 0; index < operands.size(); index++) { + ::testing::StringMatchResultListener inner_listener; + if (!operands_[index].MatchAndExplain(operands[index], &inner_listener)) { + if (listener->IsInterested()) { + *listener << "\noperand " << index << ":\n\t" + << operands[index]->ToString() + << "\ndoesn't match expected:\n\t"; + operands_[index].DescribeTo(listener->stream()); + string explanation = inner_listener.str(); + if (!explanation.empty()) { + *listener << ", " << explanation; + } + } + return false; + } + } + return true; +} + +void HloMatcher::DescribeTo(::std::ostream* os) const { + *os << opcode_; + if (!operands_.empty()) { + *os << "("; + for (int i = 0; i < operands_.size(); i++) { + if (i > 0) { + *os << ", "; + } + operands_[i].DescribeTo(os); + } + *os << ")"; + } +} + +} // namespace testing +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_matchers.h b/tensorflow/compiler/xla/service/hlo_matchers.h new file mode 100644 index 00000000000..141251011cc --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_matchers.h @@ -0,0 +1,142 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_MATCHERS_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_MATCHERS_H_ + +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/test.h" + +namespace xla { +namespace testing { + +class HloMatcher : public ::testing::MatcherInterface<const HloInstruction*> { + public: + HloMatcher(HloOpcode opcode, + std::vector<::testing::Matcher<const HloInstruction*>> operands) + : opcode_(opcode), operands_(operands) {} + + bool MatchAndExplain(const HloInstruction* instruction, + ::testing::MatchResultListener* listener) const override; + + void DescribeTo(::std::ostream* os) const override; + + private: + HloOpcode opcode_; + std::vector<::testing::Matcher<const HloInstruction*>> operands_; +}; + +// HloInstruction* matchers for opcode and operands. Example: +// namespace op = xla::opcode_matchers; +// EXPECT_THAT(instruction, +// op::Add(op::Reshape(), op::Add(op::Reshape(), _))); +namespace opcode_matchers { +#define HLO_MATCHER(opcode) \ + template <typename... M> \ + ::testing::Matcher<const ::xla::HloInstruction*> opcode(M... operands) { \ + return ::testing::MakeMatcher(new ::xla::testing::HloMatcher( \ + ::xla::HloOpcode::k##opcode, {operands...})); \ + } +HLO_MATCHER(Abs); +HLO_MATCHER(Add); +HLO_MATCHER(Bitcast); +HLO_MATCHER(Broadcast); +HLO_MATCHER(Call); +HLO_MATCHER(Ceil); +HLO_MATCHER(Clamp); +HLO_MATCHER(Concatenate); +HLO_MATCHER(Constant); +HLO_MATCHER(Convert); +HLO_MATCHER(Convolution); +HLO_MATCHER(Copy); +HLO_MATCHER(CrossReplicaSum); +HLO_MATCHER(CustomCall); +HLO_MATCHER(Divide); +HLO_MATCHER(Dot); +HLO_MATCHER(DynamicSlice); +HLO_MATCHER(DynamicUpdateSlice); +HLO_MATCHER(Eq); +HLO_MATCHER(Exp); +HLO_MATCHER(Floor); +HLO_MATCHER(Fusion); +HLO_MATCHER(Ge); +HLO_MATCHER(GetTupleElement); +HLO_MATCHER(Gt); +HLO_MATCHER(Index); +HLO_MATCHER(Infeed); +HLO_MATCHER(IsFinite); +HLO_MATCHER(Le); +HLO_MATCHER(Log); +HLO_MATCHER(LogicalAnd); +HLO_MATCHER(LogicalNot); +HLO_MATCHER(LogicalOr); +HLO_MATCHER(Lt); +HLO_MATCHER(Map); +HLO_MATCHER(Maximum); +HLO_MATCHER(Minimum); +HLO_MATCHER(Multiply); +HLO_MATCHER(Ne); +HLO_MATCHER(Negate); +HLO_MATCHER(Outfeed); +HLO_MATCHER(Pad); +HLO_MATCHER(Parameter); +HLO_MATCHER(Power); +HLO_MATCHER(Recv); +HLO_MATCHER(Reduce); +HLO_MATCHER(ReduceWindow); +HLO_MATCHER(Remainder); +HLO_MATCHER(Reshape); +HLO_MATCHER(Reverse); +HLO_MATCHER(Rng); +HLO_MATCHER(Select); +HLO_MATCHER(SelectAndScatter); +HLO_MATCHER(Send); +HLO_MATCHER(Sign); +HLO_MATCHER(Slice); +HLO_MATCHER(Sort); +HLO_MATCHER(Subtract); +HLO_MATCHER(Tanh); +HLO_MATCHER(Trace); +HLO_MATCHER(Transpose); +HLO_MATCHER(Tuple); +HLO_MATCHER(Update); +HLO_MATCHER(While); +#undef HLO_MATCHER +} // namespace opcode_matchers + +// Helper to convert smart to raw pointers for matching. +template <typename Container> +std::vector<const HloInstruction*> Pointers(const Container& container) { + std::vector<const HloInstruction*> result; + result.reserve(container.size()); + for (const auto& entry : container) result.push_back(entry.get()); + return result; +} + +} // namespace testing + +// Tell GMock to print HloInstruction* by value, so error messages are nice. +// Has to be in the same namespace as 'HloInstruction'. +void PrintTo(const HloInstruction* inst, ::std::ostream* os) { + *os << (inst ? inst->ToString() : "nullptr"); +} + +void PrintTo(HloInstruction* inst, ::std::ostream* os) { + PrintTo(const_cast<const HloInstruction*>(inst), os); +} + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_MATCHERS_H_ diff --git a/tensorflow/compiler/xla/service/hlo_matchers_test.cc b/tensorflow/compiler/xla/service/hlo_matchers_test.cc new file mode 100644 index 00000000000..1465d1cacdc --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_matchers_test.cc @@ -0,0 +1,71 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/shape_util.h" + +namespace op = xla::testing::opcode_matchers; +using ::testing::_; +using ::testing::Eq; + +namespace xla { +namespace { + +template +string Explain(const T& t, const M& m) { + ::testing::StringMatchResultListener listener; + EXPECT_THAT(t, ::testing::Not(m)); // For the error message. + EXPECT_FALSE(m.MatchAndExplain(t, &listener)); + return listener.str(); +} + +TEST(HloMatchersTest, Test) { + auto shape = ShapeUtil::MakeShape(F32, {1}); + auto param = HloInstruction::CreateParameter(0, shape, "param"); + auto mul = HloInstruction::CreateBinary(shape, HloOpcode::kMultiply, + param.get(), param.get()); + auto add = HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param.get(), + mul.get()); + + EXPECT_THAT(add.get(), op::Add()); + EXPECT_THAT(add.get(), op::Add(op::Parameter(), op::Multiply())); + EXPECT_THAT(add.get(), + op::Add(op::Parameter(), op::Multiply(_, op::Parameter()))); + + // Negative matches: check the explanation string. + EXPECT_THAT(Explain(add.get(), op::Parameter()), Eq("")); + EXPECT_THAT(Explain(add.get(), op::Add(op::Parameter())), + Eq("has too many operands (got 2, want 1)")); + EXPECT_THAT( + Explain(add.get(), op::Add(op::Parameter(), op::Parameter())), + Eq("\noperand 1:\n\t" + "%multiply = f32[1]{0} multiply(f32[1]{0} %param, f32[1]{0} %param)\n" + "doesn't match expected:\n\t" + "parameter")); + EXPECT_THAT( + Explain(add.get(), + op::Add(op::Parameter(), op::Multiply(op::Add(), op::Add()))), + Eq("\noperand 1:\n\t" + "%multiply = f32[1]{0} multiply(f32[1]{0} %param, f32[1]{0} %param)\n" + "doesn't match expected:\n\t" + "multiply(add, add), \n" + "operand 0:\n\t" + "%param = f32[1]{0} parameter(0)\n" + "doesn't match expected:\n\t" + "add")); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc index 5d68b456cda..22ef9c590bc 100644 --- a/tensorflow/compiler/xla/service/hlo_module.cc +++ b/tensorflow/compiler/xla/service/hlo_module.cc @@ -23,6 +23,7 @@ limitations under the License. #include #include "tensorflow/compiler/xla/map_util.h" +#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/lib/gtl/map_util.h" @@ -31,20 +32,46 @@ limitations under the License. 
namespace xla { -HloComputation* HloModule::AddEntryComputation( +HloModule::HloModule(const string& name, + const VersionedComputationHandle& entry_computation_handle, + const HloModuleConfig& config) + : name_(name), + config_(config), + entry_computation_(nullptr), + has_entry_computation_handle_(true), + entry_computation_handle_(entry_computation_handle), + computation_name_uniquer_(/*separator=*/".") {} + +HloModule::HloModule(const string& name) + : name_(name), + entry_computation_(nullptr), + computation_name_uniquer_(/*separator=*/".") {} + +HloComputation* HloModule::AddComputationInternal( std::unique_ptr computation) { - CHECK_EQ(nullptr, entry_computation_); - entry_computation_ = computation.get(); + computation->UniquifyName(&computation_name_uniquer_); computation->set_parent(this); computations_.push_back(std::move(computation)); return computations_.back().get(); } +HloComputation* HloModule::AddEntryComputation( + std::unique_ptr computation) { + CHECK_EQ(nullptr, entry_computation_); + entry_computation_ = computation.get(); + + // If the module configuration has no entry layout computation set, create a + // default one based on the program shape. + if (!config_.has_entry_computation_layout()) { + config_.SetDefaultComputationLayout( + entry_computation_->ComputeProgramShape()); + } + return AddComputationInternal(std::move(computation)); +} + HloComputation* HloModule::AddEmbeddedComputation( std::unique_ptr computation) { - computation->set_parent(this); - computations_.push_back(std::move(computation)); - return computations_.back().get(); + return AddComputationInternal(std::move(computation)); } void HloModule::ReplaceComputations( @@ -123,6 +150,17 @@ string HloModule::ToString() const { return s.str(); } +HloModuleProto HloModule::ToProto() const { + HloModuleProto proto; + proto.set_name(name_); + proto.set_entry_computation_name(entry_computation_->name()); + for (const HloComputation* computation : MakeComputationPostOrder()) { + HloComputationProto computation_proto = computation->ToProto(); + proto.add_computations()->Swap(&computation_proto); + } + return proto; +} + namespace { // Returns whether `hlo` is used outside the given subcomputation. // `instructions_in_subcomputation` is the instruction set of the given @@ -232,7 +270,8 @@ std::list HloModule::MakeComputationPostOrder() const { std::set nonroot_computations; for (auto& computation : computations_) { for (auto& instruction : computation->instructions()) { - for (auto called_computation : instruction->MakeCalledComputationsSet()) { + for (HloComputation* called_computation : + instruction->called_computations()) { nonroot_computations.insert(called_computation); } } diff --git a/tensorflow/compiler/xla/service/hlo_module.h b/tensorflow/compiler/xla/service/hlo_module.h index d598750da65..4b14b4fd62a 100644 --- a/tensorflow/compiler/xla/service/hlo_module.h +++ b/tensorflow/compiler/xla/service/hlo_module.h @@ -23,8 +23,11 @@ limitations under the License. 
#include #include +#include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module_config.h" +#include "tensorflow/compiler/xla/service/name_uniquer.h" #include "tensorflow/compiler/xla/service/versioned_computation_handle.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/lib/gtl/array_slice.h" @@ -41,19 +44,15 @@ namespace xla { // computations are owned by the module. class HloModule { public: - explicit HloModule(const string& name, - const VersionedComputationHandle& entry_computation_handle) - : name_(name), - entry_computation_(nullptr), - has_entry_computation_handle_(true), - entry_computation_handle_(entry_computation_handle) {} + HloModule(const string& name, + const VersionedComputationHandle& entry_computation_handle, + const HloModuleConfig& config); // Constructor without a versioned computation handle. This constructor should // only be used for HloModules used outside of the XLA service (e.g., // tests). The versioned handle is used by the service in the compilation - // cache. - explicit HloModule(const string& name) - : name_(name), entry_computation_(nullptr) {} + // cache. A default configuration is created for this module. + explicit HloModule(const string& name); // Adds an entry computation to the module. A module can only have one entry // computation. Returns a pointer to the newly added computation. @@ -82,6 +81,10 @@ class HloModule { return entry_computation_; } + ComputationLayout* mutable_entry_computation_layout() { + return config_.mutable_entry_computation_layout(); + } + const VersionedComputationHandle& entry_computation_handle() const { return entry_computation_handle_; } @@ -95,7 +98,10 @@ class HloModule { // computation B, then A will appear after B in the sort. std::list<HloComputation*> MakeComputationPostOrder() const; + const HloModuleConfig& config() const { return config_; } + string ToString() const; + HloModuleProto ToProto() const; // Outlines the given expression from the given computation. // instructions_to_outline contains the instructions that form the expression. @@ -110,8 +116,17 @@ class HloModule { // Returns a randomly generated uint64. uint64 RandomNew64() const; + // Returns the unique name for a computation in this module. + string GetUniqueComputationName(const string& prefix) { + return computation_name_uniquer_.GetUniqueName(prefix); + } + private: + HloComputation* AddComputationInternal( + std::unique_ptr<HloComputation> computation); + const string name_; + HloModuleConfig config_; HloComputation* entry_computation_; std::vector<std::unique_ptr<HloComputation>> computations_; @@ -125,6 +140,9 @@ class HloModule { // Versioned handle of the entry computation of the module. bool has_entry_computation_handle_ = false; VersionedComputationHandle entry_computation_handle_; + + // Unique name generator for computation names, which are unique per module. + NameUniquer computation_name_uniquer_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_module_config.cc b/tensorflow/compiler/xla/service/hlo_module_config.cc index c129ad1b392..a2235a26823 100644 --- a/tensorflow/compiler/xla/service/hlo_module_config.cc +++ b/tensorflow/compiler/xla/service/hlo_module_config.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include #include +#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/shape_layout.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/lib/strings/str_util.h" @@ -27,20 +28,27 @@ namespace xla { using tensorflow::strings::StrAppend; +HloModuleConfig::HloModuleConfig() {} + HloModuleConfig::HloModuleConfig(const ProgramShape& program_shape) : entry_computation_layout_(program_shape) {} +void HloModuleConfig::SetDefaultComputationLayout( + const ProgramShape& program_shape) { + entry_computation_layout_ = ComputationLayout(program_shape); +} + string HloModuleConfig::compilation_cache_key() const { string key = tensorflow::strings::StrCat("profiling=", hlo_profiling_enabled_, "::hybrid=", has_hybrid_result_); StrAppend(&key, "::("); std::vector params; for (const ShapeLayout& param_layout : - entry_computation_layout_.parameter_layouts()) { + entry_computation_layout_->parameter_layouts()) { params.push_back(param_layout.shape().DebugString()); } StrAppend(&key, tensorflow::str_util::Join(params, ", "), ") => ", - entry_computation_layout_.result_shape().SerializeAsString()); + entry_computation_layout_->result_shape().SerializeAsString()); if (seed() != 0) { // TODO(b/32083678): force recompilation to reset global state. static std::atomic counter{0}; @@ -49,7 +57,7 @@ string HloModuleConfig::compilation_cache_key() const { if (replica_count() != 1) { StrAppend(&key, "::replica_count=", replica_count()); } - StrAppend(&key, "::fast_math_disabled=", fast_math_disabled_); + StrAppend(&key, debug_options_.DebugString()); return key; } diff --git a/tensorflow/compiler/xla/service/hlo_module_config.h b/tensorflow/compiler/xla/service/hlo_module_config.h index f9a61c1cd1c..ee32ab9bc4b 100644 --- a/tensorflow/compiler/xla/service/hlo_module_config.h +++ b/tensorflow/compiler/xla/service/hlo_module_config.h @@ -22,6 +22,7 @@ limitations under the License. #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla.pb.h" #include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/lib/gtl/optional.h" namespace xla { @@ -32,14 +33,34 @@ namespace xla { // executable. class HloModuleConfig { public: + // A configuration can be created either with, or without an entry + // ComputationLayout. The default ctor creates it without -- in this case + // accessing entry_computation_layout will CHECK-fail. The ctor accepting a + // ProgramShape creates a computation layout using this shape. + HloModuleConfig(); explicit HloModuleConfig(const ProgramShape& program_shape); - // Return a reference to the layout of the entry computation. - const ComputationLayout& entry_computation_layout() const { - return entry_computation_layout_; + // Checks if this config has an entry computation layout already. + bool has_entry_computation_layout() const { + return entry_computation_layout_.has_value(); } + + // Sets the entry computation layout for this config. If the entry computation + // layout already exists, it is silently replaced. + void SetDefaultComputationLayout(const ProgramShape& program_shape); + + // Returns a constant reference to the layout of the entry computation. + // Assumes the layout was set. + const ComputationLayout& entry_computation_layout() const { + CHECK(entry_computation_layout_.has_value()); + return *entry_computation_layout_; + } + + // Returns a mutable pointer to the layout of the entry computation. Assumes + // the layout was set. 
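The config is now default-constructible, so the entry layout may be absent; the accessors above and the one that follows guard every dereference with a CHECK. A minimal sketch of the same guarded-optional pattern, with std::optional and assert standing in for tensorflow::gtl::optional and CHECK (toy names, not the real XLA types):

```cpp
#include <cassert>
#include <optional>
#include <string>
#include <utility>

// Toy stand-in for ComputationLayout.
struct Layout {
  std::string shape;
};

class Config {
 public:
  bool has_layout() const { return layout_.has_value(); }
  void SetDefaultLayout(std::string shape) {
    layout_ = Layout{std::move(shape)};  // silently replaces any prior value
  }
  const Layout& layout() const {
    // Mirrors CHECK(entry_computation_layout_.has_value()).
    assert(layout_.has_value());
    return *layout_;
  }

 private:
  std::optional<Layout> layout_;  // empty until SetDefaultLayout is called
};
```

The effect of the design is that a missing layout is a programming error at the access site rather than a silently defaulted value.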
ComputationLayout* mutable_entry_computation_layout() { - return &entry_computation_layout_; + CHECK(entry_computation_layout_.has_value()); + return &(*entry_computation_layout_); } // Sets/returns whether to enable HLO-level profiling. @@ -60,23 +81,21 @@ class HloModuleConfig { } int64 replica_count() const { return replica_count_; } - // Sets/returns whether unsafe math optimizations are disabled for this - // module. Default is fast-math enabled. - // - // This is named fast_math_disabled rather than the more natural - // fast_math_enabled for consistency with the ExecutionOptions proto. - bool fast_math_disabled() const { return fast_math_disabled_; } - void set_fast_math_disabled(bool disabled) { fast_math_disabled_ = disabled; } - // Return a string which unambiguously represents all the fields of this data // structure. Used for generating a cache key for storing the compiled // executable. string compilation_cache_key() const; + const DebugOptions& debug_options() const { return debug_options_; } + + void set_debug_options(const DebugOptions& debug_options) { + debug_options_ = debug_options; + } + private: // If you add new members, be sure to update compilation_cache_key. - ComputationLayout entry_computation_layout_; + tensorflow::gtl::optional entry_computation_layout_; // Whether to enable HLO-level profiling. bool hlo_profiling_enabled_ = false; @@ -97,7 +116,7 @@ class HloModuleConfig { // The number of replicas to compile this binary for. int64 replica_count_ = 1; - bool fast_math_disabled_ = false; + DebugOptions debug_options_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_module_test.cc b/tensorflow/compiler/xla/service/hlo_module_test.cc index 0f4252522d3..870bc729aec 100644 --- a/tensorflow/compiler/xla/service/hlo_module_test.cc +++ b/tensorflow/compiler/xla/service/hlo_module_test.cc @@ -23,7 +23,7 @@ limitations under the License. #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/compiler/xla/test_helpers.h" +#include "tensorflow/compiler/xla/test.h" #include "tensorflow/core/lib/gtl/array_slice.h" namespace xla { @@ -58,27 +58,32 @@ class HloModuleTest : public HloTestBase { TEST_F(HloModuleTest, OneComputationPostOrder) { // Create a module with a single computation. - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); auto computation = module->AddEntryComputation(CreateConstantComputation()); - EXPECT_EQ(module->MakeComputationPostOrder().front(), computation); + EXPECT_THAT(module->MakeComputationPostOrder(), + ::testing::ElementsAre(computation)); } TEST_F(HloModuleTest, TwoComputationsPostOrder) { // Create a module with two unconnected computations. - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); auto computation1 = module->AddEntryComputation(CreateConstantComputation()); auto computation2 = module->AddEmbeddedComputation(CreateConstantComputation()); - EXPECT_MATCH( - testing::ListToVec(module->MakeComputationPostOrder()), - testing::UnorderedMatcher(computation1, computation2)); + EXPECT_THAT(module->MakeComputationPostOrder(), + ::testing::UnorderedElementsAre(computation1, computation2)); + + // We specified the same name for both computations, but the HloModule should + // have made the names unique. 
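The uniqueness asserted just below comes from the NameUniquer member that HloModule now owns. A hypothetical simplification of its behavior, enough to show where "Constant" and "Constant.1" come from (the real xla::NameUniquer has more responsibilities, which this toy omits):

```cpp
#include <map>
#include <string>
#include <utility>

class ToyNameUniquer {
 public:
  explicit ToyNameUniquer(std::string separator)
      : separator_(std::move(separator)) {}

  // The first request for a prefix returns it unchanged; later requests get
  // a numeric suffix: "Constant", "Constant.1", "Constant.2", ...
  std::string GetUniqueName(const std::string& prefix) {
    int count = counts_[prefix]++;
    return count == 0 ? prefix : prefix + separator_ + std::to_string(count);
  }

 private:
  std::string separator_;
  std::map<std::string, int> counts_;
};
```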
+ EXPECT_EQ(computation1->name(), "Constant"); + EXPECT_EQ(computation2->name(), "Constant.1"); } TEST_F(HloModuleTest, DiamondComputationsPostOrder) { // Create a module with a diamond call graph of computations. - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); auto computation1 = module->AddEmbeddedComputation(CreateConstantComputation()); auto computation2 = @@ -89,9 +94,9 @@ TEST_F(HloModuleTest, DiamondComputationsPostOrder) { CreateCallComputation({computation2, computation3})); auto post_order = module->MakeComputationPostOrder(); - EXPECT_MATCH(testing::ListToVec(post_order), - testing::UnorderedMatcher( - computation1, computation2, computation3, computation4)); + EXPECT_THAT(post_order, + ::testing::UnorderedElementsAre(computation1, computation2, + computation3, computation4)); EXPECT_EQ(post_order.back(), computation4); EXPECT_EQ(post_order.front(), computation1); } @@ -99,3 +104,7 @@ TEST_F(HloModuleTest, DiamondComputationsPostOrder) { } // namespace } // namespace xla + +int main(int argc, char** argv) { + return xla::ParseDebugOptionsFlagsAndRunTests(argc, argv); +} diff --git a/tensorflow/compiler/xla/service/hlo_opcode.cc b/tensorflow/compiler/xla/service/hlo_opcode.cc index 5f7243b0fe7..ceb0cdaa316 100644 --- a/tensorflow/compiler/xla/service/hlo_opcode.cc +++ b/tensorflow/compiler/xla/service/hlo_opcode.cc @@ -74,6 +74,8 @@ string HloOpcodeString(HloOpcode opcode) { return "index"; case HloOpcode::kInfeed: return "infeed"; + case HloOpcode::kIsFinite: + return "is-finite"; case HloOpcode::kLe: return "less-than-or-equal-to"; case HloOpcode::kLog: @@ -163,4 +165,17 @@ bool HloOpcodeIsComparison(HloOpcode opcode) { } } +bool HloOpcodeIsVariadic(HloOpcode opcode) { + switch (opcode) { + case HloOpcode::kCall: + case HloOpcode::kConcatenate: + case HloOpcode::kFusion: + case HloOpcode::kMap: + case HloOpcode::kTuple: + return true; + default: + return false; + } +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h index 5d60a77e14f..e2cdbfdfa7a 100644 --- a/tensorflow/compiler/xla/service/hlo_opcode.h +++ b/tensorflow/compiler/xla/service/hlo_opcode.h @@ -55,6 +55,7 @@ enum class HloOpcode { kGt, kIndex, kInfeed, + kIsFinite, kLe, kLog, kLogicalAnd, @@ -103,6 +104,9 @@ inline std::ostream& operator<<(std::ostream& os, HloOpcode opcode) { // Returns true iff the given opcode is a comparison operation. bool HloOpcodeIsComparison(HloOpcode opcode); +// Returns true iff the given opcode has variadic operands. +bool HloOpcodeIsVariadic(HloOpcode opcode); + } // namespace xla #endif // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_OPCODE_H_ diff --git a/tensorflow/compiler/xla/service/hlo_opcode_test.cc b/tensorflow/compiler/xla/service/hlo_opcode_test.cc index 0b64c16fdc6..892c89f9df2 100644 --- a/tensorflow/compiler/xla/service/hlo_opcode_test.cc +++ b/tensorflow/compiler/xla/service/hlo_opcode_test.cc @@ -15,8 +15,8 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/platform/test.h" namespace xla { namespace { diff --git a/tensorflow/compiler/xla/service/hlo_ordering.cc b/tensorflow/compiler/xla/service/hlo_ordering.cc index 38106dbbb11..72911ae9f91 100644 --- a/tensorflow/compiler/xla/service/hlo_ordering.cc +++ b/tensorflow/compiler/xla/service/hlo_ordering.cc @@ -19,6 +19,7 @@ limitations under the License. 
#include #include +#include "tensorflow/compiler/xla/service/heap_simulator.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/logical_buffer.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -33,15 +34,112 @@ namespace xla { -PredecessorHloOrdering::PredecessorHloOrdering(const HloModule* module) - : module_(module) {} +namespace { -bool PredecessorHloOrdering::ExecutesBefore(const HloInstruction* a, - const HloInstruction* b) const { - // Instructions in different computations are unordered. - if (a->parent() != b->parent()) { +// Returns the nearest call graph ancestors of instructions 'a' and 'b' for +// which the ancestors are in the same computation. An instruction is a call +// graph ancestor of 'a' if the instruction calls the computation containing 'a' +// either directly or transitively. Degenerately, an instruction is an ancestor +// of itself. nullptr is returned if there is no common ancestor or if the +// caller chain of 'a' or 'b' diverges (has multiple callers) before the nearest +// common ancestor. +// +// Example: +// +// Entry computation: +// %x = Call(A, {Constant(42.0)}) +// %y = Call(B, {%x}) +// +// Computation A: +// %a = Negate(Param()) +// +// Computation B: +// %b = Exp(Param()); +// +// If called with %a and %b, this function would return (%x, %y). %x is an +// ancestor of %a, and %y is an ancestor of %b, and %x and %y are in the same +// computation. +std::pair<const HloInstruction*, const HloInstruction*> +GetNearestCallGraphAncestorsInSameComputation(const HloInstruction* a, + const HloInstruction* b, + const CallGraph& call_graph) { + // Lambda which returns the next instruction in the callee->caller chain in + // the call graph. This is the unique instruction which calls the computation + // containing 'instruction'. If more than one instruction calls the + // computation containing 'instruction' or no instructions call the + // computation then nullptr is returned. + auto next_caller = + [&call_graph]( + const HloInstruction* instruction) -> const HloInstruction* { + const CallGraphNode& node = call_graph.GetNode(instruction->parent()); + if (node.caller_callsites().size() != 1) { + return nullptr; + } + return node.caller_callsites()[0].instruction(); + }; + + // Iterate through the callee->caller chains and find the earliest common + // element. + for (const HloInstruction* a_ancestor = a; a_ancestor != nullptr; + a_ancestor = next_caller(a_ancestor)) { + for (const HloInstruction* b_ancestor = b; b_ancestor != nullptr; + b_ancestor = next_caller(b_ancestor)) { + if (a_ancestor->parent() == b_ancestor->parent()) { + return {a_ancestor, b_ancestor}; + } + } + } + return {nullptr, nullptr}; +} + +} // namespace + +bool HloOrdering::ExecutesBefore(const HloInstruction* a, + const HloInstruction* b) const { + // 'a' and 'b' may be in different computations. In this case, find the + // callgraph ancestor instructions which call (potentially transitively) the + // computations containing 'a' and 'b' and use these ancestor instructions to + // compare order. + const HloInstruction* a_ancestor; + const HloInstruction* b_ancestor; + std::tie(a_ancestor, b_ancestor) = + GetNearestCallGraphAncestorsInSameComputation(a, b, *call_graph_); + + if (a_ancestor == nullptr) { + // Ancestors in a common computation could not be found so consider the + // instructions 'a' and 'b' to be unordered. return false; } + // a_ancestor and b_ancestor must be either both null or both non-null. 
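Before the precondition checks resume, the chain walk implemented above can be restated compactly: lift each instruction through its unique caller until the two chains land in a common computation. A toy sketch with hypothetical stand-in types rather than the real CallGraph API:

```cpp
#include <utility>

struct ToyNode {
  const void* computation;  // stands in for HloComputation*
  const ToyNode* caller;    // unique caller, or nullptr if none or ambiguous
};

// Mirrors GetNearestCallGraphAncestorsInSameComputation: scan every pair of
// ancestors (outer chain for 'a', inner chain for 'b') and return the first
// pair sharing a computation; {nullptr, nullptr} if the chains never meet.
std::pair<const ToyNode*, const ToyNode*> NearestAncestors(const ToyNode* a,
                                                           const ToyNode* b) {
  for (const ToyNode* x = a; x != nullptr; x = x->caller) {
    for (const ToyNode* y = b; y != nullptr; y = y->caller) {
      if (x->computation == y->computation) return {x, y};
    }
  }
  return {nullptr, nullptr};
}
```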
+ CHECK_NE(b_ancestor, nullptr); + CHECK_EQ(a_ancestor->parent(), b_ancestor->parent()); + return ExecutesBeforeInSameComputation(a_ancestor, b_ancestor); +} + +HloOrderingProto HloOrdering::ToProto() const { + HloOrderingProto proto; + for (const auto& computation : module_->computations()) { + const std::vector* sequence = + SequentialOrder(*computation); + if (sequence != nullptr) { + HloOrderingProto::SequentialComputation* proto_computation = + proto.add_sequential_computations(); + proto_computation->set_computation_name(computation->name()); + for (const HloInstruction* instruction : *sequence) { + *proto_computation->add_instruction_names() = instruction->name(); + } + } + } + return proto; +} + +PredecessorHloOrdering::PredecessorHloOrdering(const HloModule* module) + : HloOrdering(module) {} + +bool PredecessorHloOrdering::ExecutesBeforeInSameComputation( + const HloInstruction* a, const HloInstruction* b) const { + CHECK_EQ(a->parent(), b->parent()); + // 'a' executes before 'b' if 'a' is in the strict predecessor set of 'b'. return strict_predecessors_.at(b->parent())->IsReachable(b, a); } @@ -85,9 +183,9 @@ string DependencyHloOrdering::ToString() const { SequentialHloOrdering::SequentialHloOrdering( const HloModule* module, const HloModuleSequence& module_sequence) - : module_(module) { + : HloOrdering(module), module_sequence_(module_sequence) { // Create a map from instruction to its order position. - for (auto computation_order : module_sequence) { + for (auto computation_order : module_sequence_) { const std::vector& order = computation_order.second; for (int i = 0; i < order.size(); ++i) { DCHECK_EQ(0, order_position_.count(order[i])); @@ -96,12 +194,9 @@ SequentialHloOrdering::SequentialHloOrdering( } } -bool SequentialHloOrdering::ExecutesBefore(const HloInstruction* a, - const HloInstruction* b) const { - // Instructions in different computations are unordered. - if (a->parent() != b->parent()) { - return false; - } +bool SequentialHloOrdering::ExecutesBeforeInSameComputation( + const HloInstruction* a, const HloInstruction* b) const { + CHECK_EQ(a->parent(), b->parent()); // If either instruction is not in the order, then 'a' and 'b' are unordered. if (order_position_.count(a) == 0 || order_position_.count(b) == 0) { return false; @@ -109,6 +204,13 @@ bool SequentialHloOrdering::ExecutesBefore(const HloInstruction* a, return order_position_.at(a) < order_position_.at(b); } +const std::vector* +SequentialHloOrdering::SequentialOrder( + const HloComputation& computation) const { + auto find_it = module_sequence_.find(&computation); + return find_it == module_sequence_.end() ? nullptr : &find_it->second; +} + string SequentialHloOrdering::ToString() const { std::vector pieces; pieces.push_back("SequentialHloOrdering"); @@ -136,6 +238,29 @@ string SequentialHloOrdering::ToString() const { return tensorflow::str_util::Join(pieces, "\n"); } +StatusOr MinimumMemoryForSequence( + const SequentialHloOrdering::HloModuleSequence& module_sequence, + const LogicalBuffer::SizeFunction& size_function) { + if (module_sequence.empty()) { + return 0; + } + + const HloModule* module = module_sequence.begin()->first->parent(); + TF_ASSIGN_OR_RETURN(std::unique_ptr points_to_analysis, + TuplePointsToAnalysis::Run(module)); + + // The absolute minimum memory required for a given sequence of instructions + // is determined by the sequence of Alloc and Free calls on a simulated heap, + // ignoring fragmentation. 
We run the heap simulation on the whole module, + // rather than summing each computation, since it gives us a better lower + // bound, by minimizing the liveness of sub-computations. + TF_ASSIGN_OR_RETURN( + HeapSimulator::Result result, + HeapSimulator::Run(MakeUnique(), *module, + module_sequence, *points_to_analysis, size_function)); + return result.heap_size; +} + namespace { // Class implementing a list scheduler of HLO instructions which produces a @@ -235,7 +360,7 @@ class ListScheduler { return freed_bytes; } - // Construct the scheduling priority of the given instruciton. + // Construct the scheduling priority of the given instruction. Priority GetPriority(const HloInstruction* instruction) { return {BytesFreedIfScheduled(instruction), instruction->user_count()}; } @@ -243,11 +368,24 @@ class ListScheduler { std::vector CreateSchedule() { std::vector schedule; - // Populate the ready list with instructions which have no operands. + // Populate the ready list with instructions which have no operands or + // control predecessors. + std::unordered_map unscheduled_pred_count; std::list ready_list; for (auto& instruction : computation_.instructions()) { - if (instruction->operand_count() == 0 && - instruction->control_predecessors().empty()) { + // TODO(b/34466113): Replace this and above with successors() or + // predecessors() when these methods are added to HloInstruction. + for (const HloInstruction* user : instruction->users()) { + unscheduled_pred_count[user]++; + } + for (const HloInstruction* succ : instruction->control_successors()) { + unscheduled_pred_count[succ]++; + } + } + for (auto& instruction : computation_.instructions()) { + // Instruction with no operands or control predecessors will + // not be in the map. + if (unscheduled_pred_count.count(instruction.get()) == 0) { ready_list.push_back(instruction.get()); } } @@ -279,28 +417,21 @@ class ListScheduler { } // Add new instructions to ready list. - // TODO(b/34466113): Replace this with successors()/predecessors() when - // predecessor/successor methods are added to HloInstruction. This also - // will resolve the nondeterminism of using a set here assuming - // predecessors/successors is a vector. - std::set successors = best->users(); - successors.insert(best->control_successors().begin(), - best->control_successors().end()); - for (auto* successor : successors) { - std::set predecessors(successor->operands().begin(), - successor->operands().end()); - predecessors.insert(successor->control_predecessors().begin(), - successor->control_predecessors().end()); - bool is_ready = true; - for (auto* predecessor : predecessors) { - if (scheduled_instructions_.count(predecessor) == 0) { - is_ready = false; - break; - } - } - if (is_ready) { - ready_list.push_back(successor); + auto update_pred_count = [&unscheduled_pred_count, + &ready_list](HloInstruction* inst) { + int64 pred_count = --unscheduled_pred_count.at(inst); + CHECK_GE(pred_count, 0); + if (pred_count == 0) { + ready_list.push_back(inst); } + }; + // TODO(b/34466113): Replace this and above with successors() or + // predecessors() when these methods are added to HloInstruction. 
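The update_pred_count lambda above is the core of a standard predecessor-counting (Kahn-style) ready list, and the loops that follow feed it each scheduled instruction's users and control successors. A generic, self-contained sketch of the same pattern, using FIFO order where the real scheduler applies its bytes-freed priority:

```cpp
#include <queue>
#include <unordered_map>
#include <utility>
#include <vector>

// Returns a topological order of nodes 0..n-1 given directed edges.
std::vector<int> ReadyListSchedule(
    int n, const std::vector<std::pair<int, int>>& edges) {
  std::unordered_map<int, int> pred_count;
  std::unordered_map<int, std::vector<int>> successors;
  for (const auto& edge : edges) {
    ++pred_count[edge.second];
    successors[edge.first].push_back(edge.second);
  }
  std::queue<int> ready;
  for (int v = 0; v < n; ++v) {
    if (pred_count[v] == 0) ready.push(v);  // no unscheduled predecessors
  }
  std::vector<int> order;
  while (!ready.empty()) {
    int v = ready.front();
    ready.pop();
    order.push_back(v);
    for (int s : successors[v]) {
      if (--pred_count[s] == 0) ready.push(s);  // successor became ready
    }
  }
  return order;  // order.size() == n iff the graph was acyclic
}
```

The real ListScheduler selects the best ready candidate via GetPriority rather than taking the queue front; only the readiness bookkeeping is shown here.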
+ for (HloInstruction* user : best->users()) { + update_pred_count(user); + } + for (HloInstruction* succ : best->control_successors()) { + update_pred_count(succ); } } CHECK_EQ(schedule.size(), computation_.instructions().size()); @@ -327,6 +458,113 @@ class ListScheduler { std::unordered_set scheduled_instructions_; }; +int64 SumLogicalBufferSizes(const std::vector& buffers, + const LogicalBuffer::SizeFunction& size_function) { + int64 size = 0; + for (const LogicalBuffer* buffer : buffers) { + size += size_function(*buffer); + } + return size; +} + +StatusOr> RunDFSMemoryScheduler( + const HloComputation& computation, + const TuplePointsToAnalysis& points_to_analysis, + const LogicalBuffer::SizeFunction& size_function) { + // This ordering is based on DFS post-order, with a heuristic to decide which + // operand to visit first. The heuristic is based on 'extra_users', which is + // simply users-1 for each instruction. By subtracting 1, we're saying that + // instructions with no users or a single user don't count; instructions with + // lots of fan-out will be visited earlier. + tensorflow::gtl::FlatMap extra_users; + tensorflow::gtl::FlatMap total_sizes; + for (const HloInstruction* hlo : computation.MakeInstructionPostOrder()) { + extra_users[hlo] = hlo->users().empty() ? 0 : hlo->users().size() - 1; + total_sizes[hlo] = SumLogicalBufferSizes( + points_to_analysis.GetBuffersDefinedByInstruction(hlo), size_function); + tensorflow::gtl::FlatSet unique_operands( + hlo->operands().begin(), hlo->operands().end()); + for (const HloInstruction* operand : unique_operands) { + extra_users[hlo] += extra_users[operand]; + total_sizes[hlo] += total_sizes[operand]; + } + } + CHECK_EQ(extra_users.size(), computation.instructions().size()); + CHECK_EQ(total_sizes.size(), computation.instructions().size()); + + // Construct a total order based on DFS post-order, visiting operands in + // decreasing cumulative extra user order, and next by cumulative size, with a + // tiebreaker by name for determinism. + std::vector sequence; + FunctionVisitor visitor([&sequence](HloInstruction* hlo) { + sequence.push_back(hlo); + return Status::OK(); + }); + TF_RETURN_IF_ERROR(computation.AcceptWithOperandOrder( + &visitor, [&extra_users, &total_sizes](const HloInstruction* a, + const HloInstruction* b) { + if (extra_users[a] != extra_users[b]) { + return extra_users[a] > extra_users[b]; + } + if (total_sizes[a] != total_sizes[b]) { + return total_sizes[a] > total_sizes[b]; + } + return a->name() < b->name(); + })); + CHECK_EQ(sequence.size(), computation.instructions().size()); + return sequence; +} + +StatusOr MinimumMemoryForComputation( + const HloComputation& computation, + const std::vector& sequence, + const TuplePointsToAnalysis& points_to_analysis, + const LogicalBuffer::SizeFunction& size_function) { + TF_ASSIGN_OR_RETURN( + HeapSimulator::Result result, + HeapSimulator::Run(MakeUnique(), computation, + sequence, points_to_analysis, size_function)); + return result.heap_size; +} + +StatusOr> CreateMemoryMinimizingSequence( + const HloComputation& computation, + const TuplePointsToAnalysis& points_to_analysis, + const LogicalBuffer::SizeFunction& size_function) { + // We try both a list-scheduler based ordering and a DFS based ordering, and + // choose whichever returns a lower min-memory, not accounting for + // fragmentation. + // + // Note that this is just a heuristic. 
One obvious inaccuracy is that the + // memory required for sub-computations might be different when considered + // within the caller's context. But it's good enough for now. + TF_ASSIGN_OR_RETURN( + std::vector list_sequence, + ListScheduler::Run(computation, points_to_analysis, size_function)); + TF_ASSIGN_OR_RETURN( + const int64 list_memory, + MinimumMemoryForComputation(computation, list_sequence, + points_to_analysis, size_function)); + VLOG(2) << "Min-memory list sequence: " << list_memory << " bytes"; + + TF_ASSIGN_OR_RETURN( + std::vector dfs_sequence, + RunDFSMemoryScheduler(computation, points_to_analysis, size_function)); + TF_ASSIGN_OR_RETURN( + const int64 dfs_memory, + MinimumMemoryForComputation(computation, dfs_sequence, points_to_analysis, + size_function)); + VLOG(2) << "Min-memory dfs sequence: " << dfs_memory << " bytes"; + + if (list_memory <= dfs_memory) { + VLOG(2) << "Chose min-memory list sequence: " << list_memory << " bytes"; + return list_sequence; + } else { + VLOG(2) << "Chose min-memory dfs sequence: " << dfs_memory << " bytes"; + return dfs_sequence; + } +} + } // namespace StatusOr @@ -335,16 +573,23 @@ CreateMemoryMinimizingSequence( SequentialHloOrdering::HloModuleSequence sequence; TF_ASSIGN_OR_RETURN(std::unique_ptr points_to_analysis, TuplePointsToAnalysis::Run(&module)); - - for (auto& computation : module.computations()) { - TF_ASSIGN_OR_RETURN( - sequence[computation.get()], - ListScheduler::Run(*computation, *points_to_analysis, size_function)); + for (const auto& computation : module.computations()) { + TF_ASSIGN_OR_RETURN(sequence[computation.get()], + CreateMemoryMinimizingSequence( + *computation, *points_to_analysis, size_function)); } - return sequence; } +StatusOr> CreateMemoryMinimizingSequence( + const HloComputation& computation, + const LogicalBuffer::SizeFunction& size_function) { + TF_ASSIGN_OR_RETURN(std::unique_ptr points_to_analysis, + TuplePointsToAnalysis::Run(computation.parent())); + return CreateMemoryMinimizingSequence(computation, *points_to_analysis, + size_function); +} + std::ostream& operator<<( std::ostream& out, const SequentialHloOrdering::HloModuleSequence& module_sequence) { diff --git a/tensorflow/compiler/xla/service/hlo_ordering.h b/tensorflow/compiler/xla/service/hlo_ordering.h index 97f7c6060b8..b59e1ea5eb0 100644 --- a/tensorflow/compiler/xla/service/hlo_ordering.h +++ b/tensorflow/compiler/xla/service/hlo_ordering.h @@ -20,6 +20,8 @@ limitations under the License. #include #include +#include "tensorflow/compiler/xla/service/call_graph.h" +#include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h" @@ -31,19 +33,43 @@ limitations under the License. namespace xla { -// Abstract base class for describing a partial ordering of HLO -// instructions. Used to determine live range overlap of HLO instruction output -// buffers. +// Base class for describing a partial ordering of HLO instructions. Used to +// determine live range overlap of HLO instruction output buffers. class HloOrdering { public: - HloOrdering() = default; + HloOrdering(const HloModule* module) + : module_(module), call_graph_(CallGraph::Build(module)) {} virtual ~HloOrdering() = default; // Returns true if instruction 'a' executes before instruction 'b'. This is // not reflexive, that is, an instruction does not execute before itself. 
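Looking back at CreateMemoryMinimizingSequence in hlo_ordering.cc above, the list-versus-DFS decision is a plain arg-min over one cost model. A hedged sketch of that selection step, with hypothetical stand-ins for the two schedulers and the heap-simulator cost:

```cpp
#include <utility>

// Keep whichever candidate the cost model says needs less memory; ties go to
// the list-based schedule, as in CreateMemoryMinimizingSequence above.
template <typename Sequence, typename CostFn>
Sequence PickCheaperSchedule(Sequence list_based, Sequence dfs_based,
                             CostFn cost) {
  return cost(list_based) <= cost(dfs_based) ? std::move(list_based)
                                             : std::move(dfs_based);
}
```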
- virtual bool ExecutesBefore(const HloInstruction* a, - const HloInstruction* b) const = 0; + bool ExecutesBefore(const HloInstruction* a, const HloInstruction* b) const; + + // Returns the sequential instruction order for the given computation, or + // nullptr if the computation does not have a sequential ordering. + virtual const std::vector<const HloInstruction*>* SequentialOrder( + const HloComputation& computation) const = 0; + virtual string ToString() const = 0; + + // Returns the serialized representation of this ordering. + // Only sequential computation orders are represented. + HloOrderingProto ToProto() const; + + protected: + // Returns true if instruction 'a' executes before instruction 'b'. + // Precondition: 'a' and 'b' are in the same computation. + // + // Derived classes should implement this method to determine the order of + // instructions in the same computation. ExecutesBefore() analyzes the + // callgraph and uses this method to determine ordering of instructions in + // different computations. + virtual bool ExecutesBeforeInSameComputation( + const HloInstruction* a, const HloInstruction* b) const = 0; + + const HloModule* module_; + + std::unique_ptr<CallGraph> call_graph_; }; // Base class for partial orderings implemented by a map of strict predecessors @@ -52,20 +78,23 @@ class PredecessorHloOrdering : public HloOrdering { public: ~PredecessorHloOrdering() override = default; - // Returns true if instruction 'a' executes before instruction 'b'. - // Instructions in different computations are not ordered. - bool ExecutesBefore(const HloInstruction* a, - const HloInstruction* b) const override; + // Returns nullptr indicating the computation does not have a sequential + // ordering. + const std::vector<const HloInstruction*>* SequentialOrder( + const HloComputation& computation) const override { + return nullptr; + } protected: explicit PredecessorHloOrdering(const HloModule* module); string ToStringHelper(const string& name) const; - const HloModule* module_; + bool ExecutesBeforeInSameComputation(const HloInstruction* a, + const HloInstruction* b) const override; - // For each each computation in the module, this is the set of the - // instruction's strict predecessors. An instruction is not an element of its - // own strict predecessor set. + // For each computation in the module, this is the set of the instruction's + // strict predecessors. An instruction is not an element of its own strict + // predecessor set. + // + // Subclasses should fill this in to define the desired ordering. tensorflow::gtl::FlatMap* SequentialOrder( + const HloComputation& computation) const override; + string ToString() const override; protected: - const HloModule* module_; + bool ExecutesBeforeInSameComputation(const HloInstruction* a, + const HloInstruction* b) const override; + + const HloModuleSequence module_sequence_; // The position of every instruction in the HLO module in its respective // computation sequence (a value of zero indicates the instruction is first in @@ -156,6 +187,16 @@ class SequentialHloOrdering : public HloOrdering { tensorflow::gtl::FlatMap<const HloInstruction*, int> order_position_; }; +std::ostream& operator<<( + std::ostream& out, + const SequentialHloOrdering::HloModuleSequence& module_sequence); + +// Returns the minimum memory required to compute the given module sequence, +// assuming no fragmentation. 
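A hypothetical caller of the declaration that follows, patterned on the MinimumMemoryForSequenceTest later in this change. This fragment assumes the surrounding XLA types and is not a standalone program; the size function models 8-byte pointers inside tuples:

```cpp
// Sketch only: cost a hand-built per-computation schedule.
auto size_fn = [](const LogicalBuffer& buffer) {
  return ShapeUtil::ByteSizeOf(buffer.shape(), /*pointer_size=*/8);
};
SequentialHloOrdering::HloModuleSequence module_sequence;
// ... fill module_sequence[computation] with instruction orders ...
int64 min_bytes =
    MinimumMemoryForSequence(module_sequence, size_fn).ValueOrDie();
```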
+StatusOr MinimumMemoryForSequence( + const SequentialHloOrdering::HloModuleSequence& module_sequence, + const LogicalBuffer::SizeFunction& size_function); + // Returns an HloModuleSequence which seeks to minimize the memory required for // the computation. size_function is the function returning the number of bytes // required for a LogicalBuffer. @@ -163,9 +204,10 @@ StatusOr CreateMemoryMinimizingSequence( const HloModule& module, const LogicalBuffer::SizeFunction& size_function); -std::ostream& operator<<( - std::ostream& out, - const SequentialHloOrdering::HloModuleSequence& module_sequence); +// Overload of above that computes the sequence for a single computation. +StatusOr> CreateMemoryMinimizingSequence( + const HloComputation& computation, + const LogicalBuffer::SizeFunction& size_function); } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_ordering_test.cc b/tensorflow/compiler/xla/service/hlo_ordering_test.cc index 425bee601a8..21d852a51d6 100644 --- a/tensorflow/compiler/xla/service/hlo_ordering_test.cc +++ b/tensorflow/compiler/xla/service/hlo_ordering_test.cc @@ -58,26 +58,166 @@ TEST_F(HloOrderingTest, LastUseScheduledFirst) { auto sub = builder.AddInstruction( HloInstruction::CreateBinary(vec, HloOpcode::kSubtract, add, negate)); - HloModule module(TestName()); - module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); TF_ASSIGN_OR_ASSERT_OK( SequentialHloOrdering::HloModuleSequence sequence, - CreateMemoryMinimizingSequence(module, [](const LogicalBuffer& buffer) { + CreateMemoryMinimizingSequence(*module, [](const LogicalBuffer& buffer) { return ShapeUtil::ByteSizeOf(buffer.shape()); })); // Verify that all instructions are in the sequence. - EXPECT_EQ(module.entry_computation()->instruction_count(), - sequence.at(module.entry_computation()).size()); + EXPECT_EQ(module->entry_computation()->instruction_count(), + sequence.at(module->entry_computation()).size()); // The first instruction should be the parameter and the last the root "sub". - EXPECT_EQ(param, sequence.at(module.entry_computation()).front()); - EXPECT_EQ(sub, sequence.at(module.entry_computation()).back()); + EXPECT_EQ(param, sequence.at(module->entry_computation()).front()); + EXPECT_EQ(sub, sequence.at(module->entry_computation()).back()); - SequentialHloOrdering ordering(&module, sequence); + SequentialHloOrdering ordering(module.get(), sequence); EXPECT_TRUE(ordering.ExecutesBefore(add, negate)); } +TEST_F(HloOrderingTest, InstructionsInDifferentComputations) { + // Tests the ordering of instructions in different computations using the + // following HLO code: + // + // Entry computation: + // %x = Call(A, {}) + // %y = Call(B, {%x}) + // + // Computation A: + // %a = Call(C, {}) + // + // Computation B: + // %b = Call(C, {}) + // + // Computation C: + // %c = Constant(42.0f) + // + // This results in a diamond-shaped callgraph. 
+ auto module = CreateNewModule(); + const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {}); + + auto builder_c = HloComputation::Builder("C"); + HloInstruction* c = builder_c.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); + HloComputation* computation_c = + module->AddEmbeddedComputation(builder_c.Build()); + + auto builder_b = HloComputation::Builder("B"); + builder_b.AddInstruction( + HloInstruction::CreateParameter(0, scalar_shape, "param")); + HloInstruction* b = builder_b.AddInstruction( + HloInstruction::CreateCall(scalar_shape, {}, computation_c)); + HloComputation* computation_b = + module->AddEmbeddedComputation(builder_b.Build()); + + auto builder_a = HloComputation::Builder("A"); + HloInstruction* a = builder_a.AddInstruction( + HloInstruction::CreateCall(scalar_shape, {}, computation_c)); + HloComputation* computation_a = + module->AddEmbeddedComputation(builder_a.Build()); + + auto builder = HloComputation::Builder(TestName()); + HloInstruction* x = builder.AddInstruction( + HloInstruction::CreateCall(scalar_shape, {}, computation_a)); + HloInstruction* y = builder.AddInstruction( + HloInstruction::CreateCall(scalar_shape, {x}, computation_b)); + module->AddEntryComputation(builder.Build()); + + DependencyHloOrdering ordering(module.get()); + EXPECT_TRUE(ordering.ExecutesBefore(x, y)); + EXPECT_FALSE(ordering.ExecutesBefore(y, x)); + + EXPECT_TRUE(ordering.ExecutesBefore(a, b)); + EXPECT_FALSE(ordering.ExecutesBefore(b, a)); + + EXPECT_FALSE(ordering.ExecutesBefore(a, x)); + EXPECT_TRUE(ordering.ExecutesBefore(a, y)); + EXPECT_FALSE(ordering.ExecutesBefore(x, a)); + EXPECT_FALSE(ordering.ExecutesBefore(y, a)); + + EXPECT_FALSE(ordering.ExecutesBefore(b, x)); + EXPECT_FALSE(ordering.ExecutesBefore(b, y)); + EXPECT_TRUE(ordering.ExecutesBefore(x, b)); + EXPECT_FALSE(ordering.ExecutesBefore(y, b)); + + // Instruction 'c' is called from multiple callsites and should be unordered + // relative to all other instructions in the module. 
+ EXPECT_FALSE(ordering.ExecutesBefore(c, a)); + EXPECT_FALSE(ordering.ExecutesBefore(c, b)); + EXPECT_FALSE(ordering.ExecutesBefore(c, x)); + EXPECT_FALSE(ordering.ExecutesBefore(c, y)); + EXPECT_FALSE(ordering.ExecutesBefore(a, c)); + EXPECT_FALSE(ordering.ExecutesBefore(b, c)); + EXPECT_FALSE(ordering.ExecutesBefore(x, c)); + EXPECT_FALSE(ordering.ExecutesBefore(y, c)); +} + +class MinimumMemoryForSequenceTest : public HloTestBase {}; + +TEST_F(MinimumMemoryForSequenceTest, MultiComputation) { + auto module = CreateNewModule(); + const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {}); + const Shape tuple_shape = + ShapeUtil::MakeTupleShape({scalar_shape, scalar_shape}); + + auto cond_builder = HloComputation::Builder("WhileCond"); + // Tuple param: 24 bytes (each elem has 8 byte pointer, 4 byte element) + HloInstruction* cond_param = cond_builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape, "cond_param")); + HloInstruction* cond_iter = cond_builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape, cond_param, 0)); + HloInstruction* cond_data = cond_builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape, cond_param, 1)); + // Free cond_param[] (16 bytes), Alloc PRED[] (1 byte) + HloInstruction* cond_lt = cond_builder.AddInstruction( + HloInstruction::CreateBinary(ShapeUtil::MakeShape(PRED, {}), + HloOpcode::kLt, cond_iter, cond_data)); + HloComputation* cond_computation = + module->AddEmbeddedComputation(cond_builder.Build()); + + auto body_builder = HloComputation::Builder("WhileBody"); + // Tuple param: 24 bytes (each elem has 8 byte pointer, 4 byte element) + HloInstruction* body_param = body_builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape, "body_param")); + HloComputation* body_computation = + module->AddEmbeddedComputation(body_builder.Build()); + + auto builder = HloComputation::Builder(TestName()); + // Entry params: 8 bytes (4 bytes per param), TOTAL=8 + HloInstruction* iter = builder.AddInstruction( + HloInstruction::CreateParameter(0, scalar_shape, "param_iter")); + HloInstruction* data = builder.AddInstruction( + HloInstruction::CreateParameter(1, scalar_shape, "param_data")); + // Tuple: 16 bytes (8 bytes per pointer), TOTAL=24 + HloInstruction* tuple = + builder.AddInstruction(HloInstruction::CreateTuple({iter, data})); + // While: 8 bytes (4 bytes per element), TOTAL=32 + // Both cond and body use a max of 24 bytes, TOTAL=56 + HloInstruction* while_op = builder.AddInstruction(HloInstruction::CreateWhile( + tuple_shape, cond_computation, body_computation, tuple)); + HloComputation* entry_computation = + module->AddEntryComputation(builder.Build()); + + auto size_fn = [](const LogicalBuffer& buffer) { + return ShapeUtil::ByteSizeOf(buffer.shape(), /*pointer_size=*/8); + }; + + SequentialHloOrdering::HloModuleSequence module_sequence; + module_sequence[cond_computation] = {cond_param, cond_iter, cond_data, + cond_lt}; + module_sequence[body_computation] = {body_param}; + module_sequence[entry_computation] = {iter, data, tuple, while_op}; + EXPECT_EQ(56, + MinimumMemoryForSequence(module_sequence, size_fn).ValueOrDie()); +} + } // namespace } // namespace xla + +int main(int argc, char** argv) { + return xla::ParseDebugOptionsFlagsAndRunTests(argc, argv); +} diff --git a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc index 91468fd35b0..119e2d79022 100644 --- a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc +++ 
b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc @@ -17,7 +17,6 @@ limitations under the License. #include -#include "tensorflow/compiler/xla/legacy_flags/hlo_pass_pipeline_flags.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" @@ -26,6 +25,8 @@ limitations under the License. #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" +using ::tensorflow::strings::StrAppend; + namespace xla { namespace { @@ -38,32 +39,52 @@ void DumpModule(const Compiler::HloDumper& dumper_, const HloModule& module, } // namespace StatusOr<bool> HloPassPipeline::Run(HloModule* module) { - legacy_flags::HloPassPipelineFlags* flags = - legacy_flags::GetHloPassPipelineFlags(); - std::vector<string> tmp = - tensorflow::str_util::Split(flags->xla_disable_hlo_passes, ','); - tensorflow::gtl::FlatSet<string> disabled_passes(tmp.begin(), tmp.end()); + run_called_ = true; + + VLOG(1) << "Running HLO pass pipeline " << name(); + + auto repeated_field = + module->config().debug_options().xla_disable_hlo_passes(); + tensorflow::gtl::FlatSet<string> disabled_passes(repeated_field.begin(), + repeated_field.end()); + if (!disabled_passes.empty()) { + VLOG(1) << "Passes disabled by --xla_disable_hlo_passes: " + << tensorflow::str_util::Join(disabled_passes, ", "); + } + + auto run_invariant_checkers = [this, module]() -> Status { + for (auto& invariant_checker : invariant_checkers_) { + TF_ASSIGN_OR_RETURN(bool changed, invariant_checker->Run(module)); + TF_RET_CHECK(!changed) << "invariant checkers must not change the graph"; + } + return Status::OK(); + }; string prefix = name().ToString() + ": pipeline start"; bool changed = false; string message; for (auto& pass : passes_) { - if (!disabled_passes.empty() && - disabled_passes.count(pass->name().ToString()) > 0) { + if (disabled_passes.count(pass->name().ToString()) > 0) { + VLOG(1) << " Skipping HLO pass " << pass->name() + << ", disabled by --xla_disable_hlo_passes"; continue; } + VLOG(1) << " HLO pass " << pass->name(); + // Emit label containing: "after foo-pass, before bar-pass". message.clear(); - tensorflow::strings::StrAppend(&message, prefix, ", before ", pass->name()); + StrAppend(&message, prefix, ", before ", pass->name()); DumpModule(dumper_, *module, message); + TF_RETURN_IF_ERROR(run_invariant_checkers()); TF_ASSIGN_OR_RETURN(bool changed_this_pass, pass->Run(module)); changed |= changed_this_pass; prefix.clear(); - tensorflow::strings::StrAppend(&prefix, name(), ": after ", pass->name()); + StrAppend(&prefix, name(), ": after ", pass->name()); } + TF_RETURN_IF_ERROR(run_invariant_checkers()); DumpModule(dumper_, *module, prefix + ", pipeline end"); return changed; } diff --git a/tensorflow/compiler/xla/service/hlo_pass_pipeline.h b/tensorflow/compiler/xla/service/hlo_pass_pipeline.h index 7a9c606a487..682c4b952df 100644 --- a/tensorflow/compiler/xla/service/hlo_pass_pipeline.h +++ b/tensorflow/compiler/xla/service/hlo_pass_pipeline.h @@ -47,11 +47,23 @@ class HloPassPipeline : public HloPassInterface { // Returns a reference to the added pass. template <typename T, typename... Args> T& AddPass(Args&&... args) { + CHECK(!run_called_) << "AddPass cannot be called after Run"; auto pass = new T(std::forward<Args>(args)...); passes_.push_back(std::unique_ptr<T>(pass)); return *pass; } + // Add an invariant-checking pass to the pipeline. It will be run before and + // after each HLO pass.
The invariant checking pass must not mutate the graph + // (it is required to always return "false" from its Run() method). + template <typename T, typename... Args> + T& AddInvariantChecker(Args&&... args) { + CHECK(!run_called_) << "AddInvariantChecker cannot be called after Run"; + auto pass = new T(std::forward<Args>(args)...); + invariant_checkers_.push_back(std::unique_ptr<T>(pass)); + return *pass; + } + // Run all passes on the given HLO module. StatusOr<bool> Run(HloModule* module) override; @@ -59,6 +71,8 @@ class HloPassPipeline : public HloPassInterface { const string name_; Compiler::HloDumper dumper_; std::vector<std::unique_ptr<HloPassInterface>> passes_; + std::vector<std::unique_ptr<HloPassInterface>> invariant_checkers_; + bool run_called_ = false; TF_DISALLOW_COPY_AND_ASSIGN(HloPassPipeline); }; diff --git a/tensorflow/compiler/xla/service/hlo_proto_util.cc b/tensorflow/compiler/xla/service/hlo_proto_util.cc new file mode 100644 index 00000000000..727ad0178c6 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_proto_util.cc @@ -0,0 +1,33 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_proto_util.h" + +namespace xla { + +HloProto MakeHloProto(const HloModule& module, + const BufferAssignment& assignment) { + HloModuleProto proto_module = module.ToProto(); + HloOrderingProto proto_ordering = + assignment.liveness().hlo_ordering().ToProto(); + BufferAssignmentProto proto_assignment = assignment.ToProto(); + HloProto proto; + proto.mutable_hlo_module()->Swap(&proto_module); + proto.mutable_hlo_ordering()->Swap(&proto_ordering); + proto.mutable_buffer_assignment()->Swap(&proto_assignment); + return proto; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_proto_util.h b/tensorflow/compiler/xla/service/hlo_proto_util.h new file mode 100644 index 00000000000..603259a11fc --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_proto_util.h @@ -0,0 +1,36 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Utilities to manipulate data in hlo.proto.
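A hedged usage sketch for the MakeHloProto helper defined above: a backend holding a module and a completed BufferAssignment could serialize the combined state to disk. The wrapper name, the file path, and the choice of tensorflow::WriteBinaryProto are illustrative assumptions, not part of this change:

```c++
#include "tensorflow/compiler/xla/service/hlo_proto_util.h"
#include "tensorflow/core/platform/env.h"

// Hypothetical helper: snapshot the HLO module, ordering, and buffer
// assignment into a single binary proto file.
tensorflow::Status DumpHloSnapshot(const xla::HloModule& module,
                                   const xla::BufferAssignment& assignment,
                                   const string& path) {
  xla::HloProto proto = xla::MakeHloProto(module, assignment);
  return tensorflow::WriteBinaryProto(tensorflow::Env::Default(), path, proto);
}
```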
+ +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PROTO_UTIL_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PROTO_UTIL_H_ + +#include + +#include "tensorflow/compiler/xla/service/buffer_assignment.h" +#include "tensorflow/compiler/xla/service/hlo.pb.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/status.h" + +namespace xla { + +// Returns a serialized representation of the HLO state. +HloProto MakeHloProto(const HloModule& module, + const BufferAssignment& assignment); + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PROTO_UTIL_H_ diff --git a/tensorflow/compiler/xla/service/hlo_query.cc b/tensorflow/compiler/xla/service/hlo_query.cc index 1556d1772f9..a153d73dbd8 100644 --- a/tensorflow/compiler/xla/service/hlo_query.cc +++ b/tensorflow/compiler/xla/service/hlo_query.cc @@ -32,6 +32,16 @@ bool IsConstantR0F32(HloInstruction* instruction, float* out) { return false; } +bool AllOperandsAreParametersOrConstants(const HloInstruction& instruction) { + for (const auto& operand : instruction.operands()) { + if (operand->opcode() != HloOpcode::kParameter && + operand->opcode() != HloOpcode::kConstant) { + return false; + } + } + return true; +} + bool AllOperandsAreParameters(const HloInstruction& instruction) { for (const auto& operand : instruction.operands()) { if (operand->opcode() != HloOpcode::kParameter) { @@ -41,6 +51,15 @@ bool AllOperandsAreParameters(const HloInstruction& instruction) { return true; } +bool AllOperandsAreConstants(const HloInstruction& instruction) { + for (const auto& operand : instruction.operands()) { + if (operand->opcode() != HloOpcode::kConstant) { + return false; + } + } + return true; +} + HloInstruction* GetMatchingOperand( std::function<bool(const HloInstruction*)> matcher, HloInstruction* instruction) { diff --git a/tensorflow/compiler/xla/service/hlo_query.h b/tensorflow/compiler/xla/service/hlo_query.h index 864f892e920..c79347bbf9d 100644 --- a/tensorflow/compiler/xla/service/hlo_query.h +++ b/tensorflow/compiler/xla/service/hlo_query.h @@ -28,9 +28,16 @@ namespace hlo_query { // Precondition: out != nullptr bool IsConstantR0F32(HloInstruction* instruction, float* out); +// Returns whether each of an instruction's operands is either a constant or a +// parameter. +bool AllOperandsAreParametersOrConstants(const HloInstruction& instruction); + // Returns whether all of an instruction's operands are parameters. bool AllOperandsAreParameters(const HloInstruction& instruction); +// Returns whether all of an instruction's operands are constants. +bool AllOperandsAreConstants(const HloInstruction& instruction); + // Returns whether the instruction is a scalar constant. bool IsScalarConstant(const HloInstruction* instruction); diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.cc b/tensorflow/compiler/xla/service/hlo_rematerialization.cc new file mode 100644 index 00000000000..2c1b0fff4e6 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_rematerialization.cc @@ -0,0 +1,1294 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_rematerialization.h" + +#include +#include +#include +#include + +#include "tensorflow/compiler/xla/map_util.h" +#include "tensorflow/compiler/xla/primitive_util.h" +#include "tensorflow/compiler/xla/service/flatten_call_graph.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_dce.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_ordering.h" +#include "tensorflow/compiler/xla/service/liveness_util.h" +#include "tensorflow/compiler/xla/service/logical_buffer.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/lib/strings/stringprintf.h" +#include "tensorflow/core/platform/logging.h" + +using ::tensorflow::strings::HumanReadableNumBytes; + +namespace xla { + +namespace { + +// Returns true if the given instruction is rematerializable. +bool IsRematerializable(const HloInstruction* instruction) { + // Conservatively, don't rematerialize instructions with control + // dependencies. For one, control dependencies are added to prevent + // interference of aliased buffers (say, in while bodies) and + // rematerialization is ignorant of liveness and may break the intended + // ordering. + if (!instruction->control_predecessors().empty() || + !instruction->control_successors().empty()) { + return false; + } + + // Don't rematerialize instructions with side effects, those with a cost that + // might not be captured by HloCostAnalysis, or instructions which cannot be + // cloned safely. + switch (instruction->opcode()) { + case HloOpcode::kCall: + case HloOpcode::kConstant: + case HloOpcode::kCrossReplicaSum: + case HloOpcode::kCustomCall: + case HloOpcode::kOutfeed: + case HloOpcode::kInfeed: + case HloOpcode::kParameter: + case HloOpcode::kRecv: + case HloOpcode::kSend: + case HloOpcode::kTrace: + case HloOpcode::kWhile: + return false; + default: + return true; + } +} + +// Class which maintains an ordered list of instructions with fast insertion +// before arbitrary elements. +class InstructionList { + public: + explicit InstructionList(const std::vector<const HloInstruction*> order) { + int64 position = 0; + for (const HloInstruction* inst : order) { + instructions_.push_back(const_cast<HloInstruction*>(inst)); + instruction_iterators_.insert({const_cast<HloInstruction*>(inst), + std::next(instructions_.end(), -1)}); + // Initially position numbers are uniquely assigned in order. Later as + // instructions are added with InsertBefore* methods, some instructions + // may have duplicate position numbers, but the values are guaranteed + // to remain monotonically increasing through the list, so the numbering + // is still useful for quickly(-ish) determining the order of arbitrary + // instructions in the list. For example, inserting x before c in a list + // with positions {a: 0, b: 1, c: 2} gives x position 2 as well. + position_number_[inst] = position; + first_at_position_[position] = inst; + position++; + } + } + + // Returns the list of instructions.
+ const std::list<HloInstruction*>& instructions() const { + return instructions_; + } + + // Insert instruction 'to_insert' immediately before instruction 'before' in + // the list. + void InsertBefore(HloInstruction* to_insert, HloInstruction* before) { + VLOG(3) << "InsertBefore: " << to_insert->name() << " before " + << before->name(); + auto it = instruction_iterators_.find(before); + CHECK(it != instruction_iterators_.end()); + instruction_iterators_.insert( + {to_insert, instructions_.insert(it->second, to_insert)}); + // Assign the same position number to the newly added instruction as + // 'before'. This guarantees monotonicity of the position numbers, but not + // uniqueness. + int64 pos = position_number_.at(before); + position_number_[to_insert] = pos; + if (first_at_position_.at(pos) == before) { + first_at_position_[pos] = to_insert; + } + } + + // Insert instruction 'to_insert' immediately before the earliest instruction + // in 'before_instructions'. + void InsertBeforeInstructions( + HloInstruction* to_insert, + tensorflow::gtl::ArraySlice<HloInstruction*> before_instructions) { + VLOG(3) << "InsertBeforeInstructions: " << to_insert->name() << " before {" + << tensorflow::str_util::Join( + before_instructions, ", ", + [](string* out, HloInstruction* inst) { + tensorflow::strings::StrAppend(out, inst->name()); + }) + << "}"; + + // Find the minimal position number of any instruction in + // 'before_instructions'. + CHECK(!before_instructions.empty()); + int64 min_position_number = std::numeric_limits<int64>::max(); + for (const HloInstruction* instruction : before_instructions) { + min_position_number = + std::min(min_position_number, position_number_.at(instruction)); + } + + // Because more than one instruction in 'before_instructions' may have a + // position number of 'min_position_number', find the first such instruction + // with position number 'min_position_number'. + for (auto it = instruction_iterators_.at( + first_at_position_.at(min_position_number)); + it != instructions_.end() && + position_number_.at(*it) == min_position_number; + ++it) { + if (std::find(before_instructions.begin(), before_instructions.end(), + *it) != before_instructions.end()) { + return InsertBefore(to_insert, *it); + } + } + LOG(FATAL) << "Expected to find instruction in before_instructions with " + "position number " + << min_position_number; + } + + private: + // List of instructions. + std::list<HloInstruction*> instructions_; + + // Iterators for each instruction in the list. + tensorflow::gtl::FlatMap<const HloInstruction*, std::list<HloInstruction*>::iterator> + instruction_iterators_; + + // A number assigned to each instruction which increases monotonically through + // 'instructions_'. Used to facilitate fast insertion of an instruction before + // the earliest instruction in a set of instructions + // (InsertBeforeInstructions) by enabling fast-ish ordering queries between + // instructions. If position_number_[a] < position_number_[b] then 'a' comes + // before 'b' in the list. If the position numbers are the same then nothing + // can be said about their order without examining the list. + // + // On object construction this value is precisely the instruction's ordinal + // position in the list. Instructions inserted via InsertBefore receive + // duplicate values. However, monotonicity is preserved. + tensorflow::gtl::FlatMap<const HloInstruction*, int64> position_number_; + + // The first instruction in the list assigned a particular position number. + tensorflow::gtl::FlatMap<int64, const HloInstruction*> first_at_position_; +}; + +// Return the HloInstructions which use the given LogicalBuffer.
Sets +// has_indirect_users to whether any of the uses is indirect. A use is indirect +// if the instruction defining logical_buffer is not an operand of the use. This +// can happen via buffer aliasing (eg, tuples). +std::vector<const HloInstruction*> GetUsers( + const LogicalBuffer* logical_buffer, + const TuplePointsToAnalysis& points_to_analysis, bool* has_indirect_users) { + std::vector<const HloInstruction*> users; + // To identify uses iterate through all HloInstruction users of the + // BufferAliases of the logical buffer. + *has_indirect_users = false; + for (const BufferAlias& buffer_alias : + points_to_analysis.GetBufferAliases(*logical_buffer)) { + for (const HloInstruction* user : buffer_alias.instruction()->users()) { + if (DoesNotUseOperandBuffer(buffer_alias.instruction(), + buffer_alias.index(), user, + points_to_analysis)) { + // The alias may be an operand of 'user', but the LogicalBuffer cannot + // possibly be used by the instruction so ignore 'user'. This is the + // case, for example, for the tuple element buffers in a GetTupleElement + // instruction (the GTE instruction only uses the pointer vector). + continue; + } + if (buffer_alias.instruction() != logical_buffer->instruction()) { + *has_indirect_users = true; + } + // A buffer may be used by the instruction via more than one alias. For + // example, a buffer which appears in more than one element of a tuple. + if (std::find(users.begin(), users.end(), user) == users.end()) { + users.push_back(user); + } + } + } + return users; +} + +// Class for tracking memory usage of a computation as the instructions are +// placed sequentially. Memory usage is the sum of the sizes of live values +// (LogicalBuffers) at the current point in the instruction sequence. +class MemoryUsageTracker { + public: + MemoryUsageTracker( + const HloComputation* computation, + const HloRematerialization::ShapeSizeFunction& size_function, + const TuplePointsToAnalysis& points_to_analysis, + const InstructionList& instruction_list); + + // Starts the placement of the given instruction. This adds the sizes of the + // LogicalBuffers defined by the instruction to the current memory + // usage. Placement is broken into two steps (BeginInstruction and + // EndInstruction) to accurately model memory usage. At BeginInstruction the + // memory for the output value(s) of the current instruction is allocated. At + // EndInstruction memory for dead operand(s) is freed. + Status BeginInstruction(const HloInstruction* instruction); + + // Finishes the placement of the current instruction. This frees any dead + // operands or dead result of the instruction. This must be called after + // each call to BeginInstruction. + Status EndInstruction(); + + // Returns the number of bytes that the current memory usage will be reduced + // if the given instruction is rematerialized. + int64 MemoryReducedIfRematerialized(const HloInstruction* instruction) const; + + // Adjusts memory usage to account for the rematerialization of + // original_instruction for all remaining unplaced uses. The rematerialization + // is remat_instruction. This method should be called after the HLO graph has + // been transformed (rematerialization instruction created and connected to + // uses). + Status AddRematerializedInstruction(HloInstruction* original_instruction, + HloInstruction* remat_instruction); + + // Returns whether the given instruction has been placed (BeginInstruction + // has been called with 'instruction' as the argument).
+ bool IsPlaced(const HloInstruction* instruction) const { + return ContainsKey(placed_instructions_, instruction); + } + + // Returns the current memory usage. This is the sum of sizes of all live + // values. + int64 memory_usage() const { return memory_usage_; } + + // Returns the current instruction being placed. + const HloInstruction* in_progress_instruction() const { + return in_progress_instruction_; + } + + // Check invariants of the data structure. This is expensive to call. + bool Check() const; + + string ToString() const; + + private: + // Type holding a unique identifier for each Buffer object. + using BufferId = int64; + + // A Buffer represents a single LogicalBuffer in the computation including + // various metadata useful for tracking liveness of the value. A LogicalBuffer + // is not used directly because the HLO graph is transformed and + // TuplePointsToAnalysis which owns all LogicalBuffers cannot be updated after + // HLO graph transformations. + struct Buffer { + // The unique id of this Buffer. This value is equal to the buffer's index + // in the vector buffers_. + const BufferId id; + + // The instruction which defines this buffer. + const HloInstruction* defining_instruction; + + // The materialized size of the buffer in bytes. + const int64 size; + + // Whether this buffer is live-out of the computation. + bool live_out; + + // Whether this buffer has indirect uses. Ie, an instruction which is not a + // user of defining_instruction uses this buffer. This can occur due to + // buffer aliasing (eg, tuples). + bool has_indirect_uses; + + // The instructions which use this buffer. + std::vector<const HloInstruction*> users; + + // The number of users (HloInstructions) of this buffer which have not yet + // been placed in the sequence. + int64 unfinished_user_count; + + string ToString() const { + return tensorflow::strings::StrCat("Buffer ", id, " (defined by ", + defining_instruction->name(), + ", size ", size, " bytes)"); + } + }; + + // Creates a Buffer representing the given logical buffer. The buffer is added + // to buffers_ and a reference is returned. + Buffer& CreateBufferFromLogicalBuffer( + const LogicalBuffer* logical_buffer, + const TuplePointsToAnalysis& points_to_analysis, + const HloRematerialization::ShapeSizeFunction& size_function, + bool live_out) { + bool has_indirect_uses = false; + std::vector<const HloInstruction*> users = + GetUsers(logical_buffer, points_to_analysis, &has_indirect_uses); + return NewBuffer(logical_buffer->instruction(), + size_function(logical_buffer->shape()), std::move(users), + live_out, has_indirect_uses); + } + + // Create a new buffer representing a rematerialization of given buffer for + // the given uses. + Buffer& RematerializeBuffer( + const Buffer& original_buffer, const HloInstruction* remat_instruction, + std::vector<const HloInstruction*>&& rematerialized_uses) { + CHECK(IsPlaced(original_buffer.defining_instruction)); + CHECK(!original_buffer.has_indirect_uses); + CHECK(!original_buffer.live_out); + for (const HloInstruction* use : rematerialized_uses) { + CHECK(!IsPlaced(use)); + } + return NewBuffer(remat_instruction, original_buffer.size, + std::move(rematerialized_uses), /*live_out=*/false, + /*has_indirect_uses=*/false); + } + + // Return number of bytes allocated for the buffer with the given id. Buffers + // allocated by the calling computation (eg, parameter and output buffers) are + // considered to have zero bytes because the memory is accounted for in a + // different computation.
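To make the caller-side accounting described above concrete, an invented example (not from the source):

```c++
// Invented example: inside a computation F(p: f32[256]) -> f32[256] with
//   t = exp(p)      // F-internal temporary: AllocatedSize(t) == 1024 bytes
//   r = add(t, p)   // live-out result:      AllocatedSize(r) == 0
// the parameter p also reports AllocatedSize(p) == 0. The zero-byte entries
// are charged to F's caller, which is what the comment above describes.
```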
+ int64 AllocatedSize(BufferId buffer_id) const { + const Buffer& buffer = buffers_.at(buffer_id); + HloOpcode def_opcode = buffer.defining_instruction->opcode(); + if (buffer.live_out || def_opcode == HloOpcode::kParameter) { + return 0; + } else { + return buffer.size; + } + } + + // Returns true if BeginInstruction and EndInstruction have been called for the + // given instruction. + bool IsFinished(const HloInstruction* instruction) const { + return IsPlaced(instruction) && instruction != in_progress_instruction_; + } + + // Returns whether the given buffer is being used by the in-progress + // instruction. + bool IsInUse(BufferId buffer_id) const { + if (in_progress_instruction_ == nullptr) { + return false; + } + const std::vector<BufferId>& in_progress_uses = + buffers_used_by_instruction_.at(in_progress_instruction_); + return std::find(in_progress_uses.begin(), in_progress_uses.end(), + buffer_id) != in_progress_uses.end(); + } + + // Returns whether the given instruction is live at the current program + // point. + bool IsCurrentlyLive(BufferId buffer_id) const { + const Buffer& buffer = buffers_[buffer_id]; + return (IsPlaced(buffer.defining_instruction) && + buffer.unfinished_user_count > 0); + } + + // Create a new buffer, add it to buffers_, and return a reference. + Buffer& NewBuffer(const HloInstruction* defining_instruction, int64 size, + std::vector<const HloInstruction*>&& users, bool live_out, + bool has_indirect_uses) { + int buffer_id = buffers_.size(); + buffers_.push_back(Buffer{buffer_id, defining_instruction, size, live_out, + has_indirect_uses, users, + static_cast<int64>(users.size())}); + return buffers_.back(); + } + + const HloComputation* computation_; + + // Instruction list containing the ordering of instructions in + // computation_. This is the order in which instructions are placed + // (BeginInstruction/EndInstruction calls). + const InstructionList& instruction_list_; + + // Memory usage at the currently placed instruction. + int64 memory_usage_ = 0; + + // The instruction currently being placed. This value is non-null only + // between the calling of BeginInstruction and EndInstruction. + const HloInstruction* in_progress_instruction_ = nullptr; + + // The buffers defined by each instruction. + std::unordered_map<const HloInstruction*, std::vector<BufferId>> + buffers_defined_by_instruction_; + + // The buffers used by each instruction. + std::unordered_map<const HloInstruction*, std::vector<BufferId>> + buffers_used_by_instruction_; + + // The set of instructions which have been placed. That is, BeginInstruction + // has been called with the instruction as an argument. + tensorflow::gtl::FlatSet<const HloInstruction*> placed_instructions_; + + // All buffers in the computation. + std::vector<Buffer> buffers_; +}; + +MemoryUsageTracker::MemoryUsageTracker( + const HloComputation* computation, + const HloRematerialization::ShapeSizeFunction& size_function, + const TuplePointsToAnalysis& points_to_analysis, + const InstructionList& instruction_list) + : computation_(computation), instruction_list_(instruction_list) { + // Iterate through all LogicalBuffers in the computation and gather the + // instructions which define them in buffers_defined_by_instruction_ and the + // instructions which use them in buffers_used_by_instruction_. + for (auto& instruction : computation_->instructions()) { + // Initialize empty vectors for defs and uses of each instruction.
+ buffers_used_by_instruction_[instruction.get()]; + buffers_defined_by_instruction_[instruction.get()]; + } + + tensorflow::gtl::FlatSet<const LogicalBuffer*> live_out_set = + points_to_analysis.GetPointsToSet(computation_->root_instruction()) + .CreateFlattenedSet(); + tensorflow::gtl::FlatMap<const LogicalBuffer*, BufferId> + logical_buffer_to_buffer_id; + + for (const HloInstruction* instruction : instruction_list_.instructions()) { + for (const LogicalBuffer* logical_buffer : + points_to_analysis.GetBuffersDefinedByInstruction(instruction)) { + Buffer* buffer; + if (instruction->opcode() == HloOpcode::kWhile) { + // The while instruction defines no new buffers. Instead it reuses the + // buffers of its operand. Find the Buffer of its operand at the + // proper ShapeIndex. + const PointsToSet& operand_points_to = + points_to_analysis.GetPointsToSet(instruction->operand(0)); + CHECK_EQ(operand_points_to.element(logical_buffer->index()).size(), 1); + const LogicalBuffer* source_logical_buffer = + operand_points_to.element(logical_buffer->index())[0]; + buffer = + &buffers_.at(logical_buffer_to_buffer_id.at(source_logical_buffer)); + + // Mark the buffer as having indirect uses and (possibly) as live out. + buffer->has_indirect_uses = true; + buffer->live_out = + buffer->live_out || ContainsKey(live_out_set, logical_buffer); + + // Add the users of the while to the buffer's users. + bool unused; + for (const HloInstruction* user : + GetUsers(logical_buffer, points_to_analysis, &unused)) { + if (std::find(buffer->users.begin(), buffer->users.end(), user) == + buffer->users.end()) { + buffer->users.push_back(user); + buffer->unfinished_user_count++; + buffers_used_by_instruction_.at(user).push_back(buffer->id); + } + } + } else { + buffer = &CreateBufferFromLogicalBuffer( + logical_buffer, points_to_analysis, size_function, + ContainsKey(live_out_set, logical_buffer)); + buffers_defined_by_instruction_.at(instruction).push_back(buffer->id); + for (const HloInstruction* user : buffer->users) { + buffers_used_by_instruction_.at(user).push_back(buffer->id); + } + } + + logical_buffer_to_buffer_id[logical_buffer] = buffer->id; + } + } + XLA_VLOG_LINES(10, ToString()); + DCHECK(Check()); +} + +Status MemoryUsageTracker::BeginInstruction(const HloInstruction* instruction) { + VLOG(3) << "BeginInstruction " << instruction->name(); + TF_RET_CHECK(in_progress_instruction_ == nullptr); + in_progress_instruction_ = instruction; + + placed_instructions_.insert(in_progress_instruction_); + + // All buffers defined by this instruction need memory. + for (BufferId buffer_id : buffers_defined_by_instruction_.at(instruction)) { + VLOG(3) << " Buffer " << buffers_.at(buffer_id).ToString() + << " is now live."; + memory_usage_ += AllocatedSize(buffer_id); + } + + // TODO(b/37686934): Elementwise instructions can share the buffer of a (dead) + // operand. Account for this potential reuse here. + + VLOG(3) << " memory usage = " << memory_usage_; + VLOG(10) << ToString(); + + DCHECK(Check()); + return Status::OK(); +} + +Status MemoryUsageTracker::EndInstruction() { + TF_RET_CHECK(in_progress_instruction_ != nullptr); + VLOG(3) << "EndInstruction " << in_progress_instruction_->name(); + + for (BufferId buffer_id : + buffers_used_by_instruction_.at(in_progress_instruction_)) { + Buffer& buffer = buffers_.at(buffer_id); + buffer.unfinished_user_count--; + CHECK_GE(buffer.unfinished_user_count, 0) + << buffer.ToString() << " has negative unfinished use count."; + if (buffer.unfinished_user_count == 0) { + // Buffer is now dead.
+ VLOG(3) << " " << buffer.ToString() << " is now dead."; + memory_usage_ -= AllocatedSize(buffer_id); + CHECK_GE(memory_usage_, 0); + } + } + + // If any buffer defined by this instruction has no uses, then memory can be + // reclaimed immediately. + for (BufferId buffer_id : + buffers_defined_by_instruction_.at(in_progress_instruction_)) { + const Buffer& buffer = buffers_.at(buffer_id); + if (buffer.unfinished_user_count == 0) { + VLOG(3) << " " << buffer.ToString() << " is immediately dead."; + memory_usage_ -= AllocatedSize(buffer_id); + CHECK_GE(memory_usage_, 0); + } + } + + in_progress_instruction_ = nullptr; + + VLOG(3) << " memory usage = " << memory_usage_; + VLOG(10) << ToString(); + + DCHECK(Check()); + + return Status::OK(); +} + +int64 MemoryUsageTracker::MemoryReducedIfRematerialized( + const HloInstruction* instruction) const { + CHECK_NE(in_progress_instruction_, nullptr); + if (!IsPlaced(instruction) || instruction == in_progress_instruction_) { + return 0; + } + + // TODO(b/37687140): Rematerialization can increase peak memory consumption at + // an earlier point in the program if rematerialization extends the live range + // of the operand of the instruction being rematerialized across the live + // range of the value of the instruction being rematerialized. Don't + // rematerialize in this case (ie, return 0 here). + + // Compute the amount of memory reduced (if any) by rematerializing + // 'instruction'. The LogicalBuffers defined by 'instruction' will no longer + // be live at this program point, so initially set memory_reduced to the + // size of its defined values. + int64 memory_reduced = 0; + for (BufferId buffer_id : buffers_defined_by_instruction_.at(instruction)) { + // Avoid rematerializing instructions with indirect uses as it is difficult + // to reason about liveness after rematerializing the instruction. + // TODO(b/37714814): Consider rematerializing instructions with indirect + // uses. + if (buffers_.at(buffer_id).has_indirect_uses) { + return 0; + } + + if (IsCurrentlyLive(buffer_id) && !IsInUse(buffer_id)) { + memory_reduced += AllocatedSize(buffer_id); + } + } + + // Account for any logical buffers whose live range must be extended across + // this program point. + for (BufferId buffer_id : buffers_used_by_instruction_.at(instruction)) { + if (!IsCurrentlyLive(buffer_id)) { + // This logical buffer is used by 'instruction' but is not live at this + // program point. Rematerializing 'instruction' will extend the buffer's + // live range across this program point. + memory_reduced -= AllocatedSize(buffer_id); + } + } + + return memory_reduced; +} + +Status MemoryUsageTracker::AddRematerializedInstruction( + HloInstruction* original_instruction, HloInstruction* remat_instruction) { + VLOG(3) << "AddRematerializedInstruction: original_instruction = " + << original_instruction->name() + << ", remat_instruction = " << remat_instruction->name(); + + TF_RET_CHECK(in_progress_instruction_ != nullptr); + TF_RET_CHECK(IsPlaced(original_instruction)); + TF_RET_CHECK(!IsPlaced(remat_instruction)); + CHECK(!ContainsKey(buffers_defined_by_instruction_, remat_instruction)); + CHECK(!ContainsKey(buffers_used_by_instruction_, remat_instruction)); + + // Construct the list of buffers used and defined by the rematerialization.
+ buffers_defined_by_instruction_[remat_instruction]; + buffers_used_by_instruction_[remat_instruction] = + buffers_used_by_instruction_.at(original_instruction); + + // Account for the additional buffer uses created by the new rematerialization + // instruction. Update memory usage if the rematerialization makes a dead + // buffer live again. + for (BufferId buffer_id : + buffers_used_by_instruction_.at(original_instruction)) { + Buffer& buffer = buffers_.at(buffer_id); + if (buffer.unfinished_user_count == 0) { + // Buffer used by this instruction was dead, now is alive. + memory_usage_ += AllocatedSize(buffer.id); + } + + buffer.unfinished_user_count++; + buffer.users.push_back(remat_instruction); + } + + // Create a new set of Buffers defined by the new rematerialization + // instruction. Update the internal data structures and memory use to account + // for them. + for (BufferId old_buffer_id : + buffers_defined_by_instruction_.at(original_instruction)) { + Buffer& old_buffer = buffers_.at(old_buffer_id); + + std::vector<const HloInstruction*> placed_users; + std::vector<const HloInstruction*> unplaced_users; + for (const HloInstruction* user : old_buffer.users) { + if (IsPlaced(user)) { + CHECK(IsFinished(user)); + placed_users.push_back(user); + } else { + unplaced_users.push_back(user); + } + } + old_buffer.users = std::move(placed_users); + old_buffer.unfinished_user_count = 0; + + // Buffer is now dead. + memory_usage_ -= AllocatedSize(old_buffer.id); + + Buffer& new_buffer = RematerializeBuffer(old_buffer, remat_instruction, + std::move(unplaced_users)); + + buffers_defined_by_instruction_.at(remat_instruction) + .push_back(new_buffer.id); + for (const HloInstruction* user : new_buffer.users) { + std::vector<BufferId>& buffers_used = + buffers_used_by_instruction_.at(user); + std::replace(buffers_used.begin(), buffers_used.end(), old_buffer_id, + new_buffer.id); + } + } + + VLOG(3) << " memory usage = " << memory_usage_; + XLA_VLOG_LINES(10, ToString()); + + DCHECK(Check()); + + return Status::OK(); +} + +string MemoryUsageTracker::ToString() const { + string output = tensorflow::strings::StrCat("MemoryUsageTracker for ", + computation_->name(), "\n"); + tensorflow::strings::StrAppend( + &output, "Memory usage: ", HumanReadableNumBytes(memory_usage()), " (", + memory_usage(), " bytes)"); + for (const HloInstruction* instruction : instruction_list_.instructions()) { + string inprogress = + instruction == in_progress_instruction_ ? " in-progress" : ""; + string placed = IsPlaced(instruction) ? " placed" : ""; + tensorflow::strings::StrAppend(&output, " ", instruction->name(), + inprogress, placed, "\n Defines:\n"); + for (BufferId buffer_id : buffers_defined_by_instruction_.at(instruction)) { + const Buffer& buffer = buffers_[buffer_id]; + string live = IsCurrentlyLive(buffer_id) ? " live" : ""; + tensorflow::strings::StrAppend(&output, " ", buffer.ToString(), live, + ", ", buffer.unfinished_user_count, + " unfinished uses\n"); + } + tensorflow::strings::StrAppend(&output, " Uses:\n"); + for (BufferId buffer_id : buffers_used_by_instruction_.at(instruction)) { + tensorflow::strings::StrAppend(&output, " ", + buffers_[buffer_id].ToString(), "\n"); + } + } + return output; +} + +bool MemoryUsageTracker::Check() const { + auto elements_are_unique = [](const std::vector<BufferId>& vec) { + return vec.size() == std::set<BufferId>(vec.begin(), vec.end()).size(); + }; + + // Verify buffers_defined_by_instruction_.
+ for (auto& instruction : computation_->instructions()) { + const std::vector<BufferId>& defined_buffers = + buffers_defined_by_instruction_.at(instruction.get()); + CHECK(elements_are_unique(defined_buffers)) + << "Instruction " << instruction->name() + << " does not have unique defined buffers: " + << tensorflow::str_util::Join( + defined_buffers, ", ", [this](string* out, BufferId buffer_id) { + tensorflow::strings::StrAppend( + out, buffers_.at(buffer_id).ToString()); + }); + + for (const Buffer& buffer : buffers_) { + if (buffer.defining_instruction == instruction.get()) { + CHECK(std::find(defined_buffers.begin(), defined_buffers.end(), + buffer.id) != defined_buffers.end()) + << "Instruction " << instruction->name() + << " defined buffers is missing: " << buffer.ToString(); + } + } + } + + // Verify buffers_used_by_instruction_. + for (auto& instruction : computation_->instructions()) { + const std::vector<BufferId>& used_buffers = + buffers_used_by_instruction_.at(instruction.get()); + CHECK(elements_are_unique(used_buffers)) + << "Instruction " << instruction->name() + << " does not have unique used buffers: " + << tensorflow::str_util::Join( + used_buffers, ", ", [this](string* out, BufferId buffer_id) { + tensorflow::strings::StrAppend( + out, buffers_.at(buffer_id).ToString()); + }); + } + for (const Buffer& buffer : buffers_) { + int64 unfinished_uses = 0; + for (const HloInstruction* user : buffer.users) { + const std::vector<BufferId>& used_buffers = + buffers_used_by_instruction_.at(user); + CHECK(std::find(used_buffers.begin(), used_buffers.end(), buffer.id) != + used_buffers.end()) + << "Instruction " << user->name() << " used buffers is missing " + << buffer.ToString(); + if (!IsFinished(user)) { + unfinished_uses++; + } + } + CHECK_EQ(buffer.unfinished_user_count, unfinished_uses) + << "Incorrect unplaced use count for " << buffer.ToString(); + } + + // Verify live set size against memory_usage_. + int64 live_size = 0; + for (const Buffer& buffer : buffers_) { + // The while instruction reuses its input buffers as output buffers so + // don't double count its buffers if it is currently executing. + if (IsCurrentlyLive(buffer.id) && + !(buffer.defining_instruction == in_progress_instruction_ && + in_progress_instruction_->opcode() == HloOpcode::kWhile)) { + live_size += AllocatedSize(buffer.id); + } + } + CHECK(live_size == memory_usage_) + << "Live set size " << live_size << " is not the same as memory usage " + << memory_usage_ + << ". This could happen if some nodes defined in the " + "computation are not being used/executed."; + + return true; +} + +// Computes and returns the cost of rematerializing the given instruction. +// Cost per rematerialized instruction is defined as: +// +// (flop_count + transcendental_count + element_count) / memory_reduced +// +// flop_count: from HloCostAnalysis +// transcendental_count: from HloCostAnalysis +// element_count: number of elements accessed in operands and output of +// instruction +// memory_reduced: The memory usage reduced by rematerializing the +// instruction. +// +// This is a rough estimate of the extra execution time per byte saved by +// rematerializing this instruction for its remaining uses. In general, we +// want the most memory saving for the least latency penalty, which is captured +// by this heuristic.
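A worked instance of this formula, with invented numbers (the real computation is in RematerializationCost just below):

```c++
// Invented example: an elementwise f32[1024] add whose 4 KiB output would die.
const int64 flops = 1024;                  // one add per element
const int64 transcendentals = 0;           // no exp/log/etc.
const int64 elements_accessed = 3 * 1024;  // two operands read, one output
const int64 memory_reduced = 4 * 1024;     // 4096 bytes no longer live
// cost = 256 * (1024 + 0 + 3072) / 4096 = 256
const int64 cost =
    256 * (flops + transcendentals + elements_accessed) / memory_reduced;
```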
+int64 RematerializationCost(const HloInstruction* instruction, + const MemoryUsageTracker& memory_tracker, + const HloCostAnalysis& cost_analysis, + int64 memory_reduced) { + // If none of the users of 'instruction' have been placed in the sequence (as + // tracked by memory_tracker), then rematerialization of 'instruction' is a + // zero-cost move of 'instruction' in the sequence. + if (!std::any_of(instruction->users().begin(), instruction->users().end(), + [&memory_tracker](const HloInstruction* inst) { + return memory_tracker.IsPlaced(inst); + })) { + return 0; + } + + CHECK_GT(memory_reduced, 0); + const int64 bytes_accessed = cost_analysis.bytes_accessed(*instruction); + const int64 elements_accessed = + ShapeUtil::IsTuple(instruction->shape()) + ? bytes_accessed + : bytes_accessed / ShapeUtil::ByteSizeOfPrimitiveType( + instruction->shape().element_type()); + + // Multiply by 256 to improve precision of cost. Without this factor, + // many instructions such as many elementwise instructions would have + // zero cost because the bytes reduced can be several times greater than + // the element count. + return 256 * + (cost_analysis.flop_count(*instruction) + + cost_analysis.transcendental_count(*instruction) + + elements_accessed) / + memory_reduced; +} + +// Selects and returns the best candidate instruction for rematerialization. +// The instruction with lowest rematerialization cost is selected among those +// candidates which reduce memory use at the program point of the current +// instruction as indicated by memory_tracker. nullptr is returned if no +// candidate can be found. +HloInstruction* PickRematerializationCandidate( + const MemoryUsageTracker& memory_tracker, + const InstructionList& instruction_list, + const HloCostAnalysis& cost_analysis, + const tensorflow::gtl::FlatSet<const HloInstruction*>& blacklist) { + HloInstruction* best = nullptr; + int64 best_cost = 0; + + // TODO(b/35244891): This is currently quadratic in the number of HLO + // instructions. + for (HloInstruction* candidate : instruction_list.instructions()) { + if (!memory_tracker.IsPlaced(candidate)) { + // Only iterate up to the currently placed instruction as indicated by + // memory_tracker. We are trying to reduce memory usage at the placed + // instruction so rematerializing later values is of no benefit. + break; + } + VLOG(5) << "considering rematerialization candidate " << candidate->name(); + + if (ContainsKey(blacklist, candidate)) { + // Skip instructions on the blacklist to avoid infinite loops of + // rematerializing the same instruction(s) repeatedly.
+ VLOG(5) << "candidate " << candidate->name() + << " is excluded from rematerialization"; + continue; + } + + if (!IsRematerializable(candidate)) { + VLOG(5) << "candidate " << candidate->name() + << " not viable: is not rematerializable"; + continue; + } + + const int64 memory_reduced = + memory_tracker.MemoryReducedIfRematerialized(candidate); + + if (memory_reduced <= 0) { + VLOG(5) << "candidate " << candidate->name() + << " memory reduced = " << memory_reduced << " <= 0"; + continue; + } + + const int cost = RematerializationCost(candidate, memory_tracker, + cost_analysis, memory_reduced); + + VLOG(5) << "candidate " << candidate->name() << ", memory reduced " + << memory_reduced << ", cost per byte " << cost; + + if (best == nullptr || cost < best_cost) { + VLOG(5) << "candidate " << candidate->name() << " now best"; + best = candidate; + best_cost = cost; + } + } + return best; +} + +} // namespace + +StatusOr HloRematerialization::ComputePeakMemory( + const HloComputation* computation, + const std::vector& order) const { + InstructionList instruction_list(order); + MemoryUsageTracker tracker(computation, size_function_, *points_to_analysis_, + instruction_list); + int64 peak_memory = tracker.memory_usage(); + for (const HloInstruction* instruction : order) { + TF_RETURN_IF_ERROR(tracker.BeginInstruction(instruction)); + TF_ASSIGN_OR_RETURN(int64 callee_usage, + CalledComputationsMemoryUsage(instruction)); + peak_memory = + std::max(peak_memory, tracker.memory_usage() + callee_usage); + TF_RETURN_IF_ERROR(tracker.EndInstruction()); + } + VLOG(1) << "Peak memory for " << computation->name() << ": " + << HumanReadableNumBytes(peak_memory); + return peak_memory; +} + +StatusOr HloRematerialization::CalledComputationsMemoryUsage( + const HloInstruction* instruction) const { + const CallSite* callsite = + call_graph_->GetNode(instruction->parent()).GetCallSite(instruction); + if (callsite == nullptr || callsite->context() == CallContext::kParallel) { + return 0; + } + int64 callee_usage = 0; + for (const HloComputation* computation : callsite->called_computations()) { + TF_RET_CHECK(ContainsKey(computation_peak_memory_, computation)); + callee_usage += computation_peak_memory_.at(computation); + } + return callee_usage; +} + +StatusOr HloRematerialization::RematerializeComputation( + HloComputation* computation, + SequentialHloOrdering::HloModuleSequence* sequence, + int64 memory_limit_bytes) { + VLOG(1) << "Rematerializing computation " << computation->name() + << " with limit " << HumanReadableNumBytes(memory_limit_bytes); + VLOG(1) << "peak memory usage is " + << HumanReadableNumBytes(computation_peak_memory_.at(computation)); + CHECK(!ContainsKey(rematerialized_computations_, computation)); + + InstructionList instruction_list(sequence->at(computation)); + MemoryUsageTracker memory_tracker(computation, size_function_, + *points_to_analysis_, instruction_list); + bool changed = false; + + // To avoid an infinite loop rematerializing the same set of instructions ad + // infinitum, keep a blacklist of instructions which should not be + // rematerialized. + tensorflow::gtl::FlatSet blacklist; + + // If the rematerialization makes the source instruction dead, then the + // rematerialization is added to 'remat_move_instructions' (the + // rematerialization is essentially a move). If the next rematerialization of + // the instruction is also a move then the rematerialization is added to the + // blacklist. 
+ tensorflow::gtl::FlatSet<const HloInstruction*> remat_move_instructions; + + // The peak memory of the computation at any point in the instruction + // sequence. + int64 peak_memory = memory_tracker.memory_usage(); + + // Total count of instructions rematerialized. + int64 remat_count = 0; + // Total count of clones created minus number of original rematerialized + // instructions which are dead. + int64 net_instructions_added = 0; + + const CallGraphNode& call_graph_node = call_graph_->GetNode(computation); + + // Iterate through all instructions in the sequence. At each instruction + // (program point) if memory_usage exceeds the specified limit then + // rematerialize HLO instructions until memory_usage is reduced. + int64 instruction_index = 0; + for (auto list_it = instruction_list.instructions().begin(); + list_it != instruction_list.instructions().end(); ++list_it) { + HloInstruction* instruction = *list_it; + TF_ASSIGN_OR_RETURN(int64 callee_usage, + CalledComputationsMemoryUsage(instruction)); + TF_RETURN_IF_ERROR(memory_tracker.BeginInstruction(instruction)); + + VLOG(2) << "Program point at " << instruction->name() + << ", memory usage = " << memory_tracker.memory_usage() + << ", callee usage = " << callee_usage << ", [" << instruction_index + << "/" << instruction_list.instructions().size() << "]"; + instruction_index++; + + while (memory_tracker.memory_usage() + callee_usage > memory_limit_bytes) { + VLOG(2) << "Over memory limit at instruction " << instruction->name() + << ", using " + << HumanReadableNumBytes(memory_tracker.memory_usage() + + callee_usage) + << ", limit is " << HumanReadableNumBytes(memory_limit_bytes); + + HloInstruction* best = PickRematerializationCandidate( + memory_tracker, instruction_list, cost_analysis_, blacklist); + + if (best == nullptr) { + VLOG(3) << "Unable to find rematerialization candidate at program " + "point " + << instruction->name() << ". Memory usage = " + << HumanReadableNumBytes(memory_tracker.memory_usage() + + callee_usage); + break; + } + + VLOG(1) << "Rematerializing instruction " << best->name() << " (saving " + << memory_tracker.MemoryReducedIfRematerialized(best) << ")"; + changed = true; + remat_count++; + + HloInstruction* remat = + computation->AddInstruction(best->Clone(/*suffix=*/"remat")); + + // Replace each remaining use of 'best' with the rematerialization. + std::vector<HloInstruction*> best_users_copy = best->users(); + for (HloInstruction* user : best_users_copy) { + if (!memory_tracker.IsPlaced(user)) { + VLOG(2) << " Replacing use of " << best->name() << " in " + << user->name() << " with " << remat->name(); + TF_RETURN_IF_ERROR(best->ReplaceUseWith(user, remat)); + } + } + + // Account for the rematerialization in the memory tracker. + TF_RETURN_IF_ERROR( + memory_tracker.AddRematerializedInstruction(best, remat)); + + // Insert rematerialized instruction right before the earliest unplaced + // use of the instruction *and* the earliest unplaced last use of any + // operands of remat. Unplaced uses of the remat's operands are included + // because we don't want to extend the live range of remat's operands as + // this could increase memory usage.
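To see why the operands' unplaced users matter, an invented example (all names hypothetical):

```c++
// Original order:  o = ...; v = f(o); ...; u1 = g(v); u2 = h(o);
// Suppose v is rematerialized for its remaining use u1. Placing v.remat
// immediately before u1 would keep o alive all the way to u1, even though
// o's own last use h(o) may come earlier. By also collecting o's unplaced
// users (here u2), v.remat is inserted before the earliest of {u1, u2}, so
// o's live range is not extended.
```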
+ std::vector<HloInstruction*> place_before = remat->users(); + for (auto* operand : remat->operands()) { + for (auto* operand_user : operand->users()) { + if (!memory_tracker.IsPlaced(operand_user) && operand_user != remat) { + place_before.push_back(operand_user); + } + } + } + instruction_list.InsertBeforeInstructions(remat, place_before); + + // If the rematerialized instruction is dead then rematerialization is + // essentially a move. Don't delete the instruction now because we don't + // want duplicate HloInstruction* values during the course of the + // transformation because we keep maps with HloInstruction* values as + // keys. + if (best->users().empty()) { + VLOG(2) << best->name() << " is now dead"; + if (ContainsKey(remat_move_instructions, best)) { + // Previously, 'best' was a rematerialization which killed the + // instruction it was a copy of. Now 'remat' is a rematerialization + // of 'best' and kills 'best'. Stop rematerializing this instruction + // to avoid an infinite loop. + blacklist.insert(remat); + } + remat_move_instructions.insert(remat); + } else { + net_instructions_added++; + } + + VLOG(3) << "memory_usage after rematerialization = " + << memory_tracker.memory_usage(); + } + + const CallSite* callsite = call_graph_node.GetCallSite(instruction); + if (callsite != nullptr && + callsite->context() == CallContext::kSequential && + memory_tracker.memory_usage() + callee_usage > memory_limit_bytes) { + // Memory usage exceeds the limit. Try to rematerialize any + // subcomputation(s) that this instruction calls. + VLOG(1) << "Memory usage still over the limit (" + << (memory_tracker.memory_usage() + callee_usage) << " > " + << memory_limit_bytes + << "). Rematerializing computations called by " + << instruction->name(); + + // Recompute callee usage to account for any rematerialization performed + // in the callee computations. + for (HloComputation* called_computation : + callsite->called_computations()) { + if (!ContainsKey(rematerialized_computations_, called_computation)) { + // Memory limit for the subcomputation is the memory limit less the + // amount of memory used at this point in the computation. + int64 subcomputation_memory_limit_bytes = std::max<int64>( + 0, memory_limit_bytes - memory_tracker.memory_usage()); + TF_ASSIGN_OR_RETURN( + bool subcomputation_changed, + RematerializeComputation(called_computation, sequence, + subcomputation_memory_limit_bytes)); + changed |= subcomputation_changed; + } + } + TF_ASSIGN_OR_RETURN(callee_usage, + CalledComputationsMemoryUsage(instruction)); + } + + peak_memory = std::max(peak_memory, + memory_tracker.memory_usage() + callee_usage); + VLOG(3) << "peak memory usage = " << HumanReadableNumBytes(peak_memory); + + TF_RETURN_IF_ERROR(memory_tracker.EndInstruction()); + } + + // Verify some invariants on the memory tracker. + CHECK_EQ(memory_tracker.memory_usage(), 0); + for (auto& instruction : computation->instructions()) { + CHECK(memory_tracker.IsPlaced(instruction.get())); + } + + VLOG(1) << "In computation " << computation->name() << " rematerialized " + << remat_count << " instructions; " << net_instructions_added + << " net instructions added"; + VLOG(1) << " peak memory usage now " << HumanReadableNumBytes(peak_memory) + << " (was " + << HumanReadableNumBytes(computation_peak_memory_.at(computation)) + << ")"; + + // Update peak memory used by computation. + computation_peak_memory_.at(computation) = peak_memory; + + // Update order to include rematerialized instructions.
+  sequence->at(computation)
+      .assign(instruction_list.instructions().begin(),
+              instruction_list.instructions().end());
+
+  rematerialized_computations_.insert(computation);
+
+  instructions_rematerialized_ += remat_count;
+  net_instructions_added_ += net_instructions_added;
+
+  return changed;
+}
+
+StatusOr<bool> HloRematerialization::Run(
+    HloModule* module, SequentialHloOrdering::HloModuleSequence* sequence,
+    int64 memory_limit_bytes) {
+  // The sequence is constructed entirely by this method.
+  TF_RET_CHECK(sequence->empty());
+
+  VLOG(1) << "HloRematerialization() with memory limit of "
+          << HumanReadableNumBytes(memory_limit_bytes);
+
+  TF_ASSIGN_OR_RETURN(points_to_analysis_, TuplePointsToAnalysis::Run(module));
+
+  // Adjust memory limit to account for the output of the entry
+  // computation. This is necessary because the per-computation accounting in
+  // MemoryUsageTracker does not include the output, as it is typically
+  // allocated by the caller.
+  int64 module_output_size = 0;
+  ShapeUtil::ForEachSubshape(
+      module->entry_computation()->root_instruction()->shape(),
+      [&module_output_size, this](const Shape& subshape,
+                                  const ShapeIndex& /*index*/) {
+        module_output_size += size_function_(subshape);
+      });
+
+  const int64 adjusted_memory_limit_bytes =
+      memory_limit_bytes - module_output_size;
+  VLOG(1) << "Adjusted memory limit accounting for output ("
+          << HumanReadableNumBytes(module_output_size)
+          << "): " << HumanReadableNumBytes(adjusted_memory_limit_bytes);
+
+  XLA_VLOG_LINES(3, "Before HloRematerialization:\n" + module->ToString());
+  // Create initial sequence of HLO instructions.
+  TF_ASSIGN_OR_RETURN(*sequence,
+                      CreateMemoryMinimizingSequence(
+                          *module, [this](const LogicalBuffer& buffer) {
+                            return size_function_(buffer.shape());
+                          }));
+  // Compute peak memory usage of all computations in the module called in a
+  // sequential context.
+  call_graph_ = CallGraph::Build(module);
+  TF_RETURN_IF_ERROR(call_graph_->VisitNodes(
+      [this, sequence](const CallGraphNode& node) -> Status {
+        if (node.context() == CallContext::kSequential) {
+          TF_ASSIGN_OR_RETURN(
+              computation_peak_memory_[node.computation()],
+              ComputePeakMemory(node.computation(),
+                                sequence->at(node.computation())));
+        }
+        return Status::OK();
+      }));
+
+  // The peak memory usage of the module equals the peak memory use of the
+  // entry computation plus the output size of the computation. This is because
+  // the peak memory for a computation does not include the output as this is
+  // typically accounted for in the caller.
+  const int64 before_peak_memory =
+      computation_peak_memory_.at(module->entry_computation()) +
+      module_output_size;
+  VLOG(1) << "Peak memory usage of module (before): "
+          << HumanReadableNumBytes(before_peak_memory);
+
+  // Run cost analysis. Operation cost is used in the heuristic for selecting
+  // instructions for rematerialization.
+  TF_RETURN_IF_ERROR(
+      module->entry_computation()->root_instruction()->Accept(&cost_analysis_));
+
+  // Subcomputations called by the entry computation will also be
+  // rematerialized.
+  TF_ASSIGN_OR_RETURN(bool changed, RematerializeComputation(
+                                        module->entry_computation(), sequence,
+                                        adjusted_memory_limit_bytes));
+
+  // Rematerialization can introduce dead code. This occurs if all uses of an
+  // instruction are replaced with rematerializations of the instruction.
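+  // For example (hypothetical names): if every remaining use of %bcast was
+  // replaced with a rematerialized clone %bcast.remat, then %bcast itself is
+  // dead and is deleted by the DCE pass below.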
+  TF_ASSIGN_OR_RETURN(bool dead_code_removed, HloDCE().Run(module));
+  changed |= dead_code_removed;
+
+  // After DCE, the module sequence may include instructions which no longer
+  // exist.
+  for (const auto& computation : module->computations()) {
+    if (sequence->at(computation.get()).size() !=
+        computation->instruction_count()) {
+      // A size mismatch between the computation instruction count and the size
+      // of the ordering of instructions can only be caused by DCE. Rebuild the
+      // order by removing the deleted instructions from the order.
+      tensorflow::gtl::FlatSet<const HloInstruction*> instruction_set;
+      for (const auto& instruction : computation->instructions()) {
+        instruction_set.insert(instruction.get());
+      }
+      // Move the old order into a temporary vector, then build the new order
+      // in place.
+      std::vector<const HloInstruction*>& order =
+          sequence->at(computation.get());
+      std::vector<const HloInstruction*> old_order;
+      using std::swap;
+      swap(order, old_order);
+      std::copy_if(old_order.begin(), old_order.end(),
+                   std::back_inserter(order),
+                   [&instruction_set](const HloInstruction* instruction) {
+                     return ContainsKey(instruction_set, instruction);
+                   });
+      TF_RET_CHECK(sequence->at(computation.get()).size() ==
+                   computation->instruction_count());
+    }
+  }
+  VLOG(1) << "Rematerialized " << instructions_rematerialized_
+          << " instructions in module " << module->name() << "; "
+          << net_instructions_added_ << " net instructions added";
+  const int64 current_peak_memory =
+      computation_peak_memory_.at(module->entry_computation()) +
+      module_output_size;
+  VLOG(1) << "Peak memory usage of module now "
+          << HumanReadableNumBytes(current_peak_memory) << " ("
+          << current_peak_memory << " bytes), was "
+          << HumanReadableNumBytes(before_peak_memory) << " ("
+          << before_peak_memory << " bytes)";
+  const int64 reduced_peak_memory = before_peak_memory - current_peak_memory;
+  VLOG(1) << "Reduced peak memory by "
+          << HumanReadableNumBytes(reduced_peak_memory) << " ("
+          << reduced_peak_memory << " bytes)";
+
+  XLA_VLOG_LINES(3, "After HloRematerialization:\n" + module->ToString());
+
+  if (current_peak_memory > memory_limit_bytes) {
+    LOG(WARNING) << "Can't reduce memory use below "
+                 << HumanReadableNumBytes(memory_limit_bytes)
+                 << " by rematerialization (only reduced to "
+                 << HumanReadableNumBytes(current_peak_memory) << ")";
+  }
+
+  return changed;
+}
+
+/* static */ StatusOr<bool> HloRematerialization::RematerializeAndSchedule(
+    const HloRematerialization::ShapeSizeFunction& size_function,
+    int64 memory_limit_bytes, HloModule* hlo_module,
+    SequentialHloOrdering::HloModuleSequence* sequence) {
+  HloRematerialization remat(size_function);
+  return remat.Run(hlo_module, sequence, memory_limit_bytes);
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.h b/tensorflow/compiler/xla/service/hlo_rematerialization.h
new file mode 100644
index 00000000000..1693f93183b
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization.h
@@ -0,0 +1,133 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+   ==============================================================================*/
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_REMATERIALIZATION_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_REMATERIALIZATION_H_
+
+#include "tensorflow/compiler/xla/service/buffer_liveness.h"
+#include "tensorflow/compiler/xla/service/call_graph.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_cost_analysis.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
+
+namespace xla {
+
+class HloRematerialization {
+ public:
+  using ShapeSizeFunction = std::function<int64(const Shape&)>;
+
+  // Rematerialize HLO instructions in the given module to reduce peak memory
+  // use below memory_limit_bytes, where memory use is defined as the total
+  // size of all live HLO instruction values. Parameters and constants are
+  // included in memory use estimates. Method parameters:
+  //
+  // size_function: Function which returns the size in bytes of the top-level
+  //   buffer of the given shape.
+  //
+  // memory_limit_bytes: The threshold number of bytes to reduce memory use to
+  //   via rematerialization.
+  //
+  // hlo_module: HLO module to rematerialize instructions in.
+  //
+  // sequence: Should point to an empty HloModuleSequence. Upon return
+  //   contains the HLO instruction order which was used for
+  //   rematerialization. This is the order in which HLO instructions should
+  //   be emitted to minimize memory use.
+  //
+  // Returns whether any instructions were rematerialized. If memory use is
+  // already below the given limit then no instructions are rematerialized and
+  // false is returned.
+  //
+  // CSE will undo the effects of this optimization and should not be run after
+  // this pass. In general, this pass should be run very late, immediately
+  // before code generation.
+  static StatusOr<bool> RematerializeAndSchedule(
+      const ShapeSizeFunction& size_function, int64 memory_limit_bytes,
+      HloModule* hlo_module,
+      SequentialHloOrdering::HloModuleSequence* sequence);
+
+ protected:
+  HloRematerialization(const ShapeSizeFunction& size_function)
+      : size_function_(size_function), cost_analysis_(size_function_) {}
+  ~HloRematerialization() {}
+
+  // Runs rematerialization on the given module. Returns whether the module was
+  // changed. memory_limit is the target maximum peak memory usage by the
+  // module. sequence should be an empty HloModuleSequence. Upon return
+  // sequence contains the memory-minimizing order in which to emit the HLO
+  // instructions.
+  StatusOr<bool> Run(HloModule* module,
+                     SequentialHloOrdering::HloModuleSequence* sequence,
+                     int64 memory_limit);
+
+  // Rematerializes instructions within the given computation. 'order' is the
+  // order in which the computation's instructions will be emitted in the
+  // backend. Rematerialized instructions will be added to the HLO computation
+  // and inserted into 'order'.
+  StatusOr<bool> RematerializeComputation(
+      HloComputation* computation,
+      SequentialHloOrdering::HloModuleSequence* sequence,
+      int64 computation_memory_limit);
+
+  // Computes and returns the peak memory used by the given computation. The
+  // peak memory is the maximum total size of all live HLO instruction values
+  // at any program point.
+  // 'order' is the order in which the HLO instructions will be emitted, and
+  // is used to determine the lifespans of HLO values.
+  StatusOr<int64> ComputePeakMemory(
+      const HloComputation* computation,
+      const std::vector<const HloInstruction*>& order) const;
+
+  // Returns the peak memory usage of the called computations for the given
+  // instruction. Zero is returned if the instruction calls no computations.
+  StatusOr<int64> CalledComputationsMemoryUsage(
+      const HloInstruction* instruction) const;
+
+  // Function which computes the size of the top-level buffer of a shape.
+  const ShapeSizeFunction size_function_;
+
+  // Call graph of the hlo_module.
+  std::unique_ptr<CallGraph> call_graph_;
+
+  // Analysis used for computing the rematerialization cost of instructions.
+  HloCostAnalysis cost_analysis_;
+
+  // The peak memory usage of each computation. The map contains only those
+  // computations called from a sequential context
+  // (CallContext::kSequential). These values are updated as rematerialization
+  // occurs.
+  tensorflow::gtl::FlatMap<const HloComputation*, int64>
+      computation_peak_memory_;
+
+  std::unique_ptr<TuplePointsToAnalysis> points_to_analysis_;
+
+  // Set of computations which have had rematerialization
+  // applied. Rematerialization is only applied once per computation.
+  tensorflow::gtl::FlatSet<const HloComputation*> rematerialized_computations_;
+
+  // Count of the total instructions rematerialized.
+  int64 instructions_rematerialized_ = 0;
+
+  // Count of the net instructions added to the HLO module by
+  // rematerialization. This can be different from instructions_rematerialized_
+  // because some rematerializations are effectively moves in the HLO
+  // schedule. In these cases, the rematerialization instruction replaces all
+  // uses of the original instruction and the original instruction is
+  // dead. Hence, no net instructions were added.
+  int64 net_instructions_added_ = 0;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_REMATERIALIZATION_H_
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
new file mode 100644
index 00000000000..f306bcc309c
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
@@ -0,0 +1,531 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_rematerialization.h"
+
+#include <memory>
+#include <string>
+
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_ordering.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+
+namespace xla {
+namespace {
+
+namespace op = xla::testing::opcode_matchers;
+
+using ::testing::_;
+
+class HloRematerializationTest : public HloTestBase {
+ protected:
+  // Creates and returns a computation which can benefit from
+  // rematerialization. The computation looks like:
+  //
+  //   F32[] %param = {...}
+  //   F32[1024] %bcast = broadcast(%param)
+  //   F32[1024] %negate = negate(%bcast)
+  //   F32[2048] %concat_1 = concat({%negate, %negate})
+  //   F32[1] %slice_1 = slice(%concat_1, {0:1})
+  //   F32[1025] %concat_2 = concat({%bcast, %slice_1})
+  //   F32[1] %slice_2 = slice(%concat_2, {0:1});
+  //
+  // The instruction %bcast can be rematerialized before its use at %concat_2
+  // to reduce peak memory usage. This avoids %bcast and %concat_1 being
+  // simultaneously live. Peak memory use is about 16KB before
+  // rematerialization (during execution of %concat_1) and about 12KB after
+  // rematerializing %bcast for its use in %concat_2.
+  std::unique_ptr<HloComputation> MakeRematerializableComputation(
+      const string& suffix = "") {
+    auto builder = HloComputation::Builder(TestName() + suffix);
+    auto param = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, scalar_shape_, "param"));
+    auto bcast = builder.AddInstruction(
+        HloInstruction::CreateBroadcast(vec1024_shape_, param, {}));
+    auto negate = builder.AddInstruction(
+        HloInstruction::CreateUnary(vec1024_shape_, HloOpcode::kNegate, bcast));
+    auto concat_1 = builder.AddInstruction(HloInstruction::CreateConcatenate(
+        ShapeUtil::MakeShape(xla::F32, {2048}), {negate, negate},
+        /*dimension=*/0));
+    auto slice_1 = builder.AddInstruction(HloInstruction::CreateSlice(
+        vec1_shape_, concat_1, /*start_indices=*/{0},
+        /*limit_indices=*/{1},
+        /*strides=*/{1}));
+    auto concat_2 = builder.AddInstruction(HloInstruction::CreateConcatenate(
+        ShapeUtil::MakeShape(xla::F32, {1025}), {bcast, slice_1},
+        /*dimension=*/0));
+    // Add a final slice to make the parameter shape match the output shape
+    // which is necessary to use this computation in a while.
+    builder.AddInstruction(HloInstruction::CreateSlice(vec1_shape_, concat_2,
+                                                       /*start_indices=*/{0},
+                                                       /*limit_indices=*/{1},
+                                                       /*strides=*/{1}));
+    return builder.Build();
+  }
+
+  // Creates and returns a computation which includes a while and can benefit
+  // from rematerialization. The computation looks like:
+  //
+  //   F32[] %param = {...}
+  //   F32[1024] %bcast = broadcast(%param)
+  //   F32[1] %slice_1 = slice(%bcast, {0:1})
+  //   F32[1] %while = while(%slice_1, while_body, while_cond)
+  //   F32[1025] %concat = concat({%bcast, %while})
+  //   F32[1] %slice_2 = slice(%concat, {0:1});
+  //
+  // The instruction %bcast can be rematerialized before its use at %concat to
+  // reduce peak memory usage. This avoids %bcast being live during execution
+  // of the while.
+  // Peak memory use is the maximum of 8KB and (4KB plus the memory use of the
+  // while subcomputations).
+  std::unique_ptr<HloComputation> MakeRematerializableWhileComputation(
+      HloComputation* while_cond, HloComputation* while_body,
+      const string& suffix = "") {
+    auto builder = HloComputation::Builder(TestName() + suffix);
+    auto param = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, scalar_shape_, "param"));
+    auto bcast = builder.AddInstruction(
+        HloInstruction::CreateBroadcast(vec1024_shape_, param, {}));
+    auto slice_1 = builder.AddInstruction(
+        HloInstruction::CreateSlice(vec1_shape_, bcast, /*start_indices=*/{0},
+                                    /*limit_indices=*/{1},
+                                    /*strides=*/{1}));
+    auto while_inst = builder.AddInstruction(HloInstruction::CreateWhile(
+        vec1_shape_, while_cond, while_body, slice_1));
+    auto concat = builder.AddInstruction(HloInstruction::CreateConcatenate(
+        ShapeUtil::MakeShape(xla::F32, {1025}), {bcast, while_inst},
+        /*dimension=*/0));
+    builder.AddInstruction(HloInstruction::CreateSlice(vec1_shape_, concat,
+                                                       /*start_indices=*/{0},
+                                                       /*limit_indices=*/{1},
+                                                       /*strides=*/{1}));
+    return builder.Build();
+  }
+
+  // Create and return a trivial computation appropriate for use as a while
+  // condition.
+  std::unique_ptr<HloComputation> MakeConditionComputation() {
+    auto builder = HloComputation::Builder(TestName() + ".cond");
+    builder.AddInstruction(
+        HloInstruction::CreateParameter(0, vec1_shape_, "param"));
+    builder.AddInstruction(
+        HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(true)));
+    return builder.Build();
+  }
+
+  // Return the byte size of the top-level buffer of the given shape.
+  static int64 ByteSizeOf(const Shape& shape) {
+    return ShapeUtil::ByteSizeOf(shape, sizeof(void*));
+  }
+
+  // Various shapes used in the canned computations.
+  const Shape scalar_shape_ = ShapeUtil::MakeShape(xla::F32, {});
+  const Shape vec1_shape_ = ShapeUtil::MakeShape(xla::F32, {1});
+  const Shape vec1024_shape_ = ShapeUtil::MakeShape(xla::F32, {1024});
+};
+
+// Test rematerialization of a single computation produced by
+// MakeRematerializableComputation.
+TEST_F(HloRematerializationTest, SingleComputation) {
+  auto module = CreateNewModule();
+  HloComputation* computation =
+      module->AddEntryComputation(MakeRematerializableComputation());
+
+  // Find and save the original broadcast instruction which should be
+  // rematerialized.
+  const HloInstruction* slice = computation->root_instruction();
+  ASSERT_THAT(slice, op::Slice(op::Concatenate(op::Broadcast(_), _)));
+  const HloInstruction* concat = slice->operand(0);
+  const HloInstruction* bcast = concat->operand(0);
+
+  SequentialHloOrdering::HloModuleSequence sequence;
+  // The computation requires 16KB without rematerialization but uses only 12KB
+  // with rematerialization, so pick a memory limit between these values
+  // (14KB).
+  TF_ASSIGN_OR_ASSERT_OK(
+      bool changed,
+      HloRematerialization::RematerializeAndSchedule(
+          ByteSizeOf,
+          /*memory_limit_bytes=*/14 * 1024, module.get(), &sequence));
+  EXPECT_TRUE(changed);
+
+  // Root should not have changed.
+  EXPECT_EQ(computation->root_instruction(), slice);
+
+  // The broadcast should have been rematerialized.
+  const HloInstruction* remat_bcast = concat->operand(0);
+  EXPECT_THAT(remat_bcast, op::Broadcast(::testing::Ne(bcast)));
+
+  // The rematerialized broadcast should be immediately before the concat in
+  // the sequence.
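+  // For illustration (instruction names are approximate): the tail of the
+  // sequence is expected to look like
+  //   ..., %slice_1, %bcast.remat, %concat_2, %slice_2
+  // which the index checks below verify.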
+  EXPECT_EQ(sequence.at(computation)[computation->instruction_count() - 2],
+            concat);
+  EXPECT_EQ(sequence.at(computation)[computation->instruction_count() - 3],
+            remat_bcast);
+}
+
+// Test rematerialization of a single computation produced by
+// MakeRematerializableComputation but with a sufficiently high memory limit
+// such that no instructions are rematerialized.
+TEST_F(HloRematerializationTest, SingleComputationNoRematerialization) {
+  auto module = CreateNewModule();
+  HloComputation* computation =
+      module->AddEntryComputation(MakeRematerializableComputation());
+
+  EXPECT_EQ(computation->instruction_count(), 7);
+
+  SequentialHloOrdering::HloModuleSequence sequence;
+  TF_ASSIGN_OR_ASSERT_OK(
+      bool changed,
+      HloRematerialization::RematerializeAndSchedule(
+          ByteSizeOf,
+          /*memory_limit_bytes=*/20 * 1024, module.get(), &sequence));
+
+  // No instructions should have been rematerialized.
+  EXPECT_FALSE(changed);
+  EXPECT_EQ(computation->instruction_count(), 7);
+}
+
+// Test rematerialization of a computation which calls another computation via
+// a while. Both the entry computation and while body computation can have
+// memory usage reduced via rematerialization; however, the memory limit is set
+// such that only one computation needs to have an instruction rematerialized.
+// The entry computation should be the one chosen because rematerialization in
+// the while will presumably be more expensive.
+TEST_F(HloRematerializationTest, RematerializeAroundWhile) {
+  auto module = CreateNewModule();
+
+  auto cond_builder = HloComputation::Builder(TestName() + ".cond");
+  cond_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, vec1_shape_, "param"));
+  cond_builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(true)));
+  HloComputation* while_cond =
+      module->AddEmbeddedComputation(cond_builder.Build());
+
+  HloComputation* body_computation = module->AddEmbeddedComputation(
+      MakeRematerializableComputation(/*suffix=*/".body"));
+  HloComputation* entry_computation =
+      module->AddEntryComputation(MakeRematerializableWhileComputation(
+          while_cond, /*while_body=*/body_computation));
+
+  EXPECT_EQ(entry_computation->instruction_count(), 6);
+  EXPECT_EQ(body_computation->instruction_count(), 7);
+
+  // The body computation uses 16KB and the entry computation uses 2KB at the
+  // while, so the peak memory use of the module is 18KB. Set the memory limit
+  // a bit lower (17KB) to force rematerialization of the entry computation.
+  SequentialHloOrdering::HloModuleSequence sequence;
+  TF_ASSIGN_OR_ASSERT_OK(
+      bool changed,
+      HloRematerialization::RematerializeAndSchedule(
+          ByteSizeOf,
+          /*memory_limit_bytes=*/17 * 1024, module.get(), &sequence));
+  EXPECT_TRUE(changed);
+
+  // Only the entry computation should have a rematerialized instruction added.
+  EXPECT_EQ(entry_computation->instruction_count(), 7);
+  EXPECT_EQ(body_computation->instruction_count(), 7);
+}
+
+// Test rematerialization of a computation which calls another computation via
+// a while. Both the entry computation and while body computation should have
+// instructions rematerialized.
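+// With the 15KB limit used below (less than both the body's ~16KB peak and
+// the module's ~18KB peak noted in the previous test), neither computation
+// fits as-is, so each must rematerialize its broadcast.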
+TEST_F(HloRematerializationTest, RematerializeEntryAndWhileBody) {
+  auto module = CreateNewModule();
+
+  auto cond_builder = HloComputation::Builder(TestName() + ".cond");
+  cond_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, vec1_shape_, "param"));
+  cond_builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(true)));
+  HloComputation* while_cond =
+      module->AddEmbeddedComputation(cond_builder.Build());
+
+  HloComputation* body_computation = module->AddEmbeddedComputation(
+      MakeRematerializableComputation(/*suffix=*/".body"));
+  HloComputation* entry_computation =
+      module->AddEntryComputation(MakeRematerializableWhileComputation(
+          while_cond, /*while_body=*/body_computation));
+
+  EXPECT_EQ(entry_computation->instruction_count(), 6);
+  EXPECT_EQ(body_computation->instruction_count(), 7);
+
+  SequentialHloOrdering::HloModuleSequence sequence;
+  TF_ASSIGN_OR_ASSERT_OK(
+      bool changed,
+      HloRematerialization::RematerializeAndSchedule(
+          ByteSizeOf,
+          /*memory_limit_bytes=*/15 * 1024, module.get(), &sequence));
+  EXPECT_TRUE(changed);
+
+  // Both computations should have a rematerialized instruction added.
+  EXPECT_EQ(entry_computation->instruction_count(), 7);
+  EXPECT_EQ(body_computation->instruction_count(), 8);
+}
+
+// Test rematerialization of a doubly nested computation. All computations
+// should have an instruction rematerialized.
+TEST_F(HloRematerializationTest, RematerializeNestedComputations) {
+  auto module = CreateNewModule();
+
+  auto cond_builder = HloComputation::Builder(TestName() + ".cond");
+  cond_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, vec1_shape_, "param"));
+  cond_builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(true)));
+  HloComputation* while_cond =
+      module->AddEmbeddedComputation(cond_builder.Build());
+
+  HloComputation* inner_computation = module->AddEmbeddedComputation(
+      MakeRematerializableComputation(/*suffix=*/".inner"));
+  HloComputation* middle_computation =
+      module->AddEmbeddedComputation(MakeRematerializableWhileComputation(
+          while_cond, /*while_body=*/inner_computation,
+          /*suffix=*/".middle"));
+  HloComputation* entry_computation =
+      module->AddEntryComputation(MakeRematerializableWhileComputation(
+          while_cond, /*while_body=*/middle_computation));
+
+  EXPECT_EQ(entry_computation->instruction_count(), 6);
+  EXPECT_EQ(middle_computation->instruction_count(), 6);
+  EXPECT_EQ(inner_computation->instruction_count(), 7);
+
+  // If all computations are maximally rematerialized then peak memory usage is
+  // ~12KB, so pick something slightly larger.
+  SequentialHloOrdering::HloModuleSequence sequence;
+  TF_ASSIGN_OR_ASSERT_OK(
+      bool changed,
+      HloRematerialization::RematerializeAndSchedule(
+          ByteSizeOf,
+          /*memory_limit_bytes=*/13 * 1024, module.get(), &sequence));
+  EXPECT_TRUE(changed);
+
+  // All computations should have a rematerialized instruction added.
+  EXPECT_EQ(entry_computation->instruction_count(), 7);
+  EXPECT_EQ(middle_computation->instruction_count(), 7);
+  EXPECT_EQ(inner_computation->instruction_count(), 8);
+}
+
+TEST_F(HloRematerializationTest, InstructionRematerializedMultipleTimes) {
+  // Test that a single instruction is rematerialized several times.
+  // Module:
+  //
+  // Entry computation:
+  //   F32[] %param = {...}
+  //   F32[1024] %bcast = broadcast(%param)
+  //   F32[1024] %add_1 = add(%bcast, %bcast)
+  //   F32[1024] %call_1 = call(Subcomputation, {%add_1})
+  //   F32[1024] %add_2 = add(%bcast, %call_1)
+  //   F32[1024] %call_2 = call(Subcomputation, {%add_2})
+  //   F32[1024] %add_3 = add(%bcast, %call_2)
+  //   F32[1024] %call_3 = call(Subcomputation, {%add_3})
+  //   F32[1024] %add_4 = add(%bcast, %call_3)
+  //
+  // Subcomputation:
+  //   F32[1024] %param = {...}
+  //   F32[2048] %concat = concat({%param, %param})
+  //   F32[1024] %slice = slice(%concat)
+  //
+  // The value %bcast is live across each call of Subcomputation (which
+  // requires 8KB) though the value is not used in the calls. Rematerializing
+  // %bcast across these calls reduces peak memory use from ~20KB down to
+  // ~16KB.
+  auto module = CreateNewModule();
+
+  HloComputation* subcomputation = nullptr;
+  {
+    auto builder = HloComputation::Builder(TestName() + ".subcomputation");
+    auto param = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, vec1024_shape_, "param"));
+    auto concat = builder.AddInstruction(HloInstruction::CreateConcatenate(
+        ShapeUtil::MakeShape(xla::F32, {2048}), {param, param},
+        /*dimension=*/0));
+    builder.AddInstruction(HloInstruction::CreateSlice(
+        vec1024_shape_, concat, /*start_indices=*/{0},
+        /*limit_indices=*/{1024}, /*strides=*/{1}));
+    subcomputation = module->AddEmbeddedComputation(builder.Build());
+  }
+
+  auto builder = HloComputation::Builder(TestName());
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape_, "param"));
+  auto bcast = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(vec1024_shape_, param, {}));
+  auto add_1 = builder.AddInstruction(HloInstruction::CreateBinary(
+      vec1024_shape_, HloOpcode::kAdd, bcast, bcast));
+  auto call_1 = builder.AddInstruction(
+      HloInstruction::CreateCall(vec1024_shape_, {add_1}, subcomputation));
+  auto add_2 = builder.AddInstruction(HloInstruction::CreateBinary(
+      vec1024_shape_, HloOpcode::kAdd, bcast, call_1));
+  auto call_2 = builder.AddInstruction(
+      HloInstruction::CreateCall(vec1024_shape_, {add_2}, subcomputation));
+  auto add_3 = builder.AddInstruction(HloInstruction::CreateBinary(
+      vec1024_shape_, HloOpcode::kAdd, bcast, call_2));
+  auto call_3 = builder.AddInstruction(
+      HloInstruction::CreateCall(vec1024_shape_, {add_3}, subcomputation));
+  auto add_4 = builder.AddInstruction(HloInstruction::CreateBinary(
+      vec1024_shape_, HloOpcode::kAdd, bcast, call_3));
+  HloComputation* entry_computation =
+      module->AddEntryComputation(builder.Build());
+
+  auto count_broadcasts = [](const HloComputation* computation) {
+    int64 bcast_count = 0;
+    for (auto& instruction : computation->instructions()) {
+      if (instruction->opcode() == HloOpcode::kBroadcast) {
+        bcast_count++;
+      }
+    }
+    return bcast_count;
+  };
+
+  // Before rematerialization there should be a single broadcast instruction in
+  // the graph.
+  EXPECT_EQ(count_broadcasts(entry_computation), 1);
+  EXPECT_EQ(entry_computation->instruction_count(), 9);
+
+  EXPECT_EQ(add_2->operand(0), bcast);
+  EXPECT_EQ(add_3->operand(0), bcast);
+  EXPECT_EQ(add_4->operand(0), bcast);
+
+  SequentialHloOrdering::HloModuleSequence sequence;
+  // Pick a memory limit somewhere between 20KB (peak memory possible with
+  // rematerialization) and 24KB (initial peak memory including parameter and
+  // output).
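+  // (Each F32[1024] value occupies 4KB, so each rematerialization of %bcast
+  // trades one extra broadcast instruction for roughly 4KB less live memory
+  // across the corresponding call.)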
+  TF_ASSIGN_OR_ASSERT_OK(
+      bool changed,
+      HloRematerialization::RematerializeAndSchedule(
+          ByteSizeOf,
+          /*memory_limit_bytes=*/22 * 1024, module.get(), &sequence));
+  EXPECT_TRUE(changed);
+
+  // The broadcast should have been rematerialized 3 times.
+  EXPECT_EQ(count_broadcasts(entry_computation), 4);
+  EXPECT_EQ(entry_computation->instruction_count(), 12);
+
+  // The operands of add_2, add_3, and add_4 should all be rematerialized
+  // broadcasts.
+  EXPECT_NE(add_2->operand(0), bcast);
+  EXPECT_THAT(add_2->operand(0), op::Broadcast(param));
+  EXPECT_NE(add_3->operand(0), bcast);
+  EXPECT_THAT(add_3->operand(0), op::Broadcast(param));
+  EXPECT_NE(add_4->operand(0), bcast);
+  EXPECT_THAT(add_4->operand(0), op::Broadcast(param));
+}
+
+class IndirectUseTest : public HloRematerializationTest,
+                        public ::testing::WithParamInterface<bool> {};
+
+TEST_P(IndirectUseTest, IndirectUseNotRematerialized) {
+  // Test that a rematerializable instruction is not rematerialized if it has
+  // an indirect use. The test is parameterized on whether the value has an
+  // indirect use, and the instruction should be rematerialized iff the value
+  // has no indirect use. Module:
+  //
+  // Entry computation:
+  //   F32[] %param = {...}
+  //   F32[1024] %bcast = broadcast(%param)
+  //   F32[1024] %add_1 = add(%bcast, %bcast)
+  //   F32[1024] %call = call(Subcomputation, {%add_1})
+  //   F32[1024] %add_2 = add(%bcast, %call)
+  //   {F32[1024], F32[1024]} %tuple = tuple(%bcast, %add_2)
+  //   F32[1024] %gte = GetTupleElement(%tuple, 0)
+  //   F32[1024] %negate = negate(%gte)
+  //
+  // Subcomputation:
+  //   F32[1024] %param = {...}
+  //   F32[2048] %concat = concat({%param, %param})
+  //   F32[1024] %slice = slice(%concat)
+  //
+  // The value %bcast is live across the call and rematerialization of %bcast
+  // across that point would reduce peak memory use by 4KB. However, %bcast is
+  // used indirectly by %negate, so rematerialization should not happen.
+  //
+  // This test is parameterized on whether the broadcast has an indirect use or
+  // not. The indirect use is controlled by the index of the GetTupleElement
+  // instruction. If the element is 0, then the %negate operand aliases %bcast
+  // (i.e., %bcast is used indirectly by %negate); otherwise the %negate
+  // operand aliases %add_2.
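+  // Put differently: with tuple index 0, %gte aliases %bcast, so even if every
+  // direct use of %bcast were replaced with a rematerialized copy, %negate
+  // would still observe the original buffer through the tuple. The pass
+  // therefore must leave %bcast alone.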
+  const bool indirectly_used = GetParam();
+  auto module = CreateNewModule();
+
+  HloComputation* subcomputation = nullptr;
+  {
+    auto builder = HloComputation::Builder(TestName() + ".subcomputation");
+    auto param = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, vec1024_shape_, "param"));
+    auto concat = builder.AddInstruction(HloInstruction::CreateConcatenate(
+        ShapeUtil::MakeShape(xla::F32, {2048}), {param, param},
+        /*dimension=*/0));
+    builder.AddInstruction(HloInstruction::CreateSlice(
+        vec1024_shape_, concat, /*start_indices=*/{0},
+        /*limit_indices=*/{1024}, /*strides=*/{1}));
+    subcomputation = module->AddEmbeddedComputation(builder.Build());
+  }
+
+  auto builder = HloComputation::Builder(TestName());
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape_, "param"));
+  auto bcast = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(vec1024_shape_, param, {}));
+  auto add_1 = builder.AddInstruction(HloInstruction::CreateBinary(
+      vec1024_shape_, HloOpcode::kAdd, bcast, bcast));
+  auto call_1 = builder.AddInstruction(
+      HloInstruction::CreateCall(vec1024_shape_, {add_1}, subcomputation));
+  auto add_2 = builder.AddInstruction(HloInstruction::CreateBinary(
+      vec1024_shape_, HloOpcode::kAdd, bcast, call_1));
+  auto tuple =
+      builder.AddInstruction(HloInstruction::CreateTuple({bcast, add_2}));
+  auto gte = builder.AddInstruction(HloInstruction::CreateGetTupleElement(
+      vec1024_shape_, tuple, indirectly_used ? 0 : 1));
+  builder.AddInstruction(
+      HloInstruction::CreateUnary(vec1024_shape_, HloOpcode::kNegate, gte));
+  HloComputation* entry_computation =
+      module->AddEntryComputation(builder.Build());
+
+  EXPECT_EQ(entry_computation->instruction_count(), 8);
+
+  SequentialHloOrdering::HloModuleSequence sequence;
+  // Pick a memory limit somewhere between 20KB (peak memory possible with
+  // rematerialization) and 24KB (initial peak memory including parameter and
+  // output).
+  TF_ASSIGN_OR_ASSERT_OK(
+      bool changed,
+      HloRematerialization::RematerializeAndSchedule(
+          ByteSizeOf,
+          /*memory_limit_bytes=*/22 * 1024, module.get(), &sequence));
+  // Rematerialization should only occur if the rematerializable instruction
+  // has no indirect uses.
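+  // The parameter selects the GetTupleElement index above: true picks index 0
+  // (%bcast is aliased through the tuple, so no rematerialization), false
+  // picks index 1 (no aliasing, so %bcast is rematerialized).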
+  if (indirectly_used) {
+    EXPECT_FALSE(changed);
+    EXPECT_EQ(entry_computation->instruction_count(), 8);
+  } else {
+    EXPECT_TRUE(changed);
+    EXPECT_EQ(entry_computation->instruction_count(), 9);
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(IndirectUseTestInstantiation, IndirectUseTest,
+                        ::testing::Values(true, false));
+
+}  // namespace
+
+}  // namespace xla
+
+int main(int argc, char** argv) {
+  return xla::ParseDebugOptionsFlagsAndRunTests(argc, argv);
+}
diff --git a/tensorflow/compiler/xla/service/hlo_subcomputation_unification_test.cc b/tensorflow/compiler/xla/service/hlo_subcomputation_unification_test.cc
index 14800b53420..867ebc7f61a 100644
--- a/tensorflow/compiler/xla/service/hlo_subcomputation_unification_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_subcomputation_unification_test.cc
@@ -66,13 +66,13 @@ class HloSubcomputationUnificationTest : public HloTestBase {
 };
 
 TEST_F(HloSubcomputationUnificationTest, UnifyIdentities) {
-  auto hlo_module = MakeUnique<HloModule>("test_module");
+  auto module = CreateNewModule();
   auto builder = HloComputation::Builder(TestName());
 
   auto callee1 =
-      hlo_module->AddEmbeddedComputation(CreateR0S32IdentityComputation());
+      module->AddEmbeddedComputation(CreateR0S32IdentityComputation());
   auto callee2 =
-      hlo_module->AddEmbeddedComputation(CreateR0S32IdentityComputation());
+      module->AddEmbeddedComputation(CreateR0S32IdentityComputation());
 
   auto constant = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(5)));
@@ -83,32 +83,31 @@ TEST_F(HloSubcomputationUnificationTest, UnifyIdentities) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0s32_, HloOpcode::kAdd, x, y));
 
-  hlo_module->AddEntryComputation(builder.Build());
+  module->AddEntryComputation(builder.Build());
 
-  EXPECT_EQ(3, hlo_module->computations().size());
+  EXPECT_EQ(3, module->computations().size());
   EXPECT_NE(x->to_apply(), y->to_apply());
 
   if (VLOG_IS_ON(1)) {
-    hlo_graph_dumper::DumpGraph(*hlo_module->entry_computation(),
+    hlo_graph_dumper::DumpGraph(*module->entry_computation(),
                                 "before unification", false, false, nullptr);
   }
-  EXPECT_TRUE(
-      HloSubcomputationUnification().Run(hlo_module.get()).ValueOrDie());
+  EXPECT_TRUE(HloSubcomputationUnification().Run(module.get()).ValueOrDie());
   if (VLOG_IS_ON(1)) {
-    hlo_graph_dumper::DumpGraph(*hlo_module->entry_computation(),
+    hlo_graph_dumper::DumpGraph(*module->entry_computation(),
                                 "after unification", false, false, nullptr);
   }
-  EXPECT_EQ(2, hlo_module->computations().size());
+  EXPECT_EQ(2, module->computations().size());
   EXPECT_EQ(x->to_apply(), y->to_apply());
 }
 
 TEST_F(HloSubcomputationUnificationTest, UnifyAdditions) {
-  auto hlo_module = MakeUnique<HloModule>("test_module");
+  auto module = CreateNewModule();
   auto builder = HloComputation::Builder(TestName());
 
   auto callee1 =
-      hlo_module->AddEmbeddedComputation(CreateR0S32AdditionComputation());
+      module->AddEmbeddedComputation(CreateR0S32AdditionComputation());
   auto callee2 =
-      hlo_module->AddEmbeddedComputation(CreateR0S32AdditionComputation());
+      module->AddEmbeddedComputation(CreateR0S32AdditionComputation());
 
   auto constant1 = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(5)));
@@ -121,33 +120,32 @@ TEST_F(HloSubcomputationUnificationTest, UnifyAdditions) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0s32_, HloOpcode::kAdd, x, y));
 
-  hlo_module->AddEntryComputation(builder.Build());
+  module->AddEntryComputation(builder.Build());
 
-  EXPECT_EQ(3, hlo_module->computations().size());
+  EXPECT_EQ(3, module->computations().size());
   EXPECT_NE(x->to_apply(), y->to_apply());
 
   if (VLOG_IS_ON(1)) {
-    hlo_graph_dumper::DumpGraph(*hlo_module->entry_computation(),
+    hlo_graph_dumper::DumpGraph(*module->entry_computation(),
                                 "before unification", false, false, nullptr);
   }
-  EXPECT_TRUE(
-      HloSubcomputationUnification().Run(hlo_module.get()).ValueOrDie());
+  EXPECT_TRUE(HloSubcomputationUnification().Run(module.get()).ValueOrDie());
   if (VLOG_IS_ON(1)) {
-    hlo_graph_dumper::DumpGraph(*hlo_module->entry_computation(),
+    hlo_graph_dumper::DumpGraph(*module->entry_computation(),
                                 "after unification", false, false, nullptr);
   }
-  EXPECT_EQ(2, hlo_module->computations().size());
+  EXPECT_EQ(2, module->computations().size());
   EXPECT_EQ(x->to_apply(), y->to_apply());
 }
 
 // Do not unify subcomputations with different parameter shapes.
 TEST_F(HloSubcomputationUnificationTest, DifferentParameterShapes) {
-  auto hlo_module = MakeUnique<HloModule>("test_module");
+  auto module = CreateNewModule();
   auto builder = HloComputation::Builder(TestName());
 
-  auto callee1 = hlo_module->AddEmbeddedComputation(
-      CreateR1S32AdditionComputation(r1s32_5_));
-  auto callee2 = hlo_module->AddEmbeddedComputation(
-      CreateR1S32AdditionComputation(r1s32_3_));
+  auto callee1 =
+      module->AddEmbeddedComputation(CreateR1S32AdditionComputation(r1s32_5_));
+  auto callee2 =
+      module->AddEmbeddedComputation(CreateR1S32AdditionComputation(r1s32_3_));
 
   auto param1 = builder.AddInstruction(
       HloInstruction::CreateParameter(0, r1s32_5_, "param1"));
@@ -160,28 +158,27 @@ TEST_F(HloSubcomputationUnificationTest, DifferentParameterShapes) {
   builder.AddInstruction(HloInstruction::CreateConcatenate(
       ShapeUtil::MakeShape(S32, {8}), {x, y}, 0));
 
-  hlo_module->AddEntryComputation(builder.Build());
+  module->AddEntryComputation(builder.Build());
 
-  EXPECT_EQ(3, hlo_module->computations().size());
+  EXPECT_EQ(3, module->computations().size());
   EXPECT_NE(x->to_apply(), y->to_apply());
 
   if (VLOG_IS_ON(1)) {
-    hlo_graph_dumper::DumpGraph(*hlo_module->entry_computation(),
+    hlo_graph_dumper::DumpGraph(*module->entry_computation(),
                                 "before unification", false, false, nullptr);
   }
-  EXPECT_FALSE(
-      HloSubcomputationUnification().Run(hlo_module.get()).ValueOrDie());
+  EXPECT_FALSE(HloSubcomputationUnification().Run(module.get()).ValueOrDie());
   if (VLOG_IS_ON(1)) {
-    hlo_graph_dumper::DumpGraph(*hlo_module->entry_computation(),
+    hlo_graph_dumper::DumpGraph(*module->entry_computation(),
                                 "after unification", false, false, nullptr);
   }
-  EXPECT_EQ(3, hlo_module->computations().size());
+  EXPECT_EQ(3, module->computations().size());
   EXPECT_NE(x->to_apply(), y->to_apply());
 }
 
 // Regression test for b/31466798. Checks that entry_computation is still valid
 // after unification.
 TEST_F(HloSubcomputationUnificationTest, TwoIdenticalComputations) {
-  HloModule module(TestName());
+  auto module = CreateNewModule();
   for (int i = 0; i < 2; ++i) {
     HloComputation::Builder builder("pow");
     auto x =
@@ -191,15 +188,19 @@ TEST_F(HloSubcomputationUnificationTest, TwoIdenticalComputations) {
     builder.AddInstruction(
         HloInstruction::CreateBinary(r0f32_, HloOpcode::kPower, x, y));
     if (i == 0) {
-      module.AddEmbeddedComputation(builder.Build());
+      module->AddEmbeddedComputation(builder.Build());
     } else {
-      module.AddEntryComputation(builder.Build());
+      module->AddEntryComputation(builder.Build());
     }
   }
 
-  EXPECT_TRUE(HloSubcomputationUnification().Run(&module).ValueOrDie());
-  EXPECT_EQ(1, module.computations().size());
-  EXPECT_EQ(module.computations().front().get(), module.entry_computation());
+  EXPECT_TRUE(HloSubcomputationUnification().Run(module.get()).ValueOrDie());
+  EXPECT_EQ(1, module->computations().size());
+  EXPECT_EQ(module->computations().front().get(), module->entry_computation());
 }
 
 }  // namespace xla
+
+int main(int argc, char** argv) {
+  return xla::ParseDebugOptionsFlagsAndRunTests(argc, argv);
+}
diff --git a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc
new file mode 100644
index 00000000000..6707b02c5c5
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc
@@ -0,0 +1,214 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_tfgraph_builder.h"
+#include "tensorflow/compiler/xla/layout_util.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+
+using ::tensorflow::GraphDef;
+using ::tensorflow::NodeDef;
+using ::tensorflow::TensorShapeProto;
+using ::tensorflow::strings::StrAppend;
+using ::tensorflow::strings::StrCat;
+using ::tensorflow::str_util::Join;
+
+namespace xla {
+namespace hlo_graph_dumper {
+namespace {
+
+string GetOpDefName(const HloInstruction* instruction) {
+  string name = StrCat("hlo-", HloOpcodeString(instruction->opcode()));
+  tensorflow::str_util::TitlecaseString(&name, "-");
+  name.erase(std::remove(name.begin(), name.end(), '-'), name.end());
+
+  if (instruction->opcode() == HloOpcode::kFusion) {
+    string fusion_name = ToString(instruction->fusion_kind());
+    StrAppend(&name, tensorflow::StringPiece(fusion_name).substr(1));
+  }
+  return name;
+}
+
+TensorShapeProto GetTensorShape(const HloInstruction* instruction) {
+  TensorShapeProto tensor_shape;
+  const Shape& shape = instruction->shape();
+  for (auto dim : shape.dimensions()) {
+    tensor_shape.add_dim()->set_size(dim);
+  }
+  return tensor_shape;
+}
+
+}  // namespace
+
+void CleanNodeName(string* name) {
+  name->erase(std::remove(name->begin(), name->end(), '%'), name->end());
+  const string chars_to_replace = "<>[]";
+  auto pred = [&](char c) {
+    return std::find(chars_to_replace.begin(), chars_to_replace.end(), c) !=
+           chars_to_replace.end();
+  };
+  std::replace_if(name->begin(), name->end(), pred, '_');
+}
+
+Status HloTfGraphBuilder::AddComputation(const HloComputation& computation) {
+  VLOG(2) << "Adding computation " << computation.name();
+  for (auto embedded : computation.MakeEmbeddedComputationsList()) {
+    for (auto& instruction : embedded->instructions()) {
+      TF_RETURN_IF_ERROR(AddInstruction(instruction.get()));
+    }
+  }
+  for (auto& instruction : computation.instructions()) {
+    TF_RETURN_IF_ERROR(AddInstruction(instruction.get()));
+  }
+  return Status::OK();
+}
+
+const GraphDef& HloTfGraphBuilder::GetGraphDef() const { return graph_def_; }
+
+const string& HloTfGraphBuilder::GetNodeNameForInstruction(
+    const HloInstruction* instruction) {
+  if (ContainsKey(instruction_to_node_name_, instruction)) {
+    return instruction_to_node_name_[instruction];
+  }
+  string node_name;
+  // If an instruction is fused, put it in the subgraph of the fusion;
+  // otherwise, put it in the computation subgraph.
+  if (instruction->IsFused()) {
+    node_name = GetNodeNameForInstruction(instruction->fusion_instruction());
+  } else {
+    node_name = instruction->parent()->name();
+    if (!instruction->metadata().op_name().empty()) {
+      // Always make computations contain TF ops but not the other way around.
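+      // For example (hypothetical names): an unfused instruction %add.3 in
+      // computation "cluster_0" whose metadata op_name is "dense/add" ends up
+      // with the node name "cluster_0/dense/add/add.3" after the StrAppend
+      // calls below.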
+      StrAppend(&node_name, "/", instruction->metadata().op_name());
+    }
+  }
+  string instruction_name = instruction->name();
+  if (instruction->opcode() == HloOpcode::kParameter) {
+    StrAppend(&instruction_name, ".", instruction->parameter_number());
+  }
+  StrAppend(&node_name, "/", instruction_name);
+  CleanNodeName(&node_name);
+  auto ret =
+      instruction_to_node_name_.insert(std::make_pair(instruction, node_name));
+  CHECK(ret.second);
+  return ret.first->second;
+}
+
+void HloTfGraphBuilder::SetNodeAttrs(const HloInstruction* instruction,
+                                     NodeDef* node_def) const {
+  auto& attrs = *node_def->mutable_attr();
+
+  // Set the number of arguments for instructions that have variadic operands.
+  if (HloOpcodeIsVariadic(instruction->opcode())) {
+    tensorflow::AttrValue attr_value;
+    attr_value.set_i(instruction->operands().size());
+    attrs["arg_num"] = attr_value;
+  }
+
+  // Set the node type.
+  attrs["type"].set_s(
+      xla::PrimitiveType_Name(instruction->shape().element_type()));
+
+  // Set the framework op (e.g., the TensorFlow op) that generated this XLA op.
+  attrs["tf_op_type"].set_s(instruction->metadata().op_type());
+  attrs["tf_op_name"].set_s(instruction->metadata().op_name());
+
+  // Set the shape of the output tensor. "_output_shapes" is a special
+  // attribute name used by TensorBoard for shapes of output tensors.
+  tensorflow::AttrValue shapes;
+  *shapes.mutable_list()->add_shape() = GetTensorShape(instruction);
+  attrs["_output_shapes"] = shapes;
+
+  // Set the layout.
+  if (LayoutUtil::HasLayout(instruction->shape())) {
+    string layout_string;
+    if (ShapeUtil::IsTuple(instruction->shape())) {
+      // For tuples, emit the full shape because the layout of a tuple is not
+      // represented in a single Layout field.
+      layout_string = ShapeUtil::HumanStringWithLayout(instruction->shape());
+    } else {
+      layout_string = StrCat(
+          "{", Join(instruction->shape().layout().minor_to_major(), ","), "}");
+    }
+    attrs["layout"].set_s(layout_string);
+  }
+
+  // Set op-specific attributes.
+  switch (instruction->opcode()) {
+    case HloOpcode::kConcatenate:
+    case HloOpcode::kBroadcast:
+    case HloOpcode::kReduce:
+    case HloOpcode::kReverse:
+    case HloOpcode::kTranspose:
+      for (auto dim : instruction->dimensions()) {
+        attrs["dims"].mutable_list()->add_i(dim);
+      }
+      break;
+    case HloOpcode::kGetTupleElement:
+      attrs["index"].set_i(instruction->tuple_index());
+      break;
+    case HloOpcode::kRng:
+      attrs["dist"].set_s(
+          RandomDistribution_Name(instruction->random_distribution()));
+      break;
+    case HloOpcode::kConstant:
+      if (ShapeUtil::IsScalar(instruction->shape())) {
+        attrs["value"].set_s(
+            LiteralUtil::GetAsString(instruction->literal(), {}));
+      }
+      break;
+    case HloOpcode::kCustomCall:
+      attrs["custom_call_target"].set_s(instruction->custom_call_target());
+      break;
+    default:
+      break;
+  }
+}
+
+Status HloTfGraphBuilder::AddInstruction(const HloInstruction* instruction) {
+  if (!visited_instructions_.insert(instruction).second) {
+    // Skip instructions that have already been added.
+    return Status::OK();
+  }
+
+  NodeDef* node_def = graph_def_.add_node();
+  node_def->set_name(GetNodeNameForInstruction(instruction));
+  node_def->set_op(GetOpDefName(instruction));
+  SetNodeAttrs(instruction, node_def);
+  if (instruction->opcode() == HloOpcode::kFusion) {
+    for (auto& fused_instruction : instruction->fused_instructions()) {
+      TF_RETURN_IF_ERROR(AddInstruction(fused_instruction.get()));
+    }
+  }
+  // Add all edges, including control edges.
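+  // In GraphDef form, a control input is the producer's node name prefixed
+  // with '^'; that convention is used below to encode called computations as
+  // control dependencies of the caller.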
+  for (unsigned i = 0; i < instruction->operands().size(); ++i) {
+    *node_def->add_input() = GetNodeNameForInstruction(instruction->operand(i));
+  }
+  // Called computations are control dependencies.
+  for (const auto* called_computation : instruction->called_computations()) {
+    *node_def->add_input() = StrCat(
+        "^", GetNodeNameForInstruction(called_computation->root_instruction()));
+  }
+  return Status::OK();
+}
+
+}  // namespace hlo_graph_dumper
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.h b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.h
new file mode 100644
index 00000000000..b2c578af912
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.h
@@ -0,0 +1,58 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TFGRAPH_BUILDER_H_
+#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TFGRAPH_BUILDER_H_
+
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+
+namespace xla {
+namespace hlo_graph_dumper {
+
+// This constructs a TensorFlow graph for HLO computations.
+class HloTfGraphBuilder {
+ public:
+  // Adds a computation to the graph.
+  Status AddComputation(const HloComputation& computation);
+
+  const tensorflow::GraphDef& GetGraphDef() const;
+
+ private:
+  // Gets the node name of an instruction. The node name is hierarchical. For
+  // example, if an instruction is fused, it will be put in a subgraph of the
+  // fusion instruction.
+  const string& GetNodeNameForInstruction(const HloInstruction* instruction);
+
+  void SetNodeAttrs(const HloInstruction* instruction,
+                    tensorflow::NodeDef* node_def) const;
+
+  Status AddInstruction(const HloInstruction* instruction);
+
+  tensorflow::GraphDef graph_def_;
+  // This records instructions that have been visited.
+  std::unordered_set<const HloInstruction*> visited_instructions_;
+  // A cache that maps an instruction to its node name.
+  std::unordered_map<const HloInstruction*, string> instruction_to_node_name_;
+};
+
+// Cleans the node name to make it a valid name in a TensorFlow graph.
+void CleanNodeName(string* name);
+
+}  // namespace hlo_graph_dumper
+}  // namespace xla
+
+#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TFGRAPH_BUILDER_H_
diff --git a/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc b/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc
new file mode 100644
index 00000000000..c2718ea8003
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc
@@ -0,0 +1,188 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_tfgraph_builder.h"
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
+
+namespace xla {
+namespace hlo_graph_dumper {
+namespace {
+
+using ::tensorflow::GraphDef;
+
+class HloTfGraphBuilderTest : public HloTestBase {
+ protected:
+  HloTfGraphBuilderTest() {}
+  HloTfGraphBuilder generator_;
+
+  // Creates a computation which takes a scalar and returns its negation.
+  std::unique_ptr<HloComputation> CreateNegateComputation() {
+    auto builder = HloComputation::Builder("Negate");
+    auto param = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, r0f32_, "param0"));
+    builder.AddInstruction(
+        HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, param));
+    return builder.Build();
+  }
+
+  // Creates a computation which calls map with the given computation.
+  std::unique_ptr<HloComputation> CreateMapComputation(
+      HloComputation *map_computation) {
+    auto builder = HloComputation::Builder("Map");
+    auto param = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, r0f32_, "param0"));
+    builder.AddInstruction(
+        HloInstruction::CreateMap(r0f32_, {param}, map_computation));
+    return builder.Build();
+  }
+  Shape r0f32_ = ShapeUtil::MakeShape(PrimitiveType::F32, {});
+};
+
+static const tensorflow::AttrValue &GetNodeAttr(const tensorflow::NodeDef &node,
+                                                const string &attr_name) {
+  auto attr = node.attr().find(attr_name);
+  CHECK(attr != node.attr().end());
+  return attr->second;
+}
+
+TEST_F(HloTfGraphBuilderTest, CheckConcatenateDimsAndShapes) {
+  auto builder = HloComputation::Builder("Concatenate");
+  Shape shape = ShapeUtil::MakeShape(PrimitiveType::F32, {2, 2});
+  auto param_1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, shape, "param0"));
+  auto param_2 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, shape, "param1"));
+  builder.AddInstruction(HloInstruction::CreateConcatenate(
+      ShapeUtil::MakeShape(PrimitiveType::F32, {2, 4}), {param_1, param_2}, 1));
+  TF_CHECK_OK(generator_.AddComputation(*builder.Build()));
+  GraphDef graph_def = generator_.GetGraphDef();
+  EXPECT_EQ(graph_def.node_size(), 3);
+  const auto &node = graph_def.node(2);
+  EXPECT_EQ(node.name(), "Concatenate/concatenate");
+
+  // Check dimensions.
+  auto dims_value = GetNodeAttr(node, "dims");
+  EXPECT_EQ(dims_value.list().i_size(), 1);
+  EXPECT_EQ(dims_value.list().i(0), 1);
+
+  // Check shapes.
+  auto shape_value = GetNodeAttr(node, "_output_shapes");
+  EXPECT_EQ(shape_value.list().shape_size(), 1);
+  EXPECT_EQ(shape_value.list().shape(0).dim_size(), 2);
+  EXPECT_EQ(shape_value.list().shape(0).dim(0).size(), 2);
+  EXPECT_EQ(shape_value.list().shape(0).dim(1).size(), 4);
+}
+
+TEST_F(HloTfGraphBuilderTest, CheckScalarValue) {
+  auto builder = HloComputation::Builder("Const");
+  HloInstruction *instruction = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(123)));
+  OpMetadata metadata;
+  metadata.set_op_name("x");
+  metadata.set_op_type("y");
+  instruction->set_metadata(metadata);
+  TF_CHECK_OK(generator_.AddComputation(*builder.Build()));
+  GraphDef graph_def = generator_.GetGraphDef();
+  EXPECT_EQ(graph_def.node_size(), 1);
+  const auto &node = graph_def.node(0);
+  EXPECT_EQ(GetNodeAttr(node, "value").s(), "123");
+  EXPECT_EQ(GetNodeAttr(node, "type").s(), "S32");
+  EXPECT_EQ(GetNodeAttr(node, "tf_op_name").s(), "x");
+  EXPECT_EQ(GetNodeAttr(node, "tf_op_type").s(), "y");
+}
+
+TEST_F(HloTfGraphBuilderTest, SimpleNegateComputation) {
+  auto negate_computation = CreateNegateComputation();
+  TF_CHECK_OK(generator_.AddComputation(*negate_computation));
+  GraphDef graph_def = generator_.GetGraphDef();
+  EXPECT_EQ(graph_def.node_size(), 2);
+  EXPECT_EQ(graph_def.node(0).name(), "Negate/param0.0");
+  EXPECT_EQ(graph_def.node(0).op(), "HloParameter");
+  EXPECT_EQ(graph_def.node(1).name(), "Negate/negate");
+  EXPECT_EQ(graph_def.node(1).op(), "HloNegate");
+  EXPECT_EQ(graph_def.node(1).input_size(), 1);
+  EXPECT_EQ(graph_def.node(1).input(0), "Negate/param0.0");
+}
+
+TEST_F(HloTfGraphBuilderTest, GreaterThanOrEqualTo) {
+  auto builder = HloComputation::Builder("GE");
+  auto param_1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0f32_, "param0"));
+  auto param_2 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, r0f32_, "param1"));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(r0f32_, HloOpcode::kGe, param_1, param_2));
+  TF_CHECK_OK(generator_.AddComputation(*builder.Build()));
+  GraphDef graph_def = generator_.GetGraphDef();
+  EXPECT_EQ(graph_def.node_size(), 3);
+  EXPECT_EQ(graph_def.node(0).name(), "GE/param0.0");
+  EXPECT_EQ(graph_def.node(1).name(), "GE/param1.1");
+  EXPECT_EQ(graph_def.node(2).input_size(), 2);
+  EXPECT_EQ(graph_def.node(2).name(), "GE/greater-than-or-equal-to");
+  EXPECT_EQ(graph_def.node(2).op(), "HloGreaterThanOrEqualTo");
+}
+
+TEST_F(HloTfGraphBuilderTest, IncorporateTfOpsStructure) {
+  auto builder = HloComputation::Builder("GE");
+  auto param_1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0f32_, "param0"));
+  auto param_2 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, r0f32_, "param1"));
+  auto ge = builder.AddInstruction(
+      HloInstruction::CreateBinary(r0f32_, HloOpcode::kGe, param_1, param_2));
+  OpMetadata metadata;
+  metadata.set_op_name("x/y");
+  metadata.set_op_type("Y");
+  ge->set_metadata(metadata);
+  TF_CHECK_OK(generator_.AddComputation(*builder.Build()));
+  GraphDef graph_def = generator_.GetGraphDef();
+  EXPECT_EQ(graph_def.node_size(), 3);
+  EXPECT_EQ(graph_def.node(0).name(), "GE/param0.0");
+  EXPECT_EQ(graph_def.node(1).name(), "GE/param1.1");
+  EXPECT_EQ(graph_def.node(2).input_size(), 2);
+  EXPECT_EQ(graph_def.node(2).name(), "GE/x/y/greater-than-or-equal-to");
+  EXPECT_EQ(graph_def.node(2).op(), "HloGreaterThanOrEqualTo");
+}
+
+TEST_F(HloTfGraphBuilderTest, EmbeddedComputationsDiamond) {
+  // Create computations with a diamond-shaped callgraph.
+
+TEST_F(HloTfGraphBuilderTest, EmbeddedComputationsDiamond) {
+  // Create computations with a diamond-shaped callgraph.
+  auto negate_computation = CreateNegateComputation();
+  auto map1_computation = CreateMapComputation(negate_computation.get());
+  auto map2_computation = CreateMapComputation(negate_computation.get());
+
+  auto builder = HloComputation::Builder(TestName());
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0f32_, "param0"));
+  auto map1 = builder.AddInstruction(
+      HloInstruction::CreateMap(r0f32_, {param}, map1_computation.get()));
+  auto map2 = builder.AddInstruction(
+      HloInstruction::CreateMap(r0f32_, {param}, map2_computation.get()));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, map1, map2));
+  auto computation = builder.Build();
+  TF_CHECK_OK(generator_.AddComputation(*computation));
+  EXPECT_GT(generator_.GetGraphDef().node_size(), 0);
+}
+
+}  // namespace
+}  // namespace hlo_graph_dumper
+}  // namespace xla
+
+int main(int argc, char **argv) {
+  return xla::ParseDebugOptionsFlagsAndRunTests(argc, argv);
+}
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
new file mode 100644
index 00000000000..de6081e57e7
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -0,0 +1,39 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_verifier.h"
+
+namespace xla {
+
+StatusOr<bool> HloVerifier::Run(HloModule* module) {
+  for (auto& computation : module->computations()) {
+    for (const auto& instruction : computation->instructions()) {
+      TF_RET_CHECK(instruction->parent() == computation.get());
+      if (instruction->opcode() == HloOpcode::kFusion) {
+        for (const auto& fused : instruction->fused_instructions()) {
+          TF_RET_CHECK(fused->parent() ==
+                       instruction->fused_instructions_computation())
+              << "Fused HLO was missing a parent: " << fused->ToString()
+              << " parent: " << fused->parent()
+              << " computation: " << computation.get();
+        }
+      }
+    }
+  }
+
+  return false;
+}
+
+}  // namespace xla
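Since `HloVerifier` is a pure checking pass, it may help to see how a caller would run it. A hedged sketch, assuming XLA's usual status macros (`TF_ASSIGN_OR_RETURN`, `TF_RET_CHECK`); real pipelines wire passes through pass-pipeline plumbing instead:

```cpp
#include "tensorflow/compiler/xla/service/hlo_verifier.h"

namespace xla {

// Sketch only: verify a module a caller already owns.
Status VerifyModule(HloModule* module) {
  HloVerifier verifier;
  TF_ASSIGN_OR_RETURN(bool changed, verifier.Run(module));
  // A pure verification pass reports "no change" on success.
  TF_RET_CHECK(!changed);
  return Status::OK();
}

}  // namespace xla
```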
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h
new file mode 100644
index 00000000000..5159420b3fb
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_verifier.h
@@ -0,0 +1,37 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_VERIFIER_H_
+#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_VERIFIER_H_
+
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+
+// HLO pass that verifies invariants of HLO instructions for each computation
+// in the module.
+class HloVerifier : public HloPassInterface {
+ public:
+  ~HloVerifier() override = default;
+  tensorflow::StringPiece name() const override { return "verifier"; }
+
+  // Note: always returns false (no instructions are ever modified by this
+  // pass).
+  StatusOr<bool> Run(HloModule* module) override;
+};
+
+}  // namespace xla
+
+#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_VERIFIER_H_
diff --git a/tensorflow/compiler/xla/service/inliner_test.cc b/tensorflow/compiler/xla/service/inliner_test.cc
index 0054edcf6ab..2887a8a0a09 100644
--- a/tensorflow/compiler/xla/service/inliner_test.cc
+++ b/tensorflow/compiler/xla/service/inliner_test.cc
@@ -22,13 +22,16 @@ limitations under the License.
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
+namespace op = xla::testing::opcode_matchers;
+
 namespace xla {
 namespace {
 
@@ -56,14 +59,14 @@ TEST_F(InlinerTest, MapMax) {
       HloInstruction::CreateMap(lhs->shape(), {lhs, rhs}, max_f32.get()));
 
   auto computation = builder.Build();
-  auto hlo_module = MakeUnique<HloModule>("test_module");
+  auto hlo_module = CreateNewModule();
   hlo_module->AddEmbeddedComputation(std::move(max_f32));
   hlo_module->AddEntryComputation(std::move(computation));
-  HloInstruction* root = hlo_module->entry_computation()->root_instruction();
+
   Inliner inliner;
   EXPECT_TRUE(inliner.Run(hlo_module.get()).ValueOrDie());
-  root = hlo_module->entry_computation()->root_instruction();
-  EXPECT_EQ(root->opcode(), HloOpcode::kMaximum);
+  EXPECT_THAT(hlo_module->entry_computation()->root_instruction(),
+              op::Maximum(lhs, rhs));
 
   // Verify execution on CPU.
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
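The hunk above swaps opcode-equality checks for the structural matchers in `hlo_matchers.h`. A sketch of how those matchers compose (the helper is illustrative; it assumes gmock's `EXPECT_THAT` is available via `test.h`):

```cpp
#include "tensorflow/compiler/xla/service/hlo_matchers.h"
#include "tensorflow/compiler/xla/test.h"

namespace op = xla::testing::opcode_matchers;

// Illustrative only: matchers nest, so one EXPECT_THAT pins down the root
// opcode and its operand subtree at once, with a readable failure message.
void ExpectInlinedMax(xla::HloComputation* computation,
                      xla::HloInstruction* lhs, xla::HloInstruction* rhs) {
  EXPECT_THAT(computation->root_instruction(), op::Maximum(lhs, rhs));
}
```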
@@ -90,14 +93,14 @@ TEST_F(InlinerTest, MapConstant) {
       HloInstruction::CreateMap(lhs->shape(), {lhs}, const2_f32.get()));
 
   auto computation = builder.Build();
-  auto hlo_module = MakeUnique<HloModule>("test_module");
+  auto hlo_module = CreateNewModule();
   hlo_module->AddEmbeddedComputation(std::move(const2_f32));
   hlo_module->AddEntryComputation(std::move(computation));
   HloInstruction* root = hlo_module->entry_computation()->root_instruction();
   Inliner inliner;
   EXPECT_TRUE(inliner.Run(hlo_module.get()).ValueOrDie());
   root = hlo_module->entry_computation()->root_instruction();
-  EXPECT_EQ(root->opcode(), HloOpcode::kBroadcast);
+  EXPECT_THAT(root, op::Broadcast(op::Constant()));
 
   // Verify execution on CPU.
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
@@ -107,3 +110,7 @@
 
 }  // namespace
 }  // namespace xla
+
+int main(int argc, char** argv) {
+  return xla::ParseDebugOptionsFlagsAndRunTests(argc, argv);
+}
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc
index 42e33d53967..721640cdbd8 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion.cc
@@ -29,7 +29,8 @@ limitations under the License.
 
 namespace xla {
 
-bool IsExpensive(const HloInstruction& instruction) {
+/*static*/ bool InstructionFusion::IsExpensive(
+    const HloInstruction& instruction) {
   switch (instruction.opcode()) {
     // Cheap instructions.
     case HloOpcode::kAbs:
@@ -50,7 +51,7 @@ bool IsExpensive(const HloInstruction& instruction) {
     case HloOpcode::kGetTupleElement:
     case HloOpcode::kGt:
     case HloOpcode::kInfeed:
-    case HloOpcode::kOutfeed:
+    case HloOpcode::kIsFinite:
     case HloOpcode::kLe:
     case HloOpcode::kLogicalAnd:
     case HloOpcode::kLogicalNot:
@@ -61,6 +62,7 @@ bool IsExpensive(const HloInstruction& instruction) {
     case HloOpcode::kMultiply:
     case HloOpcode::kNe:
     case HloOpcode::kNegate:
+    case HloOpcode::kOutfeed:
     case HloOpcode::kPad:
     case HloOpcode::kReshape:
     case HloOpcode::kReverse:
@@ -100,12 +102,18 @@ bool IsExpensive(const HloInstruction& instruction) {
     case HloOpcode::kRecv:
       return true;
   }
+
+  return false;
 }
 
-bool FusionWouldDuplicate(HloInstruction* producer, HloInstruction* consumer) {
-  return !(producer->users().size() == 1 &&
-           producer->users().count(consumer) == 1);
+namespace {
+// Returns true if fusing producer into consumer would cause producer to be
+// duplicated. This is the case if producer has uses other than consumer.
+bool FusionWouldDuplicate(const HloInstruction& producer,
+                          const HloInstruction& consumer) {
+  return !(producer.users().size() == 1 && consumer.IsUserOf(&producer));
 }
+}  // namespace
 
 StatusOr<bool> InstructionFusion::Run(HloModule* module) {
   bool changed = false;
@@ -122,8 +130,54 @@ StatusOr<bool> InstructionFusion::Run(HloModule* module) {
       computation_->MakeInstructionPostOrder();
   std::vector<HloInstruction*> post_order(post_order_list.begin(),
                                           post_order_list.end());
+
+  std::set<HloInstruction*> all_consumers_fusable;
+  // Find which ops can be fused into all of their consumers. We would rather
+  // not fuse an op into only some of its users, as that offers no benefit in
+  // terms of memory bandwidth, but forces us to keep more live values around.
+  for (auto* hlo : post_order) {
+    auto user_fusable_into_hlo = [this, &hlo](HloInstruction* consumer) {
+      if (!consumer->IsFusable()) {
+        return false;
+      }
+      for (int operand_number = 0;
+           operand_number < consumer->operands().size(); ++operand_number) {
+        if (consumer->operand(operand_number) == hlo) {
+          if (!ShouldFuse(consumer, operand_number)) {
+            return false;
+          }
+        }
+      }
+      return true;
+    };
+
+    // An "effectively unary" operation is one that has one "large"
+    // input with the others being negligible in terms of memory usage.
+    // We use "has a smaller true rank than the output" as a heuristic
+    // for "negligible" memory usage.
+    auto effectively_unary = [](HloInstruction* hlo) {
+      if (hlo->operands().size() == 1) {
+        return true;
+      }
+      auto output_rank = ShapeUtil::TrueRank(hlo->shape());
+      return std::count_if(
+                 hlo->operands().begin(), hlo->operands().end(),
+                 [output_rank](HloInstruction* operand) {
+                   return ((operand->opcode() != HloOpcode::kBroadcast) &&
+                           ShapeUtil::TrueRank(operand->shape()) >=
+                               output_rank);
+                 }) <= 1;
+    };
+
+    if (effectively_unary(hlo) ||
+        std::all_of(hlo->users().begin(), hlo->users().end(),
+                    user_fusable_into_hlo)) {
+      all_consumers_fusable.insert(hlo);
+    }
+  }
+
   tensorflow::gtl::FlatMap<HloInstruction*, int> post_order_index;
-  for (int i = 0; i < post_order.size(); ++i) {
+  for (size_t i = 0; i < post_order.size(); ++i) {
     InsertOrDie(&post_order_index, post_order[i], i);
   }
 
@@ -208,6 +262,12 @@ StatusOr<bool> InstructionFusion::Run(HloModule* module) {
 
     for (int64 i : sorted_operand_numbers) {
       HloInstruction* operand = instruction->mutable_operand(i);
+
+      if (FusionWouldDuplicate(*operand, *instruction) &&
+          (all_consumers_fusable.count(operand) == 0)) {
+        continue;
+      }
+
       if (operand->IsFusable() && ShouldFuse(instruction, i)) {
         HloInstruction* fusion_instruction = Fuse(operand, instruction);
 
@@ -260,8 +320,8 @@ bool InstructionFusion::ShouldFuse(HloInstruction* consumer,
                                    int64 operand_index) {
   HloInstruction* producer = consumer->mutable_operand(operand_index);
   // Cost condition: don't duplicate expensive instructions.
-  if (FusionWouldDuplicate(producer, consumer) &&
-      (IsExpensive(*producer) || !may_duplicate_)) {
+  if (FusionWouldDuplicate(*producer, *consumer) &&
+      (is_expensive_(*producer) || !may_duplicate_)) {
    return false;
   }
 
@@ -274,7 +334,7 @@ bool InstructionFusion::ShouldFuse(HloInstruction* consumer,
   // Cost condition: not fuse (expensive producers) and (consumers who reuse
   // operand elements).
   if (consumer->ReusesOperandElements(operand_index) &&
-      IsExpensive(*producer)) {
+      is_expensive_(*producer)) {
     return false;
   }
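With the cost predicate now injected through the constructor (see the header diff that follows), a backend can tighten or relax the duplication policy without subclassing. A hedged sketch of a stricter policy; the illustrative `kDivide` rule is not taken from this patch, and the call sites in the tests below simply pass `InstructionFusion::IsExpensive` through:

```cpp
#include <memory>

#include "tensorflow/compiler/xla/ptr_util.h"
#include "tensorflow/compiler/xla/service/instruction_fusion.h"

// Example policy: anything the default deems expensive, plus divides, and
// never duplicate producers.
std::unique_ptr<xla::InstructionFusion> MakeConservativeFusion() {
  auto is_expensive = [](const xla::HloInstruction& hlo) {
    return hlo.opcode() == xla::HloOpcode::kDivide ||
           xla::InstructionFusion::IsExpensive(hlo);
  };
  return xla::MakeUnique<xla::InstructionFusion>(is_expensive,
                                                 /*may_duplicate=*/false);
}
```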
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.h b/tensorflow/compiler/xla/service/instruction_fusion.h
index b8fd3dd4f37..a9f3723f2df 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.h
+++ b/tensorflow/compiler/xla/service/instruction_fusion.h
@@ -24,15 +24,6 @@ limitations under the License.
 
 namespace xla {
 
-// Returns true if the computation of the given instruction is significantly
-// more expensive than just writing all the values of the instructions' result
-// array. Expensive operations should not be duplicated.
-bool IsExpensive(const HloInstruction& instruction);
-
-// Returns true if fusing producer into consumer would cause producer to be
-// duplicated. This is the case if producer has uses other than consumer.
-bool FusionWouldDuplicate(HloInstruction* producer, HloInstruction* consumer);
-
 // HLO pass which performs instruction fusion. Instructions are fused
 // "vertically", meaning producing instructions are fused into their consumers
 // with the intent that the loops which compute their values will be fused in
@@ -40,15 +31,22 @@ bool FusionWouldDuplicate(HloInstruction* producer, HloInstruction* consumer);
 // instructions to fuse.
 class InstructionFusion : public HloPassInterface {
  public:
-  explicit InstructionFusion(bool may_duplicate = true)
-      : may_duplicate_(may_duplicate) {}
-  ~InstructionFusion() override {}
+  explicit InstructionFusion(
+      std::function<bool(const HloInstruction& instruction)> is_expensive,
+      bool may_duplicate = true)
+      : is_expensive_(is_expensive), may_duplicate_(may_duplicate) {}
+  ~InstructionFusion() override = default;
   tensorflow::StringPiece name() const override { return "fusion"; }
 
   // Run instruction fusion on the given computation. Returns whether the
   // computation was changed (instructions were fused).
   StatusOr<bool> Run(HloModule* module) override;
 
+  // Returns true if the computation of the given instruction is significantly
+  // more expensive than just writing all the values of the instruction's
+  // result array. Expensive operations will not be duplicated.
+  static bool IsExpensive(const HloInstruction& instruction);
+
  protected:
   // Returns whether the given producer instruction should be fused into the
   // given consumer instruction. producer is necessarily an operand of consumer.
@@ -74,6 +72,10 @@ class InstructionFusion : public HloPassInterface {
  private:
   HloInstruction* Fuse(HloInstruction* producer, HloInstruction* consumer);
 
+  // Used to determine if an HLO is expensive. Expensive operations will not be
+  // duplicated.
+  std::function<bool(const HloInstruction& instruction)> is_expensive_;
+
   // Returns whether we may duplicate an instruction if we want to fuse it.
   bool may_duplicate_;
diff --git a/tensorflow/compiler/xla/service/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/instruction_fusion_test.cc
index 2e3742ed75f..a2e6c2ae00b 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion_test.cc
@@ -15,8 +15,11 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/instruction_fusion.h" +#include "tensorflow/compiler/xla/service/hlo_matchers.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" +namespace op = xla::testing::opcode_matchers; + namespace xla { using InstructionFusionTest = HloTestBase; @@ -32,11 +35,13 @@ TEST_F(InstructionFusionTest, builder.AddInstruction(HloInstruction::CreateBroadcast( ShapeUtil::MakeShape(S32, {1}), exp1, {0})); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(broadcast2, computation->root_instruction()); EXPECT_TRUE( - InstructionFusion(/*may_duplicate=*/true).Run(module.get()).ValueOrDie()); + InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/true) + .Run(module.get()) + .ValueOrDie()); EXPECT_EQ(broadcast2, computation->root_instruction()); } @@ -51,12 +56,14 @@ TEST_F(InstructionFusionTest, builder.AddInstruction(HloInstruction::CreateBroadcast( ShapeUtil::MakeShape(S32, {1}), negate1, {0})); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(broadcast2, computation->root_instruction()); EXPECT_TRUE( - InstructionFusion(/*may_duplicate=*/true).Run(module.get()).ValueOrDie()); - EXPECT_EQ(HloOpcode::kFusion, computation->root_instruction()->opcode()); + InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/true) + .Run(module.get()) + .ValueOrDie()); + EXPECT_THAT(computation->root_instruction(), op::Fusion()); } TEST_F(InstructionFusionTest, @@ -69,12 +76,14 @@ TEST_F(InstructionFusionTest, HloInstruction* reshape2 = builder.AddInstruction( HloInstruction::CreateReshape(ShapeUtil::MakeShape(S32, {}), exp1)); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(reshape2, computation->root_instruction()); EXPECT_TRUE( - InstructionFusion(/*may_duplicate=*/true).Run(module.get()).ValueOrDie()); - EXPECT_EQ(HloOpcode::kFusion, computation->root_instruction()->opcode()); + InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/true) + .Run(module.get()) + .ValueOrDie()); + EXPECT_THAT(computation->root_instruction(), op::Fusion()); } TEST_F(InstructionFusionTest, @@ -87,12 +96,14 @@ TEST_F(InstructionFusionTest, HloInstruction* transpose2 = builder.AddInstruction( HloInstruction::CreateTranspose(ShapeUtil::MakeShape(S32, {}), exp1, {})); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(transpose2, computation->root_instruction()); EXPECT_TRUE( - InstructionFusion(/*may_duplicate=*/true).Run(module.get()).ValueOrDie()); - EXPECT_EQ(HloOpcode::kFusion, computation->root_instruction()->opcode()); + InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/true) + .Run(module.get()) + .ValueOrDie()); + EXPECT_THAT(computation->root_instruction(), op::Fusion()); } TEST_F(InstructionFusionTest, PotentialBitcastReshapeOfParameterUnfused) { @@ -102,11 +113,13 @@ TEST_F(InstructionFusionTest, PotentialBitcastReshapeOfParameterUnfused) { auto reshape1 = builder.AddInstruction( HloInstruction::CreateReshape(ShapeUtil::MakeShape(S32, {1, 1}), param0)); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(reshape1, 
computation->root_instruction()); EXPECT_FALSE( - InstructionFusion(/*may_duplicate=*/true).Run(module.get()).ValueOrDie()); + InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/true) + .Run(module.get()) + .ValueOrDie()); } TEST_F(InstructionFusionTest, PotentialBitcastSimpleReshapeOfParameterUnfused) { @@ -116,11 +129,13 @@ TEST_F(InstructionFusionTest, PotentialBitcastSimpleReshapeOfParameterUnfused) { auto reshape1 = builder.AddInstruction( HloInstruction::CreateReshape(ShapeUtil::MakeShape(S32, {1, 1}), param0)); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(reshape1, computation->root_instruction()); EXPECT_FALSE( - InstructionFusion(/*may_duplicate=*/true).Run(module.get()).ValueOrDie()); + InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/true) + .Run(module.get()) + .ValueOrDie()); } TEST_F(InstructionFusionTest, PotentialBitcastTransposeOfParameterUnfused) { @@ -130,11 +145,82 @@ TEST_F(InstructionFusionTest, PotentialBitcastTransposeOfParameterUnfused) { auto transpose1 = builder.AddInstruction(HloInstruction::CreateTranspose( ShapeUtil::MakeShape(S32, {}), param0, {})); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_EQ(transpose1, computation->root_instruction()); EXPECT_FALSE( - InstructionFusion(/*may_duplicate=*/true).Run(module.get()).ValueOrDie()); + InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/true) + .Run(module.get()) + .ValueOrDie()); +} + +TEST_F(InstructionFusionTest, AvoidDuplicationIfNotAllFusable) { + HloComputation::Builder builder(TestName()); + auto shape = ShapeUtil::MakeShape(F32, {16, 16}); + auto param0 = + builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "0")); + auto param1 = + builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "1")); + HloInstruction* binary1 = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param0, param1)); + builder.AddInstruction(HloInstruction::CreateSend(binary1, 0)); + HloInstruction* unary = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kAbs, binary1)); + + auto module = CreateNewModule(); + auto computation = module->AddEntryComputation(builder.Build()); + EXPECT_EQ(unary, computation->root_instruction()); + EXPECT_FALSE( + InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/true) + .Run(module.get()) + .ValueOrDie()); +} + +TEST_F(InstructionFusionTest, AllowUnaryDuplication) { + HloComputation::Builder builder(TestName()); + auto shape = ShapeUtil::MakeShape(F32, {16, 16}); + auto param0 = + builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "0")); + HloInstruction* unary1 = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kFloor, param0)); + builder.AddInstruction(HloInstruction::CreateSend(unary1, 0)); + HloInstruction* unary2 = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kAbs, unary1)); + + auto module = CreateNewModule(); + auto computation = module->AddEntryComputation(builder.Build()); + EXPECT_EQ(unary2, computation->root_instruction()); + EXPECT_TRUE( + InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/true) + .Run(module.get()) + .ValueOrDie()); +} + +TEST_F(InstructionFusionTest, AllowEffectiveUnaryDuplication) { + auto shape = ShapeUtil::MakeShape(F32, {16, 16}); + auto 
small_shape = ShapeUtil::MakeShape(F32, {16}); + HloComputation::Builder builder(TestName()); + auto param0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, small_shape, "0")); + auto param1 = + builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "1")); + HloInstruction* binary1 = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param0, param1)); + builder.AddInstruction(HloInstruction::CreateSend(binary1, 0)); + HloInstruction* unary = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kAbs, binary1)); + + auto module = CreateNewModule(); + auto computation = module->AddEntryComputation(builder.Build()); + EXPECT_EQ(unary, computation->root_instruction()); + EXPECT_TRUE( + InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/true) + .Run(module.get()) + .ValueOrDie()); } } // namespace xla + +int main(int argc, char** argv) { + return xla::ParseDebugOptionsFlagsAndRunTests(argc, argv); +} diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc index a350acc4dae..e9e199226a6 100644 --- a/tensorflow/compiler/xla/service/layout_assignment.cc +++ b/tensorflow/compiler/xla/service/layout_assignment.cc @@ -60,8 +60,9 @@ std::ostream& operator<<(std::ostream& out, } BufferLayoutConstraint::BufferLayoutConstraint(const Layout& layout, - const LogicalBuffer& buffer) - : layout_(layout), buffer_(&buffer) { + const LogicalBuffer& buffer, + bool mandatory) + : LayoutConstraint(mandatory), layout_(layout), buffer_(&buffer) { CHECK(LayoutUtil::ValidateLayoutForShape(layout, buffer.shape()).ok()); } @@ -73,8 +74,9 @@ string BufferLayoutConstraint::ToString() const { OperandLayoutConstraint::OperandLayoutConstraint( const ShapeLayout& shape_layout, const HloInstruction* instruction, - int64 operand_no) - : shape_layout_(shape_layout), + int64 operand_no, bool mandatory) + : LayoutConstraint(mandatory), + shape_layout_(shape_layout), instruction_(instruction), operand_no_(operand_no) { CHECK(shape_layout_.LayoutIsSet()); @@ -100,7 +102,9 @@ LayoutConstraints::LayoutConstraints( : points_to_analysis_(points_to_analysis), computation_(computation) { // Gather all array-shaped logical buffers into unconstrained_buffer_ids. for (auto& buffer : points_to_analysis_.logical_buffers()) { - if (buffer->IsArray()) { + // The points to analysis is computed per module, restrict constraints to + // array buffers in this computation. 
+ if (buffer->IsArray() && buffer->instruction()->parent() == computation) { unconstrained_buffer_ids_.insert(buffer->id()); } } @@ -115,15 +119,17 @@ bool LayoutConstraints::OperandBufferForwarded( auto operand_buffers = points_to_analysis_.GetPointsToSet(instruction->operand(operand_no)) .CreateFlattenedSet(); - std::vector intersection; - std::set_intersection(output_buffers.begin(), output_buffers.end(), - operand_buffers.begin(), operand_buffers.end(), - std::back_inserter(intersection)); - return !intersection.empty(); + for (const LogicalBuffer* output_buffer : output_buffers) { + if (operand_buffers.count(output_buffer) > 0) { + return true; + } + } + return false; } Status LayoutConstraints::SetBufferLayout(const Layout& layout, - const LogicalBuffer& buffer) { + const LogicalBuffer& buffer, + bool mandatory) { VLOG(3) << "SetBufferLayout : " << buffer << " : " << LayoutUtil::HumanString(layout); @@ -138,26 +144,38 @@ Status LayoutConstraints::SetBufferLayout(const Layout& layout, TF_RETURN_IF_ERROR( LayoutUtil::ValidateLayoutForShape(layout, buffer.shape())); - const Layout* curr_layout = BufferLayout(buffer); - if (curr_layout != nullptr) { - if (!LayoutUtil::Equal(*curr_layout, layout)) { + const BufferLayoutConstraint* curr_constraint = + GetBufferLayoutConstraint(buffer); + if (curr_constraint != nullptr) { + if (LayoutUtil::Equal(curr_constraint->layout(), layout)) { + // New constraint matches existing constraint. Nothing to do. + return Status::OK(); + } + if (curr_constraint->mandatory()) { return FailedPrecondition( "Buffer %s already has the layout constraint %s, cannot add " "incompatible constraint %s", buffer.ToString().c_str(), - LayoutUtil::HumanString(*curr_layout).c_str(), + LayoutUtil::HumanString(curr_constraint->layout()).c_str(), LayoutUtil::HumanString(layout).c_str()); } - // New constraint matches existing constraint. Nothing to do. - return Status::OK(); } - auto new_constraint_it = buffer_constraints_.insert( - {&buffer, BufferLayoutConstraint(layout, buffer)}); - added_constraints_.push_back(&new_constraint_it.first->second); + auto iter = buffer_constraints_.find(&buffer); + bool overwrite = iter != buffer_constraints_.end(); + if (!overwrite) { + iter = buffer_constraints_ + .insert(std::make_pair( + &buffer, BufferLayoutConstraint(layout, buffer, mandatory))) + .first; + } else { + iter->second = BufferLayoutConstraint(layout, buffer, /*mandatory=*/true); + } + added_constraints_.push_back(&iter->second); // Remove buffer from the set of unconstrained buffers. 
- TF_RET_CHECK(unconstrained_buffer_ids_.count(buffer.id()) == 1); + TF_RET_CHECK(unconstrained_buffer_ids_.count(buffer.id()) == + static_cast(!overwrite)); unconstrained_buffer_ids_.erase(buffer.id()); return Status::OK(); @@ -165,23 +183,27 @@ Status LayoutConstraints::SetBufferLayout(const Layout& layout, Status LayoutConstraints::SetOperandLayout(const Shape& shape_with_layout, const HloInstruction* instruction, - int64 operand_no) { + int64 operand_no, bool mandatory) { VLOG(3) << "SetOperandLayout : " << instruction->name() << ", operand " << operand_no << " : " << ShapeUtil::HumanStringWithLayout(shape_with_layout); - const ShapeLayout* curr_shape_layout = OperandLayout(instruction, operand_no); + const OperandLayoutConstraint* curr_shape_layout = + GetOperandLayoutConstraint(instruction, operand_no); if (curr_shape_layout != nullptr) { - if (!curr_shape_layout->MatchesLayoutInShape(shape_with_layout)) { + if (curr_shape_layout->shape_layout().MatchesLayoutInShape( + shape_with_layout)) { + // New constraint matches existing constraint. Nothing to do. + return Status::OK(); + } + if (curr_shape_layout->mandatory()) { return FailedPrecondition( "Operand %lld of instruction %s already has a layout constraint " "%s, cannot add incompatible constraint %s", operand_no, instruction->name().c_str(), - curr_shape_layout->ToString().c_str(), + curr_shape_layout->shape_layout().ToString().c_str(), ShapeUtil::HumanStringWithLayout(shape_with_layout).c_str()); } - // New constraint matches existing constraint. Nothing to do. - return Status::OK(); } // If any buffers in the operand occur in the output of the instruction, then @@ -195,22 +217,31 @@ Status LayoutConstraints::SetOperandLayout(const Shape& shape_with_layout, } auto key = std::make_pair(instruction, operand_no); - auto new_constraint_it = operand_constraints_.insert( - {key, OperandLayoutConstraint(ShapeLayout(shape_with_layout), instruction, - operand_no)}); - added_constraints_.push_back(&new_constraint_it.first->second); + auto iter = operand_constraints_.find(key); + if (iter == operand_constraints_.end()) { + auto pair = std::make_pair( + key, OperandLayoutConstraint(ShapeLayout(shape_with_layout), + instruction, operand_no, mandatory)); + iter = operand_constraints_.insert(pair).first; + } else { + iter->second = + OperandLayoutConstraint(ShapeLayout(shape_with_layout), instruction, + operand_no, /*mandatory=*/true); + } + added_constraints_.push_back(&iter->second); return Status::OK(); } Status LayoutConstraints::SetArrayOperandLayout( - const Layout& layout, const HloInstruction* instruction, int64 operand_no) { + const Layout& layout, const HloInstruction* instruction, int64 operand_no, + bool mandatory) { const HloInstruction* operand = instruction->operand(operand_no); TF_RET_CHECK(ShapeUtil::IsArray(operand->shape())); Shape shape(operand->shape()); *shape.mutable_layout() = layout; TF_RETURN_IF_ERROR(LayoutUtil::ValidateLayoutInShape(shape)); - return SetOperandLayout(shape, instruction, operand_no); + return SetOperandLayout(shape, instruction, operand_no, mandatory); } Status LayoutConstraints::SetResultLayout(const Shape& shape_with_layout) { @@ -252,7 +283,7 @@ Status LayoutConstraints::SetInstructionLayout( // Create a BufferLayoutConstraint for each array shape in the output of the // instruction. 
- return ShapeUtil::ForEachSubshape( + return ShapeUtil::ForEachSubshapeWithStatus( shape_with_layout, [this, instruction](const Shape& subshape, const ShapeIndex& index) -> Status { @@ -273,15 +304,29 @@ Status LayoutConstraints::SetInstructionLayout( const Layout* LayoutConstraints::BufferLayout( const LogicalBuffer& buffer) const { + if (const auto* constraint = GetBufferLayoutConstraint(buffer)) { + return &constraint->layout(); + } + return nullptr; +} +const BufferLayoutConstraint* LayoutConstraints::GetBufferLayoutConstraint( + const LogicalBuffer& buffer) const { auto it = buffer_constraints_.find(&buffer); - return it == buffer_constraints_.end() ? nullptr : &it->second.layout(); + return it == buffer_constraints_.end() ? nullptr : &it->second; } const ShapeLayout* LayoutConstraints::OperandLayout( const HloInstruction* instruction, int64 operand_no) const { + if (const auto* constraint = + GetOperandLayoutConstraint(instruction, operand_no)) { + return &constraint->shape_layout(); + } + return nullptr; +} +const OperandLayoutConstraint* LayoutConstraints::GetOperandLayoutConstraint( + const HloInstruction* instruction, int64 operand_no) const { auto it = operand_constraints_.find(std::make_pair(instruction, operand_no)); - return it == operand_constraints_.end() ? nullptr - : &it->second.shape_layout(); + return it == operand_constraints_.end() ? nullptr : &it->second; } const ShapeLayout* LayoutConstraints::ResultLayout() const { @@ -298,8 +343,8 @@ string LayoutConstraints::ToString() const { for (int64 i = 0; i < instruction->operand_count(); ++i) { if (OperandLayout(instruction, i) != nullptr) { tensorflow::strings::StrAppend( - &output, " operand (", i, "): ", - OperandLayout(instruction, i)->ToString(), "\n"); + &output, " operand (", i, + "): ", OperandLayout(instruction, i)->ToString(), "\n"); } } for (const LogicalBuffer* buffer : @@ -338,6 +383,12 @@ Status LayoutAssignment::AddMandatoryConstraints( // TODO(b/31425034): Change infeeds to be more like parameters, with // shapes in the ComputationLayout. shape_with_layout = &instruction->shape(); + } else if (instruction->opcode() == HloOpcode::kOutfeed) { + // Constrain the input to the Outfeed instruction to be the expected + // layout of the Outfeed. + TF_RETURN_IF_ERROR(constraints->SetOperandLayout( + instruction->outfeed_shape(), instruction.get(), 0, + /*mandatory=*/true)); } else if (instruction->opcode() == HloOpcode::kParameter) { // Parameter layouts must match the respective layout in // ComputationLayout. @@ -369,7 +420,7 @@ Status LayoutAssignment::AddMandatoryConstraints( for (int64 i = 0; i < instruction->operand_count(); ++i) { TF_RETURN_IF_ERROR(constraints->SetOperandLayout( called_computation_layout.parameter_layout(i).shape(), - instruction.get(), i)); + instruction.get(), i, /*mandatory=*/true)); } } else if (instruction->opcode() == HloOpcode::kWhile) { // Layout of input and output of kWhile instruction must be equal and must @@ -420,7 +471,8 @@ Status LayoutAssignment::AddMandatoryConstraints( TF_RETURN_IF_ERROR(constraints->SetInstructionLayout( body_layout.result_shape(), instruction.get())); TF_RETURN_IF_ERROR(constraints->SetOperandLayout( - body_layout.result_shape(), instruction.get(), 0)); + body_layout.result_shape(), instruction.get(), 0, + /*mandatory=*/true)); } else if (instruction->opcode() == HloOpcode::kCustomCall) { // Add constraints for kCustomCall instruction operands and instructions. // For now we only support row major layouts for all inputs and outputs. 
@@ -444,7 +496,7 @@ Status LayoutAssignment::AddMandatoryConstraints(
       Shape row_major_operand_shape(row_major_shape(operand_shape));
 
       TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
-          row_major_operand_shape, instruction.get(), i));
+          row_major_operand_shape, instruction.get(), i, /*mandatory=*/true));
     }
   }
 }
@@ -566,11 +618,11 @@ Status CheckLayouts(
       // which could be the source of the subshape value.
       const PointsToSet& points_to_set =
           points_to_analysis->GetPointsToSet(instruction.get());
-      TF_RETURN_IF_ERROR(points_to_set.ForEachElement(
+      TF_RETURN_IF_ERROR(points_to_set.ForEachElementWithStatus(
          [&instruction](
-              ShapeIndex index, bool is_leaf,
+              ShapeIndex index,
               const std::vector<const LogicalBuffer*>& buffers) -> Status {
-            if (is_leaf) {
+            if (ShapeUtil::IsLeafIndex(instruction->shape(), index)) {
               const Shape& instruction_subshape =
                   ShapeUtil::GetSubshape(instruction->shape(), index);
               for (const LogicalBuffer* buffer : buffers) {
@@ -653,44 +705,6 @@ LayoutAssignment::LayoutAssignment(ComputationLayout* entry_computation_layout)
   }
 }
 
-namespace {
-
-// Given a permutation of `{0, 1, ..., n}` `indices`, returns a permutation of
-// `{0, 1, ..., n - to_delete.size() + to_insert.size()}` by deleting the
-// indices `to_delete` wherever in `indices` they are, and inserting the indices
-// `to_insert` arbitrarily at the back.
-tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>
-DeleteAndInsertIndices(
-    std::vector<int64> to_delete, std::vector<int64> to_insert,
-    tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64> indices) {
-  std::sort(to_delete.begin(), to_delete.end(), std::greater<int64>());
-  std::sort(to_insert.begin(), to_insert.end(), std::less<int64>());
-  for (auto index : to_delete) {
-    auto i = indices.begin();
-    while (i != indices.end()) {
-      if (*i == index) {
-        i = indices.erase(i);
-      } else {
-        if (*i > index) {
-          (*i)--;
-        }
-        ++i;
-      }
-    }
-  }
-  for (auto index : to_insert) {
-    for (auto i = indices.begin(); i != indices.end(); ++i) {
-      if (*i >= index) {
-        (*i)++;
-      }
-    }
-    indices.Add(index);
-  }
-  return indices;
-}
-
-}  // namespace
-
 std::unique_ptr<Layout> LayoutAssignment::ChooseOperandLayoutFromOutputLayout(
     const Layout& output_layout, const HloInstruction* instruction,
     int64 operand_no) {
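The next two hunks replace the delete-and-insert-indices trick with a two-step search: try the row-major layout first, then fall back to `ShapeUtil::AlignLayouts`. A condensed sketch of that decision order (hypothetical helper mirroring the added code; array shapes assumed, error handling elided):

```cpp
#include "tensorflow/compiler/xla/layout_util.h"
#include "tensorflow/compiler/xla/ptr_util.h"
#include "tensorflow/compiler/xla/shape_util.h"

namespace xla {

// Sketch only: choose an operand layout that makes a reshape a bitcast.
std::unique_ptr<Layout> PickBitcastFriendlyOperandLayout(
    const Shape& operand_shape, const Shape& output_shape_with_layout) {
  // Step 1: if the output layout is row major (monotonic dim-0 major), try
  // the row-major operand layout; a bitcast reshape then needs no copy.
  if (LayoutUtil::IsMonotonicWithDim0Major(
          output_shape_with_layout.layout())) {
    Shape row_major = ShapeUtil::MakeShapeWithMonotonicDim0MajorLayout(
        operand_shape.element_type(),
        AsInt64Slice(operand_shape.dimensions()));
    if (ShapeUtil::ReshapeIsBitcast(row_major, output_shape_with_layout)) {
      return MakeUnique<Layout>(row_major.layout());
    }
  }
  // Step 2: otherwise ask AlignLayouts for any operand layout that makes the
  // reshape a bitcast.
  auto aligned =
      ShapeUtil::AlignLayouts(output_shape_with_layout, operand_shape);
  if (aligned) {
    return MakeUnique<Layout>(aligned.value().layout());
  }
  return nullptr;  // No bitcast-friendly layout; the caller falls back.
}

}  // namespace xla
```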
@@ -713,21 +727,32 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOperandLayoutFromOutputLayout(
   }
 
   if (instruction->opcode() == HloOpcode::kReshape) {
-    // Pick the operand layout that makes the reshape a bitcast. If the reshape
-    // only inserts or deletes degenerate dimensions, we can easily compute the
-    // desired layout by accordingly inserting and deleting the elements in the
-    // minor-to-major list.
-    bool merely_inserts_or_deletes_1_sized_dims;
-    std::vector<int64> inserted_indices, deleted_indices;
-    std::tie(merely_inserts_or_deletes_1_sized_dims, deleted_indices,
-             inserted_indices) =
-        instruction->ReshapeMerelyInsertsOrDeletes1SizedDimensions();
-    if (merely_inserts_or_deletes_1_sized_dims) {
-      Layout operand_layout = LayoutUtil::MakeLayout(
-          AsInt64Slice(DeleteAndInsertIndices(inserted_indices, deleted_indices,
-                                              output_layout.minor_to_major())));
+    // Prefer the operand layout that makes the reshape a bitcast. If any
+    // dimension bound is 1 in the operand shape, there may be several such
+    // layouts. So if 'output_layout' is a MajorToMinor layout, check whether
+    // the reshape is a bitcast when using the same layout. This may avoid
+    // copy operations.
+    const Shape& output_shape = instruction->shape();
+    Shape output_shape_with_layout = ShapeUtil::MakeShapeWithLayout(
+        output_shape.element_type(), AsInt64Slice(output_shape.dimensions()),
+        AsInt64Slice(output_layout.minor_to_major()));
+    const Shape& operand_shape = operand->shape();
+    if (LayoutUtil::IsMonotonicWithDim0Major(output_layout)) {
+      Shape operand_shape_with_layout =
+          ShapeUtil::MakeShapeWithMonotonicDim0MajorLayout(
+              operand_shape.element_type(),
+              AsInt64Slice(operand_shape.dimensions()));
+      if (ShapeUtil::ReshapeIsBitcast(operand_shape_with_layout,
+                                      output_shape_with_layout)) {
+        return MakeUnique<Layout>(operand_shape_with_layout.layout());
+      }
+    }
+    auto aligned_operand_shape =
+        ShapeUtil::AlignLayouts(output_shape_with_layout, operand_shape);
+    if (aligned_operand_shape) {
+      auto operand_layout = aligned_operand_shape.value().layout();
       TF_CHECK_OK(
-          LayoutUtil::ValidateLayoutForShape(operand_layout, operand->shape()));
+          LayoutUtil::ValidateLayoutForShape(operand_layout, operand_shape));
       return MakeUnique<Layout>(operand_layout);
     }
   }
@@ -762,18 +787,32 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOutputLayoutFromOperandLayout(
   }
 
   if (user->opcode() == HloOpcode::kReshape) {
-    // Pick the user layout that makes the reshape a bitcast.
-    bool merely_inserts_or_deletes_1_sized_dims;
-    std::vector<int64> inserted_indices, deleted_indices;
-    std::tie(merely_inserts_or_deletes_1_sized_dims, deleted_indices,
-             inserted_indices) =
-        user->ReshapeMerelyInsertsOrDeletes1SizedDimensions();
-    if (merely_inserts_or_deletes_1_sized_dims) {
-      Layout user_layout = LayoutUtil::MakeLayout(AsInt64Slice(
-          DeleteAndInsertIndices(deleted_indices, inserted_indices,
-                                 operand_layout.minor_to_major())));
+    // Prefer the user layout that makes the reshape a bitcast. If any
+    // dimension bound is 1 in the user shape, there may be several such
+    // layouts. So if 'operand_layout' is a MajorToMinor layout, check whether
+    // the reshape is a bitcast when using the same layout. This may avoid
+    // copy operations.
+    Shape operand_shape_with_layout = ShapeUtil::MakeShapeWithLayout(
+        operand->shape().element_type(),
+        AsInt64Slice(operand->shape().dimensions()),
+        AsInt64Slice(operand_layout.minor_to_major()));
+    const Shape& output_shape = user->shape();
+    if (LayoutUtil::IsMonotonicWithDim0Major(operand_layout)) {
+      Shape output_shape_with_layout =
+          ShapeUtil::MakeShapeWithMonotonicDim0MajorLayout(
+              output_shape.element_type(),
+              AsInt64Slice(output_shape.dimensions()));
+      if (ShapeUtil::ReshapeIsBitcast(output_shape_with_layout,
+                                      operand_shape_with_layout)) {
+        return MakeUnique<Layout>(output_shape_with_layout.layout());
+      }
+    }
+    auto aligned_user_shape =
+        ShapeUtil::AlignLayouts(operand_shape_with_layout, output_shape);
+    if (aligned_user_shape) {
+      auto user_layout = aligned_user_shape.value().layout();
       TF_CHECK_OK(
-          LayoutUtil::ValidateLayoutForShape(user_layout, user->shape()));
+          LayoutUtil::ValidateLayoutForShape(user_layout, output_shape));
       return MakeUnique<Layout>(user_layout);
     }
   }
@@ -877,11 +916,11 @@ Status LayoutAssignment::PropagateUseConstraintToDefs(
   // match the given layout.
const PointsToSet& points_to_set = constraints->points_to_analysis().GetPointsToSet(instruction); - return points_to_set.ForEachElement( + return points_to_set.ForEachElementWithStatus( [this, &shape_layout, constraints]( - const ShapeIndex& index, bool is_leaf, + const ShapeIndex& index, const std::vector& buffers) -> Status { - if (is_leaf) { + if (ShapeUtil::IsLeafIndex(shape_layout.shape(), index)) { for (const LogicalBuffer* buffer : buffers) { if (constraints->BufferLayout(*buffer) == nullptr && ShapeUtil::IsArray(buffer->shape())) { @@ -930,7 +969,8 @@ Status LayoutAssignment::PropagateOperandConstraint( operand_constraint.shape_layout().layout(), user, operand_constraint.operand_no()); if (layout != nullptr) { - TF_RETURN_IF_ERROR(constraints->SetBufferLayout(*layout, *buffer)); + TF_RETURN_IF_ERROR( + constraints->SetBufferLayout(*layout, *buffer, /*mandatory=*/false)); } } return Status::OK(); @@ -960,11 +1000,19 @@ Status LayoutAssignment::PropagateBufferConstraint( instruction, operand_no); if (operand_layout != nullptr) { TF_RETURN_IF_ERROR(constraints->SetArrayOperandLayout( - *operand_layout, instruction, operand_no)); + *operand_layout, instruction, operand_no, /*mandatory=*/true)); } } } } + return PropagateBufferConstraintToUses(buffer_constraint, constraints); +} + +Status LayoutAssignment::PropagateBufferConstraintToUses( + const BufferLayoutConstraint& buffer_constraint, + LayoutConstraints* constraints) { + const LogicalBuffer& buffer = buffer_constraint.buffer(); + TF_RET_CHECK(buffer.IsArray()); // Propagate the layout to all array uses of the logical buffer. This skips // uses of the buffer where the buffer is the element of a tuple. @@ -977,7 +1025,7 @@ Status LayoutAssignment::PropagateBufferConstraint( if (constraints->OperandLayout(user, operand_no) == nullptr && !constraints->OperandBufferForwarded(user, operand_no)) { TF_RETURN_IF_ERROR(constraints->SetArrayOperandLayout( - buffer_constraint.layout(), user, operand_no)); + buffer_constraint.layout(), user, operand_no, /*mandatory=*/false)); } } @@ -1034,7 +1082,7 @@ StatusOr InferArrayLayout( *first_buffer_layout)) { // The points-to set is ambiguous for this index and the different source // buffers have different layouts. This case is possible in valid XLA - // computations because we do not propagate BufferLayoutConstaints to all + // computations because we do not propagate BufferLayoutConstraints to all // LogicalBuffers which may alias the constrained LogicalBuffer at some // point in the computation. return FailedPrecondition( @@ -1197,7 +1245,7 @@ Status LayoutAssignment::AssignLayouts(const LayoutConstraints& constraints, // Any remaining layouts in the output of the instruction must be // inferrable using points-to analysis. - TF_RETURN_IF_ERROR(ShapeUtil::ForEachMutableSubshape( + TF_RETURN_IF_ERROR(ShapeUtil::ForEachMutableSubshapeWithStatus( instruction->mutable_shape(), [instruction, &constraints](Shape* subshape, const ShapeIndex& index) { if (subshape->has_layout() || !ShapeUtil::IsArray(*subshape)) { @@ -1217,6 +1265,9 @@ Status LayoutAssignment::AssignLayouts(const LayoutConstraints& constraints, TF_RETURN_IF_ERROR(SetFusionLayouts(instruction)); } + // Execute extra verification step once the layout has been finalized. + TF_RETURN_IF_ERROR(Verify(instruction)); + // Verify all layouts in the shape have been set. 
TF_RET_CHECK(LayoutUtil::HasLayout(instruction->shape())); } @@ -1247,7 +1298,7 @@ Status LayoutAssignment::RunOnComputation( TF_ASSIGN_OR_RETURN(auto points_to_analysis, TuplePointsToAnalysis::Run(computation->parent())); - // Construct LayoutConstaints with all layout constraints of the computation. + // Construct LayoutConstraints with all layout constraints of the computation. LayoutConstraints constraints(*points_to_analysis, computation); // Add constraints required for correctness on all backends (eg, entry @@ -1272,7 +1323,8 @@ Status LayoutAssignment::RunOnComputation( const LogicalBuffer& buffer = points_to_analysis->GetBuffer( *constraints.unconstrained_buffer_ids().begin()); TF_RETURN_IF_ERROR(constraints.SetBufferLayout( - LayoutUtil::GetDefaultLayoutForShape(buffer.shape()), buffer)); + LayoutUtil::GetDefaultLayoutForShape(buffer.shape()), buffer, + /*mandatory=*/false)); TF_RETURN_IF_ERROR(PropagateConstraints(&constraints)); diff --git a/tensorflow/compiler/xla/service/layout_assignment.h b/tensorflow/compiler/xla/service/layout_assignment.h index 61dc7b12075..ccfc17da4c4 100644 --- a/tensorflow/compiler/xla/service/layout_assignment.h +++ b/tensorflow/compiler/xla/service/layout_assignment.h @@ -46,10 +46,16 @@ namespace xla { // gathered together in LayoutConstraints object. class LayoutConstraint { public: - LayoutConstraint() = default; + LayoutConstraint(bool mandatory) : mandatory_(mandatory) {} virtual ~LayoutConstraint() = default; virtual string ToString() const = 0; + + // True if this constraint cannot be overwritten by a different constraint. + bool mandatory() const { return mandatory_; } + + private: + bool mandatory_; }; std::ostream& operator<<(std::ostream& out, const LayoutConstraint& constraint); @@ -58,7 +64,8 @@ std::ostream& operator<<(std::ostream& out, const LayoutConstraint& constraint); // array produced by a particular instruction. 
class BufferLayoutConstraint : public LayoutConstraint { public: - BufferLayoutConstraint(const Layout& layout, const LogicalBuffer& buffer); + BufferLayoutConstraint(const Layout& layout, const LogicalBuffer& buffer, + bool mandatory); const LogicalBuffer& buffer() const { return *buffer_; } const Layout& layout() const { return layout_; } @@ -66,7 +73,7 @@ class BufferLayoutConstraint : public LayoutConstraint { string ToString() const override; private: - const Layout layout_; + Layout layout_; const LogicalBuffer* buffer_; }; @@ -78,7 +85,8 @@ class BufferLayoutConstraint : public LayoutConstraint { class OperandLayoutConstraint : public LayoutConstraint { public: OperandLayoutConstraint(const ShapeLayout& shape_layout, - const HloInstruction* instruction, int64 operand_no); + const HloInstruction* instruction, int64 operand_no, + bool mandatory); const ShapeLayout& shape_layout() const { return shape_layout_; } const HloInstruction* instruction() const { return instruction_; } @@ -90,7 +98,7 @@ class OperandLayoutConstraint : public LayoutConstraint { string ToString() const override; private: - const ShapeLayout shape_layout_; + ShapeLayout shape_layout_; const HloInstruction* instruction_; int64 operand_no_; }; @@ -99,7 +107,7 @@ class OperandLayoutConstraint : public LayoutConstraint { class ResultLayoutConstraint : public LayoutConstraint { public: explicit ResultLayoutConstraint(const ShapeLayout& shape_layout) - : shape_layout_(shape_layout) {} + : LayoutConstraint(/*mandatory=*/true), shape_layout_(shape_layout) {} const ShapeLayout& shape_layout() const { return shape_layout_; } string ToString() const override; @@ -124,8 +132,7 @@ class LayoutConstraints { // Return a vector containing the constraints which have been added to the // LayoutConstraints object since the construction of the object or since the // last time ConsumeAddedConstraints() has been called. This is used to - // identify - // newly added constraints when propagating layouts. + // identify newly added constraints when propagating layouts. std::vector ConsumeAddedConstraints() { std::vector ret_vec(std::move(added_constraints_)); added_constraints_.clear(); @@ -137,23 +144,29 @@ class LayoutConstraints { // instruction, or the layout of the result of the computation, respectively, // if it has been constrained. Otherwise return nullptr. const Layout* BufferLayout(const LogicalBuffer& buffer) const; + const BufferLayoutConstraint* GetBufferLayoutConstraint( + const LogicalBuffer& buffer) const; const ShapeLayout* OperandLayout(const HloInstruction* instruction, int64 operand_no) const; + const OperandLayoutConstraint* GetOperandLayoutConstraint( + const HloInstruction* instruction, int64 operand_no) const; const ShapeLayout* ResultLayout() const; // Add a constraint on the layout of a LogicalBuffer, the layout of the // operand of the instruction, or the layout of the result of the computation, // respectively. - Status SetBufferLayout(const Layout& layout, const LogicalBuffer& buffer); + Status SetBufferLayout(const Layout& layout, const LogicalBuffer& buffer, + bool mandatory = true); Status SetOperandLayout(const Shape& shape_with_layout, - const HloInstruction* instruction, int64 operand_no); + const HloInstruction* instruction, int64 operand_no, + bool mandatory = true); Status SetResultLayout(const Shape& shape_with_layout); // Convenience wrapper around SetOperandLayout for setting the layout of a // operand using a Layout object. The operand must be array-shaped. 
Status SetArrayOperandLayout(const Layout& layout, const HloInstruction* instruction, - int64 operand_no); + int64 operand_no, bool mandatory = true); // Convenience wrapper around SetBufferLayout. Sets the layouts of all buffers // created by the instruction to the layouts in the given shape. The @@ -233,6 +246,39 @@ class LayoutAssignment : public HloPassInterface { const ResultLayoutConstraint& layout_constraint, LayoutConstraints* constraints); + // Called after layouts of an instruction have been finalized to allow + // subclasses to check for platform specific assumptions. + virtual Status Verify(const HloInstruction* instruction) { + return Status::OK(); + } + + // Propagates a buffer layout constraint into the operands that use it. + Status PropagateBufferConstraintToUses( + const BufferLayoutConstraint& layout_constraint, + LayoutConstraints* constraints); + + // Propagates a layout constraint on the use of the result of the given + // instruction to the definitions of the LogicalBuffers which make up the + // result. + Status PropagateUseConstraintToDefs(const ShapeLayout& shape_layout, + const HloInstruction* instruction, + LayoutConstraints* constraints); + + // Chooses a layout of operand `operand_no` of `instruction` that minimizes + // the cost of `instruction`. `output_layout` is the layout of `instruction`. + // Returns null if it can't decide the best layout. + // Precondition: `instruction` and the operand are array-shaped. + std::unique_ptr ChooseOperandLayoutFromOutputLayout( + const Layout& output_layout, const HloInstruction* instruction, + int64 operand_no); + // Given the layout of `user`'s `operand_no`-th operand, chooses a layout of + // `user` that minimizes its cost on that operand. Returns null if it can't + // decide the best layout. + // Precondition: `user` and the operand are array-shaped. + std::unique_ptr ChooseOutputLayoutFromOperandLayout( + const Layout& operand_layout, const HloInstruction* user, + int64 operand_no); + private: // Adds constraints which must be satisfied for correctness on all // backends. Called once prior to propagating constraints. @@ -267,28 +313,6 @@ class LayoutAssignment : public HloPassInterface { // required for correctness. Status PropagateConstraints(LayoutConstraints* constraints); - // Propagates a layout constraint on the use of the result of the given - // instruction to the definitions of the LogicalBuffers which make up the - // result. - Status PropagateUseConstraintToDefs(const ShapeLayout& shape_layout, - const HloInstruction* instruction, - LayoutConstraints* constraints); - - // Chooses a layout of operand `operand_no` of `instruction` that minimizes - // the cost of `instruction`. `output_layout` is the layout of `instruction`. - // Returns null if it can't decide the best layout. - // Precondition: `instruction` and the operand are array-shaped. - std::unique_ptr ChooseOperandLayoutFromOutputLayout( - const Layout& output_layout, const HloInstruction* instruction, - int64 operand_no); - // Given the layout of `user`'s `operand_no`-th operand, chooses a layout of - // `user` that minimizes its cost on that operand. Returns null if it can't - // decide the best layout. - // Precondition: `user` and the operand are array-shaped. 
- std::unique_ptr ChooseOutputLayoutFromOperandLayout( - const Layout& operand_layout, const HloInstruction* user, - int64 operand_no); - ComputationLayout* entry_computation_layout_; // Map containing the layouts of all computations assigned so diff --git a/tensorflow/compiler/xla/service/layout_assignment_test.cc b/tensorflow/compiler/xla/service/layout_assignment_test.cc index 6361907b0e4..6d818cdea0c 100644 --- a/tensorflow/compiler/xla/service/layout_assignment_test.cc +++ b/tensorflow/compiler/xla/service/layout_assignment_test.cc @@ -26,10 +26,12 @@ limitations under the License. #include "tensorflow/compiler/xla/service/computation_layout.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_matchers.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/shape_layout.h" #include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/tests/test_utils.h" @@ -38,9 +40,13 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/gtl/array_slice.h" +namespace op = xla::testing::opcode_matchers; + namespace xla { namespace { +using ::testing::ElementsAre; + class LayoutAssignmentTest : public HloTestBase { protected: void AssignLayouts(HloModule* module, @@ -63,8 +69,8 @@ TEST_F(LayoutAssignmentTest, ComputationLayout) { HloInstruction::CreateParameter(1, ashape, "param1")); auto add = builder.AddInstruction( HloInstruction::CreateBinary(ashape, HloOpcode::kAdd, param0, param1)); - HloModule module(TestName()); - HloComputation* computation = module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + HloComputation* computation = module->AddEntryComputation(builder.Build()); Layout layout = LayoutUtil::MakeLayout(minor_to_major); Shape shape(ashape); @@ -75,7 +81,7 @@ TEST_F(LayoutAssignmentTest, ComputationLayout) { *computation_layout.mutable_parameter_layout(0) = shape_layout; *computation_layout.mutable_parameter_layout(1) = shape_layout; *computation_layout.mutable_result_layout() = shape_layout; - AssignLayouts(&module, &computation_layout); + AssignLayouts(module.get(), &computation_layout); EXPECT_TRUE(LayoutUtil::Equal(layout, param0->shape().layout())); EXPECT_TRUE(LayoutUtil::Equal(layout, param1->shape().layout())); EXPECT_TRUE(LayoutUtil::Equal(layout, add->shape().layout())); @@ -93,8 +99,8 @@ TEST_F(LayoutAssignmentTest, ComputationLayoutMixedLayout) { HloInstruction::CreateParameter(1, ashape, "param1")); builder.AddInstruction( HloInstruction::CreateBinary(ashape, HloOpcode::kAdd, param0, param1)); - HloModule module(TestName()); - HloComputation* computation = module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + HloComputation* computation = module->AddEntryComputation(builder.Build()); Layout col_major_layout = LayoutUtil::MakeLayout({1, 0}); Shape col_major_shape(ashape); @@ -111,7 +117,7 @@ TEST_F(LayoutAssignmentTest, ComputationLayoutMixedLayout) { *computation_layout.mutable_parameter_layout(1) = row_major; *computation_layout.mutable_result_layout() = col_major; - AssignLayouts(&module, &computation_layout); + AssignLayouts(module.get(), &computation_layout); 
EXPECT_TRUE(LayoutUtil::Equal(col_major_layout, param0->shape().layout())); EXPECT_TRUE(LayoutUtil::Equal(row_major_layout, param1->shape().layout())); EXPECT_TRUE(LayoutUtil::Equal( @@ -142,8 +148,8 @@ TEST_F(LayoutAssignmentTest, FusionInstruction) { auto negate2 = builder.AddInstruction( HloInstruction::CreateUnary(ashape, HloOpcode::kNegate, negate1)); - HloModule module(TestName()); - HloComputation* computation = module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + HloComputation* computation = module->AddEntryComputation(builder.Build()); auto fusion = computation->CreateFusionInstruction( {negate2, negate1, add}, HloInstruction::FusionKind::kLoop); @@ -156,7 +162,7 @@ TEST_F(LayoutAssignmentTest, FusionInstruction) { ComputationLayout computation_layout(computation->ComputeProgramShape()); *computation_layout.mutable_result_layout() = shape_layout; - AssignLayouts(&module, &computation_layout); + AssignLayouts(module.get(), &computation_layout); EXPECT_TRUE(LayoutUtil::Equal( layout, fusion->fused_parameter(0)->shape().layout())); @@ -191,13 +197,13 @@ TEST_F(LayoutAssignmentTest, TupleLayout) { auto negate = builder.AddInstruction(HloInstruction::CreateUnary( constant0->shape(), HloOpcode::kNegate, get_element0)); - HloModule module(TestName()); - module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); ComputationLayout computation_layout( - module.entry_computation()->ComputeProgramShape()); + module->entry_computation()->ComputeProgramShape()); - AssignLayouts(&module, &computation_layout); + AssignLayouts(module.get(), &computation_layout); EXPECT_FALSE( LayoutUtil::LayoutsInShapesEqual(constant0->shape(), constant1->shape())); @@ -229,17 +235,17 @@ TEST_F(LayoutAssignmentTest, TupleSelect) { auto select = builder.AddInstruction(HloInstruction::CreateTernary( tuple0->shape(), HloOpcode::kSelect, pred, tuple0, tuple1)); - HloModule module(TestName()); - module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); ComputationLayout computation_layout( - module.entry_computation()->ComputeProgramShape()); + module->entry_computation()->ComputeProgramShape()); Shape result_shape = ShapeUtil::MakeTupleShape({constant0->shape(), constant1->shape()}); TF_CHECK_OK(computation_layout.mutable_result_layout()->CopyLayoutFromShape( result_shape)); - AssignLayouts(&module, &computation_layout); + AssignLayouts(module.get(), &computation_layout); EXPECT_TRUE(LayoutUtil::LayoutsInShapesEqual(result_shape, select->shape())); } @@ -264,11 +270,11 @@ TEST_F(LayoutAssignmentTest, ConflictingLayoutTuple) { auto nested_tuple = builder.AddInstruction( HloInstruction::CreateTuple({inner_tuple, inner_tuple})); - HloModule module(TestName()); - module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); ComputationLayout computation_layout( - module.entry_computation()->ComputeProgramShape()); + module->entry_computation()->ComputeProgramShape()); Shape result_shape = nested_tuple->shape(); *ShapeUtil::GetMutableSubshape(&result_shape, /*index=*/{0, 0}) = ShapeUtil::MakeShapeWithLayout(F32, {2, 2}, {1, 0}); @@ -278,7 +284,7 @@ TEST_F(LayoutAssignmentTest, ConflictingLayoutTuple) { result_shape)); LayoutAssignment layout_assignment(&computation_layout); - AssignLayouts(&module, &computation_layout); + AssignLayouts(module.get(), &computation_layout); // Layout assignment 
should have deep copied the result of the computation to // address the layout conflict. This results in several Tuple() and @@ -294,9 +300,9 @@ TEST_F(LayoutAssignmentTest, ConflictingLayoutTuple) { EXPECT_TRUE( AlgebraicSimplifier(/*is_layout_sensitive=*/true, [](const Shape&, const Shape&) { return false; }) - .Run(&module) + .Run(module.get()) .ValueOrDie()); - HloInstruction* root = module.entry_computation()->root_instruction(); + HloInstruction* root = module->entry_computation()->root_instruction(); // Verify layout of the root and the root's operands. EXPECT_TRUE(ShapeUtil::Equal(result_shape, root->shape())); EXPECT_TRUE(ShapeUtil::Equal(ShapeUtil::GetSubshape(result_shape, {0}), @@ -304,18 +310,16 @@ TEST_F(LayoutAssignmentTest, ConflictingLayoutTuple) { EXPECT_TRUE(ShapeUtil::Equal(ShapeUtil::GetSubshape(result_shape, {1}), root->operand(1)->shape())); - // Verify some of the structure of the HLO graph. - EXPECT_EQ(constant, root->operand(0)->operand(0)); - EXPECT_EQ(HloOpcode::kCopy, root->operand(1)->operand(0)->opcode()); - EXPECT_EQ(HloOpcode::kConstant, - root->operand(1)->operand(0)->operand(0)->opcode()); + // Verify the structure of the HLO graph. + EXPECT_THAT(root, + op::Tuple(op::Tuple(constant), op::Tuple(op::Copy(constant)))); } TEST_F(LayoutAssignmentTest, ElementwiseAndReshape) { // param -> log -> reshape -> tanh auto builder = HloComputation::Builder(TestName()); Shape ashape = ShapeUtil::MakeShape(F32, {1, 2, 3, 1}); - Shape bshape = ShapeUtil::MakeShape(F32, {2, 1, 3}); + Shape bshape = ShapeUtil::MakeShape(F32, {3, 1, 2}); auto param = builder.AddInstruction( HloInstruction::CreateParameter(0, ashape, "param")); auto log = builder.AddInstruction( @@ -325,28 +329,29 @@ TEST_F(LayoutAssignmentTest, ElementwiseAndReshape) { auto tanh = builder.AddInstruction( HloInstruction::CreateUnary(bshape, HloOpcode::kTanh, reshape)); - HloModule module(TestName()); - HloComputation* computation = module.AddEntryComputation(builder.Build(tanh)); + auto module = CreateNewModule(); + HloComputation* computation = + module->AddEntryComputation(builder.Build(tanh)); Shape ashape_with_layout(ashape); Shape bshape_with_layout(bshape); - *ashape_with_layout.mutable_layout() = LayoutUtil::MakeLayout({0, 1, 2, 3}); - *bshape_with_layout.mutable_layout() = LayoutUtil::MakeLayout({0, 1, 2}); + *ashape_with_layout.mutable_layout() = LayoutUtil::MakeLayout({0, 2, 1, 3}); + *bshape_with_layout.mutable_layout() = LayoutUtil::MakeLayout({2, 1, 0}); ComputationLayout computation_layout(computation->ComputeProgramShape()); *computation_layout.mutable_parameter_layout(0) = ShapeLayout(ashape_with_layout); *computation_layout.mutable_result_layout() = ShapeLayout(bshape_with_layout); - AssignLayouts(&module, &computation_layout); + AssignLayouts(module.get(), &computation_layout); auto log_minor_to_major = AsInt64Slice(log->shape().layout().minor_to_major()); - EXPECT_LT(PositionInContainer(log_minor_to_major, 1), + EXPECT_GT(PositionInContainer(log_minor_to_major, 1), PositionInContainer(log_minor_to_major, 2)); auto reshape_minor_to_major = AsInt64Slice(reshape->shape().layout().minor_to_major()); - EXPECT_LT(PositionInContainer(reshape_minor_to_major, 0), + EXPECT_GT(PositionInContainer(reshape_minor_to_major, 0), PositionInContainer(reshape_minor_to_major, 2)); } @@ -366,8 +371,8 @@ TEST_F(LayoutAssignmentTest, ElementwiseAndTranspose) { HloInstruction::CreateTranspose(bshape, log, {1, 0})); auto tanh = builder.AddInstruction( HloInstruction::CreateUnary(bshape, HloOpcode::kTanh, 
transpose)); - HloModule module(TestName()); - auto computation = module.AddEntryComputation(builder.Build(tanh)); + auto module = CreateNewModule(); + auto computation = module->AddEntryComputation(builder.Build(tanh)); Shape ashape_with_layout(ashape); Shape bshape_with_layout(bshape); @@ -378,7 +383,7 @@ TEST_F(LayoutAssignmentTest, ElementwiseAndTranspose) { *computation_layout.mutable_parameter_layout(0) = ShapeLayout(ashape_with_layout); *computation_layout.mutable_result_layout() = ShapeLayout(bshape_with_layout); - AssignLayouts(&module, &computation_layout); + AssignLayouts(module.get(), &computation_layout); EXPECT_TRUE( LayoutUtil::Equal(ashape_with_layout.layout(), log->shape().layout())); @@ -402,9 +407,9 @@ TEST_F(LayoutAssignmentTest, BroadcastAndTranspose) { HloInstruction::CreateBroadcast(bshape, param, {1, 2})); auto transpose = builder.AddInstruction( HloInstruction::CreateTranspose(cshape, broadcast, {2, 1, 0})); - HloModule module(TestName()); + auto module = CreateNewModule(); HloComputation* computation = - module.AddEntryComputation(builder.Build(transpose)); + module->AddEntryComputation(builder.Build(transpose)); Shape input_shape_with_layout(ashape); Shape output_shape_with_layout(cshape); @@ -417,10 +422,10 @@ TEST_F(LayoutAssignmentTest, BroadcastAndTranspose) { ShapeLayout(input_shape_with_layout); *computation_layout.mutable_result_layout() = ShapeLayout(output_shape_with_layout); - AssignLayouts(&module, &computation_layout); + AssignLayouts(module.get(), &computation_layout); - EXPECT_TRUE(ContainersEqual(broadcast->shape().layout().minor_to_major(), - tensorflow::gtl::ArraySlice<int64>{0, 1, 2})); + EXPECT_THAT(broadcast->shape().layout().minor_to_major(), + ElementsAre(0, 1, 2)); } TEST_F(LayoutAssignmentTest, ReshapeOperandHasMultipleUsers) { @@ -451,9 +456,9 @@ TEST_F(LayoutAssignmentTest, ReshapeOperandHasMultipleUsers) { HloInstruction::CreateBroadcast(f32_234, tanh, {2})); auto tuple = builder.AddInstruction( HloInstruction::CreateTuple({transpose, broadcast2})); - HloModule module(TestName()); + auto module = CreateNewModule(); HloComputation* computation = - module.AddEntryComputation(builder.Build(tuple)); + module->AddEntryComputation(builder.Build(tuple)); ComputationLayout computation_layout(computation->ComputeProgramShape()); Shape param_shape_with_layout(f32_4); @@ -470,17 +475,86 @@ TEST_F(LayoutAssignmentTest, ReshapeOperandHasMultipleUsers) { *computation_layout.mutable_result_layout() = ShapeLayout(ShapeUtil::MakeTupleShape( {transpose_shape_with_layout, broadcast2_shape_with_layout})); - AssignLayouts(&module, &computation_layout); + AssignLayouts(module.get(), &computation_layout); - EXPECT_TRUE(ContainersEqual(broadcast->shape().layout().minor_to_major(), - tensorflow::gtl::ArraySlice<int64>{0, 1})); - EXPECT_TRUE(ContainersEqual(transpose->shape().layout().minor_to_major(), - tensorflow::gtl::ArraySlice<int64>{1, 0})); - EXPECT_TRUE(ContainersEqual(tanh->shape().layout().minor_to_major(), - tensorflow::gtl::ArraySlice<int64>{0, 1})); + EXPECT_THAT(broadcast->shape().layout().minor_to_major(), ElementsAre(0, 1)); + EXPECT_THAT(transpose->shape().layout().minor_to_major(), ElementsAre(1, 0)); + EXPECT_THAT(tanh->shape().layout().minor_to_major(), ElementsAre(0, 1)); } -// Add test which fails due to copy tuple.
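+// A test-only LayoutAssignment subclass: PropagateBufferConstraint is +// overridden so that every operand of the same rank as the output is forced +// to take the layout of the output buffer, letting the test below observe a +// mandatory operand constraint being satisfied through an inserted copy.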
+class OperandsMustBeTheSameLayoutAssignment : public LayoutAssignment { + public: + explicit OperandsMustBeTheSameLayoutAssignment( + ComputationLayout* entry_computation_layout) + : LayoutAssignment(entry_computation_layout) {} + + protected: + Status PropagateBufferConstraint( + const BufferLayoutConstraint& buffer_constraint, + LayoutConstraints* constraints) override { + const LogicalBuffer& buffer = buffer_constraint.buffer(); + const HloInstruction* instruction = buffer.instruction(); + + // Force the operands' layout to the output layout. + for (int64 operand_no = 0; operand_no < instruction->operand_count(); + ++operand_no) { + const HloInstruction* operand = instruction->operand(operand_no); + if (ShapeUtil::Rank(instruction->shape()) != + ShapeUtil::Rank(operand->shape())) { + continue; + } + TF_RETURN_IF_ERROR(constraints->SetArrayOperandLayout( + buffer_constraint.layout(), instruction, operand_no, + /*mandatory=*/true)); + } + return PropagateBufferConstraintToUses(buffer_constraint, constraints); + } +}; + +TEST_F(LayoutAssignmentTest, MakeOperandsTheSame) { + // param0 -> concatenate -> reshape + // param1 -^ + auto builder = HloComputation::Builder(TestName()); + Shape ashape = ShapeUtil::MakeShape(F32, {50, 1}); + Shape bshape = ShapeUtil::MakeShape(F32, {50, 2}); + Shape cshape = ShapeUtil::MakeShape(F32, {100}); + auto param0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, ashape, "param")); + auto param1 = builder.AddInstruction( + HloInstruction::CreateParameter(1, ashape, "param")); + auto concatenate = builder.AddInstruction( + HloInstruction::CreateConcatenate(bshape, {param0, param1}, 1)); + auto reshape = builder.AddInstruction( + HloInstruction::CreateReshape(cshape, concatenate)); + auto module = CreateNewModule(); + HloComputation* computation = + module->AddEntryComputation(builder.Build(reshape)); + + Shape param0_shape_with_layout(ashape); + Shape param1_shape_with_layout(ashape); + *param0_shape_with_layout.mutable_layout() = LayoutUtil::MakeLayout({0, 1}); + *param1_shape_with_layout.mutable_layout() = LayoutUtil::MakeLayout({1, 0}); + + ComputationLayout computation_layout(computation->ComputeProgramShape()); + *computation_layout.mutable_parameter_layout(0) = + ShapeLayout(param0_shape_with_layout); + *computation_layout.mutable_parameter_layout(1) = + ShapeLayout(param1_shape_with_layout); + OperandsMustBeTheSameLayoutAssignment layout_assignment(&computation_layout); + EXPECT_IS_OK(layout_assignment.Run(module.get()).status()); + + EXPECT_EQ(HloOpcode::kCopy, concatenate->operand(0)->opcode()); + EXPECT_THAT(concatenate->operand(0)->shape().layout().minor_to_major(), + ElementsAre(1, 0)); + EXPECT_THAT(concatenate->operand(1)->shape().layout().minor_to_major(), + ElementsAre(1, 0)); + EXPECT_THAT(concatenate->shape().layout().minor_to_major(), + ElementsAre(1, 0)); +} } // namespace } // namespace xla + +int main(int argc, char** argv) { + return xla::ParseDebugOptionsFlagsAndRunTests(argc, argv); +} diff --git a/tensorflow/compiler/xla/service/liveness_util.cc b/tensorflow/compiler/xla/service/liveness_util.cc new file mode 100644 index 00000000000..682bf19807b --- /dev/null +++ b/tensorflow/compiler/xla/service/liveness_util.cc @@ -0,0 +1,225 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/liveness_util.h" + +#include <algorithm> +#include <utility> +#include <vector> + +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/logical_buffer.h" +#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/util.h" + +namespace xla { + +bool DoesNotUseOperandBuffer(const HloInstruction* operand, + const ShapeIndex& index, + const HloInstruction* user) { + CHECK(user->IsUserOf(operand)) + << "user: " << user->ToString() << " operand: " << operand->ToString(); + + // GetTupleElement instructions only access the top-level buffer of their + // operand. + return (user->opcode() == HloOpcode::kGetTupleElement && !index.empty()); +} + +bool DoesNotUseOperandBuffer(const HloInstruction* operand, + const ShapeIndex& index, + const HloInstruction* user, + const TuplePointsToAnalysis& points_to_analysis) { + CHECK(user->IsUserOf(operand)) + << "user: " << user->ToString() << " operand: " << operand->ToString(); + if (user->opcode() == HloOpcode::kGetTupleElement && !index.empty()) { + // GetTupleElement instructions only access the top-level buffer of their + // operand. + return true; + } else if (user->opcode() == HloOpcode::kFusion && + user->fusion_kind() == HloInstruction::FusionKind::kLoop) { + // Find fusion parameter associated with 'operand'. + auto it = std::find_if( + user->fused_parameters().begin(), user->fused_parameters().end(), + [=](HloInstruction* fused_param) { + return user->operand(fused_param->parameter_number()) == operand; + }); + CHECK(it != user->fused_parameters().end()); + // Iterate through all users of all buffer aliases of the buffer in the + // points-to set of fusion parameter at 'index'. + // Returns false if any uses are detected at 'index'; returns true otherwise. + const LogicalBuffer* buffer = + points_to_analysis.GetBufferDefinedAt(*it, index).ValueOrDie(); + for (const BufferAlias& alias : + points_to_analysis.GetBufferAliases(*buffer)) { + for (HloInstruction* alias_user : alias.instruction()->users()) { + if (DoesNotUseOperandBuffer(alias.instruction(), alias.index(), + alias_user, points_to_analysis)) { + continue; + } + // Return false: use detected at 'buffer' -> 'alias' -> 'alias_user'. + return false; + } + } + // Return true: found no uses of 'operand' at 'index' in 'user'. + return true; + } + return false; +} + +namespace { + +// Returns all uses of all aliases of 'instruction' at 'index' in 'uses'. +// Each use in 'uses' is a pair (HloInstruction* user, int64 operand_index) +// where 'user' is a user of an alias of 'instruction' at 'index', and +// 'operand_index' is the operand index at which the alias appears in the +// operand list of 'user'.
+std::vector<std::pair<HloInstruction*, int64>> GetAllUsesOfInstructionAtIndex( + HloInstruction* instruction, const ShapeIndex& index, + const TuplePointsToAnalysis& points_to_analysis) { + std::vector<std::pair<HloInstruction*, int64>> uses; + const std::vector<const LogicalBuffer*>& points_to = + points_to_analysis.GetPointsToSet(instruction).element(index); + for (const LogicalBuffer* buffer : points_to) { + for (const BufferAlias& alias : + points_to_analysis.GetBufferAliases(*buffer)) { + for (HloInstruction* alias_user : alias.instruction()->users()) { + if (DoesNotUseOperandBuffer(alias.instruction(), alias.index(), + alias_user, points_to_analysis)) { + continue; + } + for (int64 op_idx : alias_user->OperandIndices(alias.instruction())) { + uses.emplace_back(alias_user, op_idx); + } + } + } + } + return uses; +} + +// Returns true if there is exactly one use of 'operand' at 'operand_index' +// in 'fusion.fused_instructions', where the singleton use is the fused +// root at operand index 'use_operand_index'. Returns false otherwise. +// +// REQUIRES: 'fusion' is a kFusion instruction. +bool HasUniqueFusedUseOfOperandAt( + HloInstruction* operand, const ShapeIndex& operand_index, + HloInstruction* fusion, const int64 use_operand_index, + const TuplePointsToAnalysis& points_to_analysis) { + CHECK_EQ(HloOpcode::kFusion, fusion->opcode()); + // Check that 'operand' is unique in the operand list of 'fusion'. + if (fusion->OperandIndices(operand).size() > 1) { + return false; + } + // Find fusion parameter associated with 'operand'. + const auto& fused_params = fusion->fused_parameters(); + auto fused_param_it = std::find_if( + fused_params.begin(), fused_params.end(), + [&](HloInstruction* fused_param) { + return fusion->operand(fused_param->parameter_number()) == operand; + }); + if (fused_param_it == fused_params.end()) { + return false; + } + auto* fused_param = *fused_param_it; + // Get all uses of 'operand' at 'index' from 'fusion.fused_instructions'. + auto fused_param_uses = GetAllUsesOfInstructionAtIndex( + fused_param, operand_index, points_to_analysis); + // Return true iff there is exactly one use of 'operand' at 'index', and + // this singleton use is the fused root (at operand index + // 'use_operand_index'). + return fused_param_uses.size() == 1 && + fused_param_uses[0].first == fusion->fused_expression_root() && + fused_param_uses[0].second == use_operand_index; +} + +} // namespace + +// User and operand can share buffers iff both instructions emit the same shape +// and layout, and 'user' meets one of the following qualifications: +// *) Is element-wise. Or... +// *) Is a loop fusion instruction where the only use of 'operand' at 'index' +// in the set 'user.fused_instructions' is a DynamicUpdateSlice fused root +// at operand 0. Or... +// *) Is a kDot -> kAdd (or fused kTransposeDot -> kAdd) output fusion +// instruction where the only use of 'operand' at 'index' in the set +// 'user.fused_instructions' is a kAdd fused root at operand 0 or 1. Or... +// *) The 'user' of 'operand' is DynamicUpdateSlice or While at operand index 0. +bool CanShareOperandBufferWithUser( + HloInstruction* operand, const ShapeIndex& operand_index, + HloInstruction* user, const ShapeIndex& user_index, + const TuplePointsToAnalysis& points_to_analysis) { + CHECK(user->IsUserOf(operand)) + << "user: " << user->ToString() << " operand: " << operand->ToString(); + Shape operand_subshape = + ShapeUtil::GetSubshape(operand->shape(), operand_index); + Shape user_subshape = ShapeUtil::GetSubshape(user->shape(), user_index); + // Check that operand and user emit the same shape and layout.
+ if (!ShapeUtil::Equal(operand_subshape, user_subshape)) { + return false; + } + if (user->opcode() == HloOpcode::kFusion) { + if (user->fusion_kind() == HloInstruction::FusionKind::kLoop && + user->fused_expression_root()->opcode() == + HloOpcode::kDynamicUpdateSlice) { + // Loop fusion with kDynamicUpdateSlice fused root. + // + // Returns true iff there is exactly one use of 'operand' at shape index + // 'operand_index', and this singleton use is the fused root at operand + // index 0. + return HasUniqueFusedUseOfOperandAt(operand, operand_index, user, 0, + points_to_analysis); + } else if (user->fusion_kind() == HloInstruction::FusionKind::kOutput && + user->fused_expression_root()->opcode() == HloOpcode::kAdd) { + // Output fusion with kAdd fused root. + + // Check if one operand of kAdd fused root is either kDot, or nested + // kFusion of kind kTransposeDot. + auto* add = user->fused_expression_root(); + auto add_operand_it = + std::find_if(add->operands().begin(), add->operands().end(), + [&](HloInstruction* operand) { + return operand->opcode() == HloOpcode::kDot || + (operand->opcode() == HloOpcode::kFusion && + operand->fusion_kind() == + HloInstruction::FusionKind::kTransposeDot); + }); + if (add_operand_it == add->operands().end()) { + return false; + } + auto* matched_add_operand = *add_operand_it; + // Calculate operand index of 'add' operand which was not matched above. + const int64 other_add_operand_index = + matched_add_operand == add->operand(0) ? 1 : 0; + // Returns true iff there is exactly one use of 'operand' at shape index + // 'operand_index', and this singleton use is the fused root (at operand + // index 'other_add_operand_index'). + return HasUniqueFusedUseOfOperandAt(operand, operand_index, user, + other_add_operand_index, + points_to_analysis); + } + } + if (user->opcode() == HloOpcode::kDynamicUpdateSlice || + user->opcode() == HloOpcode::kWhile) { + // We eliminated other users in BufferLiveness::live_range_strictly_before, + // so here we just need to check that the use is at operand index 0. + std::vector<int64> operand_indices = user->OperandIndices(operand); + return operand_indices.size() == 1 && operand_indices[0] == 0; + } + // Check if 'user' is element-wise. + return user->IsElementwise(); +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/liveness_util.h b/tensorflow/compiler/xla/service/liveness_util.h new file mode 100644 index 00000000000..0b01223db73 --- /dev/null +++ b/tensorflow/compiler/xla/service/liveness_util.h @@ -0,0 +1,55 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// A collection of utilities on the HLO graph.
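+// Currently: predicates used by buffer liveness and buffer assignment to +// decide whether an instruction reads one of its operands' buffers, and +// whether it may share a buffer with its user.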
+ +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_LIVENESS_UTIL_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_LIVENESS_UTIL_H_ + +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/types.h" + +namespace xla { + +// Returns true if 'user' cannot possibly use the buffer at 'index' in +// 'operand'. Returns false otherwise. +// +// REQUIRES: 'operand' is an operand of 'user'. +bool DoesNotUseOperandBuffer(const HloInstruction* operand, + const ShapeIndex& index, + const HloInstruction* user, + const TuplePointsToAnalysis& points_to_analysis); + +// Overload which does not require points-to analysis. The result is more +// conservative (returns false more often). +bool DoesNotUseOperandBuffer(const HloInstruction* operand, + const ShapeIndex& index, + const HloInstruction* user); + +// Returns true if 'user' (at 'user_index') can share a buffer with its operand +// 'operand' (at 'operand_index'). +// Returns false otherwise. +// +// REQUIRES: 'operand' is an operand of 'user'. +bool CanShareOperandBufferWithUser( + HloInstruction* operand, const ShapeIndex& operand_index, + HloInstruction* user, const ShapeIndex& user_index, + const TuplePointsToAnalysis& points_to_analysis); + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_LIVENESS_UTIL_H_ diff --git a/tensorflow/compiler/xla/service/liveness_util_test.cc b/tensorflow/compiler/xla/service/liveness_util_test.cc new file mode 100644 index 00000000000..bad4be149a6 --- /dev/null +++ b/tensorflow/compiler/xla/service/liveness_util_test.cc @@ -0,0 +1,372 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/liveness_util.h" + +#include <memory> + +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" + +namespace xla { +namespace { + +class PointsToAnalysisTestBase : public HloTestBase { + protected: + void BuildModule(std::unique_ptr<HloComputation> computation) { + module_ = CreateNewModule(); + computation_ = module_->AddEntryComputation(std::move(computation)); + } + + void RunAnalysis() { + CHECK_NOTNULL(module_.get()); + points_to_analysis_ = + TuplePointsToAnalysis::Run(module_.get()).ConsumeValueOrDie(); + } + + void BuildModuleAndRunAnalysis(std::unique_ptr<HloComputation> computation) { + BuildModule(std::move(computation)); + RunAnalysis(); + } + + std::unique_ptr<HloModule> module_; + HloComputation* computation_ = nullptr; + std::unique_ptr<TuplePointsToAnalysis> points_to_analysis_; +}; + +class DoesNotUseOperandBufferTest : public PointsToAnalysisTestBase {}; + +TEST_F(DoesNotUseOperandBufferTest, GetTupleElement) { + auto builder = HloComputation::Builder(TestName()); + + Shape elem_shape = ShapeUtil::MakeShape(F32, {8}); + auto tuple = builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeTupleShape({elem_shape, elem_shape}), "tuple")); + auto gte0 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(elem_shape, tuple, 0)); + auto gte1 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(elem_shape, tuple, 1)); + builder.AddInstruction( + HloInstruction::CreateBinary(elem_shape, HloOpcode::kAdd, gte0, gte1)); + + BuildModuleAndRunAnalysis(builder.Build()); + + // GetTupleElement instructions only access the top-level buffer of their + // operand. + EXPECT_TRUE(DoesNotUseOperandBuffer(tuple, {0}, gte0, *points_to_analysis_)); + EXPECT_TRUE(DoesNotUseOperandBuffer(tuple, {1}, gte1, *points_to_analysis_)); + EXPECT_FALSE(DoesNotUseOperandBuffer(tuple, {}, gte0, *points_to_analysis_)); + EXPECT_FALSE(DoesNotUseOperandBuffer(tuple, {}, gte1, *points_to_analysis_)); +} + +TEST_F(DoesNotUseOperandBufferTest, FusedDynamicUpdateSlice) { + auto builder = HloComputation::Builder(TestName()); + + Shape data_shape = ShapeUtil::MakeShape(F32, {8}); + auto tuple = builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeTupleShape({data_shape, data_shape}), "tuple")); + auto gte0 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(data_shape, tuple, 0)); + auto gte1 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(data_shape, tuple, 1)); + + // Create a DynamicUpdateSlice instruction of tuple element 1. + auto starts = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR1<uint32>({2}))); + auto update = builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR1<float>({2.f, 2.f, 2.f}))); + auto dynamic_update_slice = + builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice( + data_shape, gte1, update, starts)); + builder.AddInstruction( + HloInstruction::CreateTuple({gte0, dynamic_update_slice})); + + BuildModule(builder.Build()); + auto fusion = computation_->CreateFusionInstruction( + {dynamic_update_slice, starts, update, gte1}, + HloInstruction::FusionKind::kLoop); + RunAnalysis(); + + // The fusion instruction never uses tuple element 0, but does use element 1.
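+ // (Only gte1 was fused, so the fusion reads tuple element 1 through its + // fused parameter; element 0 is consumed by the unfused gte0.)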
+ EXPECT_TRUE( + DoesNotUseOperandBuffer(tuple, {0}, fusion, *points_to_analysis_)); + EXPECT_FALSE( + DoesNotUseOperandBuffer(tuple, {1}, fusion, *points_to_analysis_)); +} + +class CanShareOperandBufferWithUserTest : public PointsToAnalysisTestBase {}; + +TEST_F(CanShareOperandBufferWithUserTest, ElementWiseSameShape) { + auto builder = HloComputation::Builder(TestName()); + + Shape shape = ShapeUtil::MakeShape(F32, {8}); + auto param = builder.AddInstruction( + HloInstruction::CreateParameter(0, shape, "param")); + auto exp = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kExp, param)); + auto log = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kLog, exp)); + + BuildModuleAndRunAnalysis(builder.Build()); + + EXPECT_TRUE( + CanShareOperandBufferWithUser(param, {}, exp, {}, *points_to_analysis_)); + EXPECT_TRUE( + CanShareOperandBufferWithUser(exp, {}, log, {}, *points_to_analysis_)); +} + +TEST_F(CanShareOperandBufferWithUserTest, ElementWiseDifferentShape) { + auto builder = HloComputation::Builder(TestName()); + + Shape in_shape = ShapeUtil::MakeShape(F32, {8}); + Shape out_shape = ShapeUtil::MakeShape(PRED, {8}); + auto param0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, in_shape, "param0")); + auto param1 = builder.AddInstruction( + HloInstruction::CreateParameter(1, in_shape, "param1")); + auto result = builder.AddInstruction( + HloInstruction::CreateBinary(out_shape, HloOpcode::kEq, param0, param1)); + + BuildModuleAndRunAnalysis(builder.Build()); + + EXPECT_FALSE(CanShareOperandBufferWithUser(param0, {}, result, {}, + *points_to_analysis_)); + EXPECT_FALSE(CanShareOperandBufferWithUser(param1, {}, result, {}, + *points_to_analysis_)); +} + +TEST_F(CanShareOperandBufferWithUserTest, CopyShares) { + auto builder = HloComputation::Builder(TestName()); + + Shape shape = ShapeUtil::MakeShape(F32, {8}); + auto param = builder.AddInstruction( + HloInstruction::CreateParameter(0, shape, "param")); + auto exp = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kExp, param)); + auto copy = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kCopy, exp)); + + BuildModuleAndRunAnalysis(builder.Build()); + + EXPECT_TRUE( + CanShareOperandBufferWithUser(param, {}, exp, {}, *points_to_analysis_)); + EXPECT_TRUE( + CanShareOperandBufferWithUser(exp, {}, copy, {}, *points_to_analysis_)); +} + +TEST_F(CanShareOperandBufferWithUserTest, FusedDynamicUpdateSlice) { + auto builder = HloComputation::Builder(TestName()); + + Shape data_shape = ShapeUtil::MakeShape(F32, {8}); + auto tuple = builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeTupleShape({data_shape, data_shape}), "tuple")); + auto gte0 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(data_shape, tuple, 0)); + auto gte1 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(data_shape, tuple, 1)); + + // Create a DynamicUpdateSlice instruction of tuple element 1. 
+ auto starts = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR1<uint32>({2}))); + auto update = builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR1<float>({2.f, 2.f, 2.f}))); + auto dynamic_update_slice = + builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice( + data_shape, gte1, update, starts)); + builder.AddInstruction( + HloInstruction::CreateTuple({gte0, dynamic_update_slice})); + + BuildModule(builder.Build()); + auto fusion = computation_->CreateFusionInstruction( + {dynamic_update_slice, starts, update, gte1}, + HloInstruction::FusionKind::kLoop); + RunAnalysis(); + + // The fusion instruction can share with tuple element 1. + EXPECT_FALSE(CanShareOperandBufferWithUser(tuple, {0}, fusion, {}, + *points_to_analysis_)); + EXPECT_TRUE(CanShareOperandBufferWithUser(tuple, {1}, fusion, {}, + *points_to_analysis_)); +} + +TEST_F(CanShareOperandBufferWithUserTest, DynamicUpdateSliceCanShare) { + auto builder = HloComputation::Builder(TestName()); + + Shape data_shape = ShapeUtil::MakeShape(F32, {8}); + Shape update_shape = ShapeUtil::MakeShape(F32, {4}); + Shape starts_shape = ShapeUtil::MakeShape(S32, {1}); + auto data = builder.AddInstruction( + HloInstruction::CreateParameter(0, data_shape, "data")); + auto update = builder.AddInstruction( + HloInstruction::CreateParameter(1, update_shape, "update")); + auto starts = builder.AddInstruction( + HloInstruction::CreateParameter(2, starts_shape, "starts")); + auto dus = builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice( + data_shape, data, update, starts)); + + BuildModuleAndRunAnalysis(builder.Build()); + + // The DynamicUpdateSlice instruction can share with the data operand, but not + // with update or starts. + EXPECT_TRUE( + CanShareOperandBufferWithUser(data, {}, dus, {}, *points_to_analysis_)); + EXPECT_FALSE( + CanShareOperandBufferWithUser(update, {}, dus, {}, *points_to_analysis_)); + EXPECT_FALSE( + CanShareOperandBufferWithUser(starts, {}, dus, {}, *points_to_analysis_)); +} + +TEST_F(CanShareOperandBufferWithUserTest, FusedDotAdd) { + auto builder = HloComputation::Builder(TestName()); + Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2}); + + auto a = builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR2<float>({{1.0, 0.0}, {0.0, 1.0}}))); + auto b = builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR2<float>({{2.0, 2.0}, {2.0, 2.0}}))); + + auto dot = builder.AddInstruction( + HloInstruction::CreateBinary(data_shape, HloOpcode::kDot, a, b)); + + auto one = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0))); + auto add_operand = builder.AddInstruction( + HloInstruction::CreateBroadcast(data_shape, one, {1})); + + auto add = builder.AddInstruction(HloInstruction::CreateBinary( + data_shape, HloOpcode::kAdd, dot, add_operand)); + + BuildModule(builder.Build()); + auto fusion = computation_->CreateFusionInstruction( + {add, dot}, HloInstruction::FusionKind::kOutput); + RunAnalysis(); + + // Output fused dot add should be able to share buffer with 'add_operand'.
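+ // ('add_operand' feeds the kAdd fused root at the operand index not taken + // by the kDot, which is the pattern CanShareOperandBufferWithUser accepts + // for kOutput fusion.)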
+ EXPECT_TRUE(CanShareOperandBufferWithUser(add_operand, {}, fusion, {}, + *points_to_analysis_)); +} + +TEST_F(CanShareOperandBufferWithUserTest, FusedTransposeDotAdd) { + auto builder = HloComputation::Builder(TestName()); + Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2}); + + auto a = builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR2<float>({{1.0, 0.0}, {0.0, 1.0}}))); + auto b = builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR2<float>({{2.0, 2.0}, {2.0, 2.0}}))); + auto b_t = builder.AddInstruction( + HloInstruction::CreateTranspose(data_shape, b, {1, 0})); + + auto dot = builder.AddInstruction( + HloInstruction::CreateBinary(data_shape, HloOpcode::kDot, a, b_t)); + + auto one = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0))); + auto add_operand = builder.AddInstruction( + HloInstruction::CreateBroadcast(data_shape, one, {1})); + + auto add = builder.AddInstruction(HloInstruction::CreateBinary( + data_shape, HloOpcode::kAdd, dot, add_operand)); + + BuildModule(builder.Build()); + + auto nested_fusion = computation_->CreateFusionInstruction( + {dot, b_t}, HloInstruction::FusionKind::kTransposeDot); + + auto fusion = computation_->CreateFusionInstruction( + {add, nested_fusion}, HloInstruction::FusionKind::kOutput); + RunAnalysis(); + + // Output fused transpose-dot-add should be able to share buffer with + // 'add_operand'. + EXPECT_TRUE(CanShareOperandBufferWithUser(add_operand, {}, fusion, {}, + *points_to_analysis_)); +} + +TEST_F(CanShareOperandBufferWithUserTest, OutputFusionCantAliasOperandBuffer) { + auto builder = HloComputation::Builder(TestName()); + Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2}); + + auto one = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0))); + auto operand = builder.AddInstruction( + HloInstruction::CreateBroadcast(data_shape, one, {1})); + + auto reverse = builder.AddInstruction( + HloInstruction::CreateReverse(data_shape, operand, {0, 1})); + + auto two = builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR2<float>({{2.0, 2.0}, {2.0, 2.0}}))); + + auto add = builder.AddInstruction( + HloInstruction::CreateBinary(data_shape, HloOpcode::kAdd, reverse, two)); + + BuildModule(builder.Build()); + auto fusion = computation_->CreateFusionInstruction( + {add, two, reverse}, HloInstruction::FusionKind::kOutput); + RunAnalysis(); + + // Output fused operand->reverse->add cannot alias operand buffer 'operand'.
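+ // (The operand reaches the kAdd fused root through a kReverse rather than + // a kDot or a kTransposeDot fusion, so the output-fusion sharing rule does + // not apply.)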
+ EXPECT_FALSE(CanShareOperandBufferWithUser(operand, {}, fusion, {}, + *points_to_analysis_)); +} + +TEST_F(CanShareOperandBufferWithUserTest, WhileCanShare) { + Shape data_shape = ShapeUtil::MakeShape(F32, {8}); + + auto make_cond = [this, &data_shape]() { + auto builder = HloComputation::Builder(TestName() + ".Cond"); + auto data = builder.AddInstruction( + HloInstruction::CreateParameter(0, data_shape, "data")); + builder.AddInstruction(HloInstruction::CreateBinary( + ShapeUtil::MakeShape(PRED, {}), HloOpcode::kEq, data, data)); + return builder.Build(); + }; + + auto make_body = [this, &data_shape]() { + auto builder = HloComputation::Builder(TestName() + ".Body"); + auto data = builder.AddInstruction( + HloInstruction::CreateParameter(0, data_shape, "data")); + builder.AddInstruction( + HloInstruction::CreateBinary(data_shape, HloOpcode::kAdd, data, data)); + return builder.Build(); + }; + + module_ = CreateNewModule(); + HloComputation* cond_computation = + module_->AddEmbeddedComputation(make_cond()); + HloComputation* body_computation = + module_->AddEmbeddedComputation(make_body()); + + auto builder = HloComputation::Builder(TestName()); + auto data = builder.AddInstruction( + HloInstruction::CreateParameter(0, data_shape, "data")); + auto whil = builder.AddInstruction(HloInstruction::CreateWhile( + data_shape, cond_computation, body_computation, data)); + computation_ = module_->AddEntryComputation(builder.Build()); + + RunAnalysis(); + + // The While instruction can share with the data operand. + EXPECT_TRUE( + CanShareOperandBufferWithUser(data, {}, whil, {}, *points_to_analysis_)); +} + +} // namespace +} // namespace xla + +int main(int argc, char** argv) { + return xla::ParseDebugOptionsFlagsAndRunTests(argc, argv); +} diff --git a/tensorflow/compiler/xla/service/llvm_ir/BUILD b/tensorflow/compiler/xla/service/llvm_ir/BUILD index 1edfec4dae5..12b2762f0ed 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/BUILD +++ b/tensorflow/compiler/xla/service/llvm_ir/BUILD @@ -48,8 +48,6 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/legacy_flags:llvm_util_flags", - "//tensorflow/compiler/xla/service:buffer_assignment", - "//tensorflow/compiler/xla/service:hlo_module_config", "//tensorflow/core:lib", "@llvm//:core", "@llvm//:support", @@ -80,6 +78,7 @@ cc_library( deps = [ ":ir_array", ":llvm_util", + "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", diff --git a/tensorflow/compiler/xla/service/llvm_ir/README.md b/tensorflow/compiler/xla/service/llvm_ir/README.md index 9fe7152477f..9e4cdd45dca 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/README.md +++ b/tensorflow/compiler/xla/service/llvm_ir/README.md @@ -1,2 +1,2 @@ -Common utilites and abstractions for handling and emitting LLVM IR for XLA +Common utilities and abstractions for handling and emitting LLVM IR for XLA backends. diff --git a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc index a552ea0218a..02710ff57f6 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc @@ -26,35 +26,41 @@ limitations under the License. namespace xla { namespace llvm_ir { +// Sentry allocation used to represent parameters of the entry computation in +// alias_scope_metadata_ and noalias_metadata_. 
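+// The sentry's fields are placeholders (index -1, size 0); it exists only so +// that all entry parameters map to one distinct key in those maps.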
+static const BufferAllocation* kParameterAllocation = new BufferAllocation( + /*index=*/-1, /*size=*/0, /*is_thread_local=*/false, /*is_reusable=*/false, + LogicalBuffer::Color(0)); + void AliasAnalysis::AddAliasingInformationToIrArray(const HloInstruction& hlo, llvm_ir::IrArray* array) { - BufferAllocation::Index buffer_index; + BufferAllocation::Slice buffer_slice; if (hlo.opcode() == HloOpcode::kParameter) { // Parameters may alias with each other but may not alias with our temporary // buffers. - buffer_index = kParameterAliasSet; + buffer_slice = BufferAllocation::Slice(kParameterAllocation, 0, 0); } else { - const std::set<BufferAllocation> allocations = - assignment_.GetAllocations(&hlo, /*index=*/{}); - if (allocations.empty() || allocations.size() > 1) { - // Skip HLOs which don't have buffers a buffer assigned or for which the + const std::set<BufferAllocation::Slice> slices = + assignment_.GetAllSlices(&hlo, /*index=*/{}); + if (slices.empty() || slices.size() > 1) { + // Skip HLOs which don't have a buffer assigned or for which the // buffer can't be determined statically. We cannot determine their // aliasing properties in these cases. return; } - buffer_index = allocations.begin()->index(); + buffer_slice = *slices.begin(); } - llvm::MDNode*& alias_scope_md = alias_scope_metadata_[buffer_index]; + llvm::MDNode*& alias_scope_md = alias_scope_metadata_[buffer_slice]; if (alias_scope_md == nullptr) { alias_scope_md = - GetAliasScopeMetadataForBuffer(buffer_index, GetAliasDomain()); + GetAliasScopeMetadataForBuffer(buffer_slice, GetAliasDomain()); } array->AddAliasScopeMetadata(alias_scope_md); - llvm::MDNode*& noalias_md = noalias_metadata_[buffer_index]; + llvm::MDNode*& noalias_md = noalias_metadata_[buffer_slice]; if (noalias_md == nullptr) { - noalias_md = GetNoaliasMetadataForBuffer(buffer_index, GetAliasDomain(), + noalias_md = GetNoaliasMetadataForBuffer(buffer_slice, GetAliasDomain(), assignment_, hlo); } array->AddNoaliasMetadata(noalias_md); @@ -80,7 +86,7 @@ llvm::MDNode* AliasAnalysis::GetAliasDomain() { } llvm::MDNode* AliasAnalysis::GetAliasScopeMetadataForBuffer( - BufferAllocation::Index buffer_index, llvm::MDNode* domain) { + const BufferAllocation::Slice& buffer_slice, llvm::MDNode* domain) { legacy_flags::AliasAnalysisFlags* flags = legacy_flags::GetAliasAnalysisFlags(); if (!flags->xla_emit_alias_scope) { @@ -89,20 +95,19 @@ llvm::MDNode* AliasAnalysis::GetAliasScopeMetadataForBuffer( // While we could synthesize an alias.scope, doing so is not more profitable // than LLVM's default behavior.
- if (buffer_index == kParameterAliasSet) { + if (buffer_slice.allocation() == kParameterAllocation) { return nullptr; } llvm::MDBuilder metadata_builder(domain->getContext()); llvm::MDNode* scope = metadata_builder.createAliasScope( - AsStringRef(tensorflow::strings::StrCat("buffer: ", buffer_index)), - domain); + AsStringRef("buffer: " + buffer_slice.ToString()), domain); llvm::MDNode* scope_list = llvm::MDNode::get(domain->getContext(), scope); return scope_list; } llvm::MDNode* AliasAnalysis::GetNoaliasMetadataForBuffer( - BufferAllocation::Index buffer_index, llvm::MDNode* domain, + const BufferAllocation::Slice& buffer_slice, llvm::MDNode* domain, const BufferAssignment& assignment, const HloInstruction& hlo) { legacy_flags::AliasAnalysisFlags* flags = legacy_flags::GetAliasAnalysisFlags(); @@ -147,18 +152,20 @@ llvm::MDNode* AliasAnalysis::GetNoaliasMetadataForBuffer( add_buffers_to_worklist(operand); } - std::unordered_set<BufferAllocation::Index> buffers; + tensorflow::gtl::FlatSet<BufferAllocation::Slice, BufferAllocation::Slice::Hasher> + buffers; for (const LogicalBuffer* buffer : worklist) { // Skip buffers which cannot be added to the noalias set. if (!assignment.HasAllocation(*buffer) || buffer->instruction()->opcode() == HloOpcode::kParameter) { continue; } - BufferAllocation::Index noalias_index = - assignment.GetAssignedAllocation(*buffer).index(); - // Our buffer must not noalias itself. - if (noalias_index != buffer_index) { - buffers.insert(noalias_index); + const BufferAllocation::Slice noalias_slice = + assignment.GetAssignedAllocation(*buffer).GetSlice(*buffer); + // Our buffer must not overlap with the noalias slice. + if (!buffer_slice.OverlapsWith(noalias_slice)) { + buffers.insert(noalias_slice); // Some instructions have too many operands, causing the noalias set to be // too large. To reduce compilation time (b/31901575), truncate noalias // sets to at most 500 elements. @@ -180,10 +187,9 @@ llvm::MDNode* AliasAnalysis::GetNoaliasMetadataForBuffer( llvm::MDBuilder metadata_builder(domain->getContext()); std::vector<llvm::Metadata*> scopes; - for (BufferAllocation::Index noalias_index : buffers) { + for (const BufferAllocation::Slice noalias_slice : buffers) { llvm::MDNode* scope = metadata_builder.createAliasScope( - AsStringRef(tensorflow::strings::StrCat("buffer: ", noalias_index)), - domain); + AsStringRef("buffer: " + noalias_slice.ToString()), domain); scopes.push_back(scope); } llvm::MDNode* noalias_list = diff --git a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h index d8d45dd49b3..9eb1cbaa341 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h +++ b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h @@ -16,13 +16,13 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_ALIAS_ANALYSIS_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_ALIAS_ANALYSIS_H_ -#include <unordered_map> - #include "external/llvm/include/llvm/IR/Module.h" #include "tensorflow/compiler/xla/service/buffer_assignment.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h" #include "tensorflow/compiler/xla/types.h" +#include "tensorflow/core/lib/gtl/flatmap.h" +#include "tensorflow/core/lib/gtl/flatset.h" #include "tensorflow/core/lib/strings/strcat.h" namespace xla { @@ -44,20 +44,20 @@ class AliasAnalysis { // Returns a unique alias domain for this emitter. llvm::MDNode* GetAliasDomain(); - // Returns an alias.scope metadata node corresponding to a given buffer index.
+ // Returns an alias.scope metadata node corresponding to a given buffer slice. llvm::MDNode* GetAliasScopeMetadataForBuffer( - BufferAllocation::Index buffer_index, llvm::MDNode* domain); + const BufferAllocation::Slice& buffer_slice, llvm::MDNode* domain); - // Returns a noalias metadata node corresponding to a given buffer index. + // Returns a noalias metadata node corresponding to a given buffer slice. // - // |buffer_index| is the buffer index. + // |buffer_slice| is the buffer slice. // // |domain| corresponds to the alias scope domain as documented at // http://llvm.org/docs/LangRef.html#noalias-and-alias-scope-metadata // // |hlo| is the instruction we are computing a noalias set for. llvm::MDNode* GetNoaliasMetadataForBuffer( - BufferAllocation::Index buffer_index, llvm::MDNode* domain, + const BufferAllocation::Slice& buffer_slice, llvm::MDNode* domain, const BufferAssignment& assignment, const HloInstruction& hlo); // The HLO module we are compiling for. @@ -73,18 +73,18 @@ class AliasAnalysis { // Holds the alias domain for this computation. llvm::MDNode* alias_domain_ = nullptr; - // Index in alias_scope_metadata_ and noalias_metadata_ for parameters - // of the entry computation which have special aliasing properties. - static constexpr int kParameterAliasSet = -1; - - // A map from a buffer index to metadata corresponding to its alias.scope + // A map from a buffer slice to metadata corresponding to its alias.scope // metadata. The index kParameterAliasSet is used to hold aliasing // information for parameters. - std::unordered_map<BufferAllocation::Index, llvm::MDNode*> alias_scope_metadata_; + tensorflow::gtl::FlatMap<BufferAllocation::Slice, llvm::MDNode*, BufferAllocation::Slice::Hasher> + alias_scope_metadata_; - // A map from a buffer index to metadata corresponding to its noalias + // A map from a buffer slice to metadata corresponding to its noalias // metadata. - std::unordered_map<BufferAllocation::Index, llvm::MDNode*> noalias_metadata_; + tensorflow::gtl::FlatMap<BufferAllocation::Slice, llvm::MDNode*, BufferAllocation::Slice::Hasher> + noalias_metadata_; }; } // namespace llvm_ir diff --git a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h index 303bb3ee6b9..79007b7099a 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h +++ b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h @@ -62,6 +62,13 @@ class FusedIrEmitter : public DfsHloVisitorWithDefault { // Returns the generator function for the given instruction. Generator GetGenerator(const HloInstruction* instruction) const; + // Returns the IR value for GetTupleElement instruction 'hlo'.
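+ // CHECK-fails if no value has been emitted for 'hlo'.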
+ llvm::Value* GetIrValueForGTE(const HloInstruction* hlo) const { + auto it = gte_values_.find(hlo); + CHECK(it != gte_values_.end()); + return it->second; + } + private: // Arrays of parameters of fusion instruction tensorflow::gtl::ArraySlice<llvm_ir::IrArray> parameter_arrays_; diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc b/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc index 38728d2e1f3..e401305ae73 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc @@ -236,10 +236,8 @@ llvm::Value* IrArray::EmitReadArrayElement(const Index& index, llvm::LoadInst* load = ir_builder->CreateLoad(element_address); llvm_ir::SetTbaaForInstruction(load, GetShape(), /*is_pointer_to=*/false); - for (const std::pair<int, llvm::MDNode*>& kind_md_pair : metadata_) { - int kind = kind_md_pair.first; - llvm::MDNode* md = kind_md_pair.second; - load->setMetadata(kind, md); + for (const auto& kind_md_pair : metadata_) { + load->setMetadata(kind_md_pair.first, kind_md_pair.second); } return load; } @@ -250,11 +248,9 @@ void IrArray::EmitWriteArrayElement(const Index& index, llvm::Value* value, llvm::StoreInst* store = ir_builder->CreateStore(value, element_address); llvm_ir::SetTbaaForInstruction(store, GetShape(), /*is_pointer_to=*/false); - for (const std::pair<int, llvm::MDNode*>& kind_md_pair : metadata_) { - int kind = kind_md_pair.first; - CHECK_NE(kind, llvm::LLVMContext::MD_invariant_load); - llvm::MDNode* md = kind_md_pair.second; - store->setMetadata(kind, md); + for (const auto& kind_md_pair : metadata_) { + CHECK_NE(kind_md_pair.first, llvm::LLVMContext::MD_invariant_load); + store->setMetadata(kind_md_pair.first, kind_md_pair.second); } } diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc index 4ccded61e73..97f1b8ce308 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc @@ -22,6 +22,7 @@ limitations under the License. #include "external/llvm/include/llvm/IR/Function.h" #include "external/llvm/include/llvm/IR/Instructions.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" +#include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/strings/strcat.h" diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h index 0cc82b040d2..60ac0444bcd 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h +++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h @@ -47,22 +47,22 @@ class ForLoop { // created exit basic block. Instructions before the insert point remain in // the insert BB: // - // /--------------\ /----------------\ + // +--------------+ +----------------+ // | insert BB | | insert BB | // | ... | | (preheader BB) | // | %foo = ... | | ... | // insert point ->| %bar = ... | ===> | %foo = ... | - // | ... | \----------------/ + // | ... | +----------------+ - // \--------------/ | + // +--------------+ | // V // [[ LOOP BBs ]] // | // V - // /--------------\ + // +--------------+ // | exit BB | // | %bar = ... | // | ... | - // \--------------/ + // +--------------+ // // `suffix` is a string used to disambiguate variable and basic block names // emitted in LLVM IR.
This string is appended to the name of the induction @@ -82,31 +82,31 @@ class ForLoop { // do_stuff(i); // } // - // /--------------\ + // +--------------+ // | preheader BB | // | i = 0 | - // \--------------/ + // +--------------+ // | // V - // /-------------\ + // +-------------+ // | header BB |<-+ // | if i < n: | | // | goto body | | // | else: | | // | goto exit | | - // \-------------/ | + // +-------------+ | // | | | // +--------+ | | // | V | - // | /-------------\ | + // | +-------------+ | // | | body BB | | // | | dostuff(i) |--+ // | | ++i | - // | \-------------/ + // | +-------------+ // | - // | /-------------\ + // | +-------------+ // +->| exit BB | - // \-------------/ + // +-------------+ // // Caller-emitted code to execute within the loop should be placed within the // "body" basic block. diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc index 99d0d0e1c42..ff2f4cd693c 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc @@ -449,25 +449,23 @@ int64 ByteSizeOf(const Shape& shape, const llvm::DataLayout& data_layout) { return ShapeUtil::ByteSizeOf(shape, pointer_size); } -llvm::FastMathFlags GetFastMathFlags(const HloModuleConfig& config) { +llvm::FastMathFlags GetFastMathFlags(bool fast_math_enabled) { llvm::FastMathFlags flags; - if (!config.fast_math_disabled()) { + if (fast_math_enabled) { // UnsafeAlgebra implies NoInfs, NoNaNs, NoSignedZeros, and AllowReciprocal. flags.setUnsafeAlgebra(); } return flags; } -void SetTargetOptions(const HloModuleConfig& config, +void SetTargetOptions(bool fast_math_enabled, llvm::TargetOptions* target_options) { - bool fast = !config.fast_math_disabled(); // In LLVM backend flags, UnsafeFPMath does not explicitly imply // NoInfs, etc. - target_options->UnsafeFPMath = fast; - target_options->NoInfsFPMath = fast; - target_options->NoNaNsFPMath = fast; - target_options->NoSignedZerosFPMath = fast; - target_options->LessPreciseFPMADOption = fast; + target_options->UnsafeFPMath = fast_math_enabled; + target_options->NoInfsFPMath = fast_math_enabled; + target_options->NoNaNsFPMath = fast_math_enabled; + target_options->NoSignedZerosFPMath = fast_math_enabled; } } // namespace llvm_ir diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h index 28488ca9991..7b09c1f8314 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h +++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h @@ -27,8 +27,7 @@ limitations under the License. #include "external/llvm/include/llvm/IR/Module.h" #include "external/llvm/include/llvm/IR/Value.h" #include "external/llvm/include/llvm/Support/raw_ostream.h" -#include "tensorflow/compiler/xla/service/buffer_assignment.h" -#include "tensorflow/compiler/xla/service/hlo_module_config.h" +#include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/stringpiece.h" @@ -130,7 +129,7 @@ llvm::AllocaInst* EmitAllocaAtFunctionEntryWithCount( llvm::Type* type, llvm::Value* element_count, tensorflow::StringPiece name, llvm::IRBuilder<>* ir_builder, int alignment = 0); -// Creates a basic block with the same context and funtion as for the +// Creates a basic block with the same context and function as for the // builder. Inserts at the end of the function if insert_before is // null. 
llvm::BasicBlock* CreateBasicBlock(llvm::BasicBlock* insert_before, @@ -219,11 +218,11 @@ int64 ByteSizeOf(const Shape& shape, const llvm::DataLayout& data_layout); // Gets an llvm::FastMathFlags that reflects the settings in the given // module config. -llvm::FastMathFlags GetFastMathFlags(const HloModuleConfig& config); +llvm::FastMathFlags GetFastMathFlags(bool fast_math_enabled); // Sets values in the given TargetOptions struct according to the given // compilation options. -void SetTargetOptions(const HloModuleConfig& config, +void SetTargetOptions(bool fast_math_enabled, llvm::TargetOptions* target_options); } // namespace llvm_ir diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc index 30bf450c5b1..131c2ee87b0 100644 --- a/tensorflow/compiler/xla/service/local_service.cc +++ b/tensorflow/compiler/xla/service/local_service.cc @@ -19,6 +19,7 @@ limitations under the License. #include #include +#include "tensorflow/compiler/xla/legacy_flags/service_flags.h" #include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/backend.h" #include "tensorflow/compiler/xla/service/computation_layout.h" @@ -37,6 +38,7 @@ limitations under the License. #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" @@ -44,65 +46,6 @@ namespace se = ::perftools::gputools; namespace xla { -LocalExecuteOptions& LocalExecuteOptions::set_platform( - perftools::gputools::Platform* platform) { - platform_ = platform; - return *this; -} - -perftools::gputools::Platform* LocalExecuteOptions::platform() const { - return platform_; -} - -LocalExecuteOptions& LocalExecuteOptions::set_device_ordinal( - int device_ordinal) { - device_ordinal_ = device_ordinal; - return *this; -} - -int LocalExecuteOptions::device_ordinal() const { return device_ordinal_; } - -LocalExecuteOptions& LocalExecuteOptions::set_allocator( - DeviceMemoryAllocator* allocator) { - allocator_ = allocator; - return *this; -} - -DeviceMemoryAllocator* LocalExecuteOptions::allocator() const { - return allocator_; -} - -LocalExecuteOptions& LocalExecuteOptions::set_stream( - perftools::gputools::Stream* stream) { - stream_ = stream; - return *this; -} - -perftools::gputools::Stream* LocalExecuteOptions::stream() const { - return stream_; -} - -LocalExecuteOptions& LocalExecuteOptions::set_execution_profile( - ExecutionProfile* profile) { - profile_ = profile; - return *this; -} - -ExecutionProfile* LocalExecuteOptions::execution_profile() const { - return profile_; -} - -LocalExecuteOptions& LocalExecuteOptions::set_result_layout( - const Shape& shape_with_layout) { - has_result_shape_with_layout_ = true; - result_shape_with_layout_ = shape_with_layout; - return *this; -} - -const Shape* LocalExecuteOptions::result_layout() const { - return has_result_shape_with_layout_ ? 
&result_shape_with_layout_ : nullptr; -} - /* static */ StatusOr> LocalService::NewService( perftools::gputools::Platform* platform) { ServiceOptions default_options; @@ -117,9 +60,12 @@ const Shape* LocalExecuteOptions::result_layout() const { TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform()); } - TF_ASSIGN_OR_RETURN( - std::unique_ptr backend, - Backend::CreateBackend(platform, options.number_of_replicas())); + BackendOptions backend_options; + backend_options.set_platform(platform) + .set_number_of_replicas(options.number_of_replicas()) + .set_intra_op_parallelism_threads(options.intra_op_parallelism_threads()); + TF_ASSIGN_OR_RETURN(std::unique_ptr backend, + Backend::CreateBackend(backend_options)); TF_ASSIGN_OR_RETURN(std::unique_ptr compute_constant_backend, CreateComputeConstantBackend()); @@ -134,21 +80,6 @@ LocalService::LocalService(std::unique_ptr execute_backend, runs_in_client_process_ = true; } -tensorflow::Status LocalService::ResolveArguments( - const tensorflow::gtl::ArraySlice arguments, - int device_ordinal, - std::vector* argument_ptrs) { - TF_ASSIGN_OR_RETURN(std::vector arg_allocations, - ResolveAndValidateArguments( - arguments, execute_backend_.get(), device_ordinal)); - argument_ptrs->resize(arg_allocations.size()); - for (int i = 0; i < arguments.size(); ++i) { - const Allocation& allocation = *arg_allocations[i]; - (*argument_ptrs)[i] = allocation.device_memory(); - } - return tensorflow::Status::OK(); -} - namespace { // Returns the space required to allocate a shape. If // allocate_space_for_deep_copy the space includes all sub-buffers of @@ -159,12 +90,11 @@ int64 RequiredSpace(const Shape& shape, bool allocate_space_for_deep_copy, // TODO(b/33492279) remove once no devices represent result tuples as // contiguous buffers. if (allocate_space_for_deep_copy) { - TF_CHECK_OK(ShapeUtil::ForEachSubshape( + ShapeUtil::ForEachSubshape( shape, [&size, transfer_manager](const Shape& subshape, const ShapeIndex& /*index*/) { size += transfer_manager->GetByteSizeRequirement(subshape); - return tensorflow::Status::OK(); - })); + }); } return size; } @@ -185,302 +115,6 @@ StatusOr LocalService::AllocateBufferOnDevice( allocation_size)); } -StatusOr> LocalService::ExecuteLocally( - const ComputationHandle& computation, - tensorflow::gtl::ArraySlice arguments, - const LocalExecuteOptions& options) { - return ExecuteLocallyInternal(computation, arguments, options, - /*preallocated_result_buffer=*/nullptr); -} - -tensorflow::Status LocalService::ExecuteLocally( - const ComputationHandle& computation, - tensorflow::gtl::ArraySlice arguments, - const LocalExecuteOptions& options, ShapedBuffer* result_buffer) { - TF_ASSIGN_OR_RETURN( - std::unique_ptr null_buffer, - ExecuteLocallyInternal(computation, arguments, options, result_buffer)); - // Because the result is written into result_buffer, a null ShapedBuffer - // pointer should have been returned. 
- CHECK_EQ(nullptr, null_buffer.get()); - return tensorflow::Status::OK(); -} - -StatusOr>> -LocalService::CompileAheadOfTime( - const tensorflow::gtl::ArraySlice - computations, - const AotCompilationOptions& options) { - std::vector> hlo_modules; - std::vector> module_configs; - for (const AheadOfTimeComputationInstance& instance : computations) { - TF_ASSIGN_OR_RETURN(UserComputation * user_computation, - computation_tracker_.Resolve(instance.computation)); - VersionedComputationHandle versioned_handle = - user_computation->GetVersionedHandle(); - - TF_ASSIGN_OR_RETURN(std::unique_ptr hlo_module, - computation_tracker_.BuildHloModule( - versioned_handle, - /*include_unused_parameters=*/true)); - hlo_modules.push_back(std::move(hlo_module)); - - TF_ASSIGN_OR_RETURN( - std::shared_ptr program_shape, - user_computation->ComputeProgramShape(versioned_handle.version)); - - module_configs.push_back(MakeUnique(*program_shape)); - HloModuleConfig* module_config = module_configs.back().get(); - auto* computation_layout = - module_config->mutable_entry_computation_layout(); - for (int i = 0; i < instance.argument_layouts.size(); ++i) { - const Shape& argument_layout = *instance.argument_layouts[i]; - if (ShapeUtil::IsTuple(argument_layout)) { - return Unimplemented("tuple arguments not supported yet"); - } - TF_RETURN_IF_ERROR( - computation_layout->mutable_parameter_layout(i)->CopyLayoutFromShape( - argument_layout)); - } - TF_RETURN_IF_ERROR( - computation_layout->mutable_result_layout()->CopyLayoutFromShape( - *instance.result_layout)); - } - - return execute_backend_->compiler()->CompileAheadOfTime( - std::move(hlo_modules), std::move(module_configs), MakeHloDumper(), - options); -} - -tensorflow::Status LocalService::ValidateExecuteOptions( - const ProgramShape& program_shape, - tensorflow::gtl::ArraySlice argument_layouts, - const LocalExecuteOptions& options, - const ShapedBuffer* preallocated_result_buffer) { - if (argument_layouts.size() != program_shape.parameters_size()) { - return InvalidArgument( - "invalid number of arguments for computation: expected %d, got %zu", - program_shape.parameters_size(), argument_layouts.size()); - } - - if (options.stream()) { - if (!options.stream()->ok()) { - return InvalidArgument("stream is uninitialized or in an error state"); - } - - // Check stream matches service platform. - const se::Platform* stream_platform = - options.stream()->parent()->platform(); - if (stream_platform != execute_backend_->platform()) { - return InvalidArgument( - "stream is for platform %s, but service targets platform %s", - stream_platform->Name().c_str(), - execute_backend_->platform()->Name().c_str()); - } - - // Cannot specify platform or device_ordinal with a stream. The stream - // determines these values. - if (options.device_ordinal() >= 0) { - return InvalidArgument( - "cannot set both device ordinal and stream options in " - "LocalExecuteOptions; the stream determines the device ordinal"); - } - if (options.platform()) { - return InvalidArgument( - "cannot set both platform and stream options in " - "LocalExecuteOptions; the stream determines the platform"); - } - } - if (options.platform() && - options.platform() != execute_backend_->platform()) { - return InvalidArgument( - "service platform (%s) does not match platform set in " - "LocalExecuteOptions (%s)", - execute_backend_->platform()->Name().c_str(), - options.platform()->Name().c_str()); - } - - // TODO(cwhipkey): validate the thread pool provided? 
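(An aside before the deleted validation code continues below: the `LocalExecuteOptions` plumbing being removed here is superseded in part by the `BackendOptions` builder used in `NewService` earlier in this patch. A minimal sketch of that construction path, assuming the includes already present in local_service.cc; the numeric values are placeholders, and the meaning of a negative thread count is an assumption.)

```c++
// Sketch only: BackendOptions, its chained setters, and
// Backend::CreateBackend come from this patch; the values are placeholders.
StatusOr<std::unique_ptr<Backend>> MakeDefaultBackend() {
  TF_ASSIGN_OR_RETURN(se::Platform* platform,
                      PlatformUtil::GetDefaultPlatform());
  BackendOptions backend_options;
  backend_options.set_platform(platform)
      .set_number_of_replicas(1)
      .set_intra_op_parallelism_threads(-1);  // assumed: < 0 selects a default
  return Backend::CreateBackend(backend_options);
}
```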
- - if (!options.allocator()) { - return InvalidArgument("an allocator must be provided to ExecuteLocally"); - } - - if (options.allocator()->platform() != execute_backend_->platform()) { - return InvalidArgument( - "allocator platform (%s) does not match service platform (%s)", - options.allocator()->platform()->Name().c_str(), - execute_backend_->platform()->Name().c_str()); - } - - if (preallocated_result_buffer != nullptr) { - if (options.result_layout()) { - return InvalidArgument( - "cannot set both result ShapedBuffer and result layout; the result " - "ShapedBuffer determines the result layout"); - } - if (!ShapeUtil::Compatible(preallocated_result_buffer->shape(), - program_shape.result())) { - return InvalidArgument( - "result ShapedBuffer of shape %s not compatible with computation " - "result shape %s", - ShapeUtil::HumanString(preallocated_result_buffer->shape()).c_str(), - ShapeUtil::HumanString(program_shape.result()).c_str()); - } - } - if (options.result_layout()) { - TF_RETURN_IF_ERROR(ValidateResultShapeWithLayout(*options.result_layout(), - program_shape.result())); - } - - // Check that all argument layouts are valid and the right shape. - for (int i = 0; i < argument_layouts.size(); ++i) { - const Shape& argument_shape = *argument_layouts[i]; - TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(argument_shape)); - if (!ShapeUtil::Compatible(argument_shape, program_shape.parameters(i))) { - return InvalidArgument( - "invalid argument shape for argument %d, expected %s, got %s", i, - ShapeUtil::HumanString(program_shape.parameters(i)).c_str(), - ShapeUtil::HumanString(argument_shape).c_str()); - } - } - - return tensorflow::Status::OK(); -} - -StatusOr> LocalService::ExecuteLocallyInternal( - const ComputationHandle& computation, - tensorflow::gtl::ArraySlice arguments, - const LocalExecuteOptions& options, - ShapedBuffer* preallocated_result_buffer) { - TF_ASSIGN_OR_RETURN(UserComputation * user_computation, - computation_tracker_.Resolve(computation)); - VersionedComputationHandle versioned_handle = - user_computation->GetVersionedHandle(); - - TF_ASSIGN_OR_RETURN( - std::shared_ptr program_shape, - user_computation->ComputeProgramShape(versioned_handle.version)); - - // Determine device ordinal the computation will run on. - int device_ordinal; - if (options.device_ordinal() >= 0) { - device_ordinal = options.device_ordinal(); - } else if (options.stream()) { - device_ordinal = options.stream()->parent()->device_ordinal(); - } else { - device_ordinal = execute_backend_->default_device_ordinal(); - } - - // Check that all arguments are on the right platform and device ordinal. - std::vector argument_layouts(arguments.size()); - for (int i = 0; i < arguments.size(); ++i) { - auto argument = arguments[i]; - if (argument->platform() != execute_backend_->platform() || - argument->device_ordinal() != device_ordinal) { - return InvalidArgument( - "computation to run on device %s but argument %d is on " - "device %s:%d", - execute_backend_->device_name(device_ordinal).c_str(), i, - argument->platform()->Name().c_str(), argument->device_ordinal()); - } - argument_layouts[i] = &argument->shape(); - } - - TF_RETURN_IF_ERROR(ValidateExecuteOptions( - *program_shape, argument_layouts, options, preallocated_result_buffer)); - - // Construct computation layout from the argument layouts. 
- auto module_config = MakeUnique(*program_shape); - module_config->set_has_hybrid_result(true); - module_config->set_replica_count(execute_backend_->Replicas().size()); - std::vector argument_buffers; - auto* computation_layout = module_config->mutable_entry_computation_layout(); - for (int i = 0; i < arguments.size(); ++i) { - const ShapedBuffer* argument = arguments[i]; - if (ShapeUtil::IsTuple(argument->shape())) { - return Unimplemented("tuple arguments not supported yet"); - } - argument_buffers.push_back(argument->buffer(/*index=*/{})); - TF_RETURN_IF_ERROR( - computation_layout->mutable_parameter_layout(i)->CopyLayoutFromShape( - argument->shape())); - } - if (options.result_layout()) { - TF_RETURN_IF_ERROR( - computation_layout->mutable_result_layout()->CopyLayoutFromShape( - *options.result_layout())); - } else if (preallocated_result_buffer != nullptr) { - TF_RETURN_IF_ERROR( - computation_layout->mutable_result_layout()->CopyLayoutFromShape( - preallocated_result_buffer->shape())); - } else { - computation_layout->mutable_result_layout()->SetToDefaultLayout(); - } - - ExecutableRunOptions run_options; - run_options.set_allocator(options.allocator()); - run_options.set_inter_op_thread_pool( - execute_backend_->inter_op_thread_pool()); - run_options.set_intra_op_thread_pool( - execute_backend_->eigen_intra_op_thread_pool_device()); - - // "acquired_stream" owns the stream used for execution if no stream is given. - std::unique_ptr acquired_stream; - if (options.stream()) { - run_options.set_stream(options.stream()); - } else { - se::StreamExecutor* stream_executor; - if (options.device_ordinal() >= 0) { - TF_ASSIGN_OR_RETURN( - stream_executor, - execute_backend_->stream_executor(options.device_ordinal())); - } else { - stream_executor = execute_backend_->default_stream_executor(); - } - TF_ASSIGN_OR_RETURN(acquired_stream, - execute_backend_->AcquireStream(stream_executor)); - run_options.set_stream(acquired_stream.get()); - } - auto stream_releaser = - ::tensorflow::gtl::MakeCleanup([this, &acquired_stream]() { - if (acquired_stream != nullptr) { - execute_backend_->ReleaseStream(std::move(acquired_stream)); - } - }); - - ExecutionProfile* profile = options.execution_profile(); - TF_ASSIGN_OR_RETURN( - std::shared_ptr executable, - BuildAndCacheExecutable(versioned_handle, std::move(module_config), - argument_buffers, execute_backend_.get(), - run_options.stream()->parent(), profile)); - - if (preallocated_result_buffer == nullptr) { - return Service::ExecuteOnStreamWrapper< - StatusOr>>( - executable.get(), &run_options, profile, - [&arguments](Executable* executable, - const ExecutableRunOptions* run_options, - HloExecutionProfile* hlo_execution_profile) { - return executable->ExecuteOnStream(run_options, arguments, - hlo_execution_profile); - }); - } else { - TF_RETURN_IF_ERROR(Service::ExecuteOnStreamWrapper( - executable.get(), &run_options, profile, - [&arguments, preallocated_result_buffer]( - Executable* executable, const ExecutableRunOptions* run_options, - HloExecutionProfile* hlo_execution_profile) { - return executable->ExecuteOnStream(run_options, arguments, - preallocated_result_buffer, - hlo_execution_profile); - })); - // To satisfy the return value type, Return a null ShapedBuffer pointer. 
- return std::unique_ptr(); - } -} - StatusOr> LocalService::CompileExecutable( const ComputationHandle& computation, const tensorflow::gtl::ArraySlice argument_layouts, @@ -519,6 +153,10 @@ StatusOr> LocalService::CompileExecutable( auto module_config = MakeUnique(*program_shape); module_config->set_has_hybrid_result(has_hybrid_result); module_config->set_replica_count(execute_backend_->Replicas().size()); + legacy_flags::ServiceFlags* flags = legacy_flags::GetServiceFlags(); + if (flags->xla_hlo_profile) { + module_config->enable_hlo_profiling(true); + } auto* computation_layout = module_config->mutable_entry_computation_layout(); for (int i = 0; i < argument_layouts.size(); ++i) { const Shape& shape = *argument_layouts[i]; diff --git a/tensorflow/compiler/xla/service/local_service.h b/tensorflow/compiler/xla/service/local_service.h index 9fe0d5993b3..767a3ab697f 100644 --- a/tensorflow/compiler/xla/service/local_service.h +++ b/tensorflow/compiler/xla/service/local_service.h @@ -31,60 +31,6 @@ limitations under the License. namespace xla { -// Computation execution options which may be set by the client when executing -// locally (via LocalClient::ExecuteLocally). -class LocalExecuteOptions { - public: - // Specifies the allocator to use during execution. Execution will fail if no - // allocator is provided. - LocalExecuteOptions& set_allocator(DeviceMemoryAllocator* allocator); - DeviceMemoryAllocator* allocator() const; - - // If set, this is the platform to run the computation on. This must match - // the underlying platform of the service. A value of nullptr means the - // platform is not set. - // TODO(b/28616830): Support multiple platforms. - LocalExecuteOptions& set_platform(perftools::gputools::Platform* platform); - perftools::gputools::Platform* platform() const; - - // If set, this is the device to run the computation on. Valid device_ordinal - // values are: 0 to # of devices - 1. These values are identical to the - // device ordinal values used by StreamExecutor. A value of < 0 means the - // ordinal is not set. - LocalExecuteOptions& set_device_ordinal(int device_ordinal); - int device_ordinal() const; - - // If set, this is the stream to run the computation on. The platform of the - // stream must match the service's platform. The device ordinal - // option (if set) must match the stream's device. A value of nullptr means - // the stream is not set. - LocalExecuteOptions& set_stream(perftools::gputools::Stream* stream); - perftools::gputools::Stream* stream() const; - - // If set, collect profile information during execution and fill the given - // ExecutionProfile object with the profile data. A value of nullptr means - // the profile is not set. - LocalExecuteOptions& set_execution_profile(ExecutionProfile* profile); - ExecutionProfile* execution_profile() const; - - // If set, this specifies the layout of the result of the computation. If not - // set, the service will chose the layout of the result. A Shape is used to - // store the layout to accomodate tuple result shapes. A value of nullptr - // means the shape is not set. 
- LocalExecuteOptions& set_result_layout(const Shape& shape_with_layout); - const Shape* result_layout() const; - - private: - DeviceMemoryAllocator* allocator_ = nullptr; - perftools::gputools::Platform* platform_ = nullptr; - int device_ordinal_ = -1; - perftools::gputools::Stream* stream_ = nullptr; - ExecutionProfile* profile_ = nullptr; - - bool has_result_shape_with_layout_ = false; - Shape result_shape_with_layout_; -}; - // Service implementation that extends the XLA Service to leverage running // in the same process as the client. class LocalService : public Service { @@ -97,14 +43,6 @@ class LocalService : public Service { static StatusOr> NewService( const ServiceOptions& options); - // For an array of arguments, validate that each is placed on the - // specified device_ordinal, and return the DeviceMemoryBase - // corresponding to each argument. - tensorflow::Status ResolveArguments( - const tensorflow::gtl::ArraySlice arguments, - int device_ordinal, - std::vector* argument_ptrs); - // Return a handle to a buffer large enough to hold shape, allocated // on device_ordinal. If allocate_space_for_deep_copy, the buffer is // large enough to hold all sub-buffers of a tuple shape, otherwise @@ -113,48 +51,6 @@ class LocalService : public Service { const Shape& shape, int device_ordinal, bool allocate_space_for_deep_copy); - // Execute the given computation with the given arguments and options with - // zero-copy data handling of arguments and result. - StatusOr> ExecuteLocally( - const ComputationHandle& computation, - tensorflow::gtl::ArraySlice arguments, - const LocalExecuteOptions& options); - - // Overload which writes the result into the given ShapedBuffer "result". - // Due to aliasing, not all buffers which comprise "result" may be utilized - // in the computation and thus be uninitialized. The |ShapedBuffer::buffer| - // or |ShapedBuffer::mutable_buffer| methods should be used to map an index to - // the initialized buffer. - // - // For example: - // Let 'result' be a ShapedBuffer holding a tuple with the same element, - // 'x', twice: (x, x). It is incorrect to assume that the second buffer - // which comprises 'result' is initialized. Instead, a mapping has been - // added to 'result' which can be used to recover the correct buffer. - // In this case, result->buffer({0}) should be used to extract the address of - // the first tuple element while result->buffer({1}) should be used for the - // second. - tensorflow::Status ExecuteLocally( - const ComputationHandle& computation, - tensorflow::gtl::ArraySlice arguments, - const LocalExecuteOptions& options, ShapedBuffer* result_buffer); - - // A description of a computation to compile using CompileAheadOfTime. - struct AheadOfTimeComputationInstance { - ComputationHandle computation; - std::vector argument_layouts; - const Shape* result_layout = nullptr; - }; - - // Compiles a list of computations for ahead-of-time execution. This is - // intended for use in static compilation. See - // |LocalClient::CompileAheadOfTime| for additional details. - StatusOr>> - CompileAheadOfTime( - const tensorflow::gtl::ArraySlice - computations, - const AotCompilationOptions& Options); - // Builds an Executable with the given argument layouts and options. If // result_layout is non-null, then the executable is compiled to produce a // result of the given layout. 
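With the local-execution entry points removed, compilation is exposed through `CompileExecutable`, whose doc comment ends just above. A hypothetical call site follows; the trailing parameters are abbreviated in this hunk, so their names and order here (`result_layout`, `has_hybrid_result`) are assumptions inferred from the implementation shown earlier in this patch:

```c++
// Sketch only: `handle` is a placeholder ComputationHandle, and the return
// type std::unique_ptr<Executable> is assumed from the doc comment above.
StatusOr<std::unique_ptr<Executable>> CompileForDefaultLayout(
    LocalService* local_service, const ComputationHandle& handle) {
  // Compile for a single F32[8,7] argument; let the service pick the result
  // layout by passing a null result_layout.
  Shape arg_shape = ShapeUtil::MakeShape(F32, {8, 7});
  std::vector<const Shape*> argument_layouts = {&arg_shape};
  return local_service->CompileExecutable(handle, argument_layouts,
                                          /*result_layout=*/nullptr,
                                          /*has_hybrid_result=*/true);
}
```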
@@ -168,24 +64,6 @@ class LocalService : public Service { std::unique_ptr compute_constant_backend); LocalService(const LocalService&) = delete; void operator=(const LocalService&) = delete; - - // Internal helper for executing a computation. If result_buffer is null then - // the result is returned as a ShapedBuffer. If result_buffer is non-null then - // the result is written into result_buffer and a null ShapedBuffer pointer is - // returned. - StatusOr> ExecuteLocallyInternal( - const ComputationHandle& computation, - tensorflow::gtl::ArraySlice arguments, - const LocalExecuteOptions& options, - ShapedBuffer* preallocated_result_buffer); - - // Validates the given options and argument layouts and returns an appropriate - // error code. - tensorflow::Status ValidateExecuteOptions( - const ProgramShape& program_shape, - tensorflow::gtl::ArraySlice arguments, - const LocalExecuteOptions& options, - const ShapedBuffer* preallocated_result_buffer); }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/logical_buffer.cc b/tensorflow/compiler/xla/service/logical_buffer.cc index 00e4b35d155..d24a592f46e 100644 --- a/tensorflow/compiler/xla/service/logical_buffer.cc +++ b/tensorflow/compiler/xla/service/logical_buffer.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/lib/strings/str_util.h" @@ -26,9 +27,9 @@ limitations under the License. namespace xla { string LogicalBuffer::ToString() const { - return tensorflow::strings::StrCat(instruction_->name(), "[", + return tensorflow::strings::StrCat(instruction_->FullyQualifiedName(), "[", tensorflow::str_util::Join(index_, ","), - "](#", id_, ")"); + "](#", id_, " @", color_.value(), ")"); } std::ostream& operator<<(std::ostream& out, const LogicalBuffer& buffer) { @@ -36,4 +37,26 @@ std::ostream& operator<<(std::ostream& out, const LogicalBuffer& buffer) { return out; } +/*static*/ LogicalBufferProto::Location LogicalBuffer::ToLocationProto( + const HloInstruction& instruction, const ShapeIndex& index) { + LogicalBufferProto::Location proto; + proto.set_computation_name(instruction.parent()->name()); + proto.set_instruction_name(instruction.name()); + for (const int64 index_entry : index) { + proto.add_shape_index(index_entry); + } + return proto; +} + +LogicalBufferProto LogicalBuffer::ToProto(const SizeFunction& size_fn) const { + LogicalBufferProto proto; + proto.set_id(id_); + proto.set_size(size_fn(*this)); + LogicalBufferProto::Location proto_location = + ToLocationProto(*instruction_, index_); + proto.mutable_defined_at()->Swap(&proto_location); + proto.set_color(color_.value()); + return proto; +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/logical_buffer.h b/tensorflow/compiler/xla/service/logical_buffer.h index 35a9935f44c..566cd01ea43 100644 --- a/tensorflow/compiler/xla/service/logical_buffer.h +++ b/tensorflow/compiler/xla/service/logical_buffer.h @@ -16,22 +16,23 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_LOGICAL_BUFFER_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_LOGICAL_BUFFER_H_ +#include #include #include #include +#include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/gtl/int_type.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" namespace xla { -struct HashLogicalBuffer; - // Class describing a contiguous sequence of elements (ie, C array) which form // the components of Shaped values in XLA. XLA arrays are trivially a // single LogicalBuffer. Tuple values are made up of more than one @@ -83,6 +84,8 @@ struct HashLogicalBuffer; // LogicalBuffer(%tuple_constant, {1, 1}) // Holds value "43" class LogicalBuffer { public: + TF_LIB_GTL_DEFINE_INT_TYPE(Color, int64); + // Id is a unique identifier for the LogicalBuffer to facilitate efficient // collections of LogicalBuffers with stable iteration order. // LogicalBuffers are typically created and accessed through @@ -90,11 +93,13 @@ class LogicalBuffer { // unique value. using Id = int64; - // Function which returns the size of a logical buffer in bytes. + // Functions which return the size and alignment of a logical buffer in bytes. using SizeFunction = std::function; + using AlignmentFunction = std::function; - LogicalBuffer(HloInstruction* instruction, const ShapeIndex& index, Id id) - : instruction_(instruction), index_(index), id_(id) {} + LogicalBuffer(HloInstruction* instruction, const ShapeIndex& index, Id id, + Color color) + : instruction_(instruction), index_(index), id_(id), color_(color) {} Id id() const { return id_; } @@ -105,6 +110,11 @@ class LogicalBuffer { // defined. Index used defined as in ShapeUtil::GetSubshape() const ShapeIndex& index() const { return index_; } + // Return the color of the logical buffer. Differently colored buffers can + // not be parts of the same allocation. + Color color() const { return color_; } + void set_color(Color color) { color_ = color; } + // Return the shape of the buffer. This reference points into the shape field // of the instruction defining the buffer. Therefore, the returned shape will // contain the layout of instruction, if any. @@ -126,29 +136,24 @@ class LogicalBuffer { bool IsArray() const { return ShapeUtil::IsArray(shape()); } string ToString() const; + LogicalBufferProto ToProto(const SizeFunction& size_fn) const; + + // Returns the LogicalBufferProto::Location that serializes the given + // instruction and index. + static LogicalBufferProto::Location ToLocationProto( + const HloInstruction& instruction, const ShapeIndex& index); private: - friend struct HashLogicalBuffer; HloInstruction* instruction_; ShapeIndex index_; Id id_; + Color color_; // Similar to HLO constructs (HloInstruction, etc), pointers are used for // comparison to equality, so disable all copying. 
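(Before the header hunk concludes below: because `Color` is defined with `TF_LIB_GTL_DEFINE_INT_TYPE`, it behaves as a strong integer typedef. Colors compare against each other, but there is no implicit conversion to or from a raw `int64`. A small sketch of that behavior, with illustrative values:)

```c++
// Sketch only: demonstrates the strong-typedef semantics of Color.
void ColorSketch(LogicalBuffer* buffer) {
  LogicalBuffer::Color red(0);
  LogicalBuffer::Color blue(1);
  CHECK(red != blue);          // colors are comparable among themselves
  CHECK_EQ(blue.value(), 1);   // the raw int64 must be requested explicitly
  // int64 bad = blue;         // would not compile: no implicit conversion
  buffer->set_color(blue);     // differently colored buffers never end up
                               // in the same allocation
}
```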
TF_DISALLOW_COPY_AND_ASSIGN(LogicalBuffer); }; -struct HashLogicalBuffer { - size_t operator()(const LogicalBuffer& b) const { - std::hash hasher; - size_t h = hasher(b.instruction_); - for (int i = 0; i < b.index_.size(); i++) { - h += static_cast(b.index_[i] << i); - } - return h; - } -}; - std::ostream& operator<<(std::ostream& out, const LogicalBuffer& buffer); } // namespace xla diff --git a/tensorflow/compiler/xla/service/pool.h b/tensorflow/compiler/xla/service/pool.h new file mode 100644 index 00000000000..8e710ebb6dc --- /dev/null +++ b/tensorflow/compiler/xla/service/pool.h @@ -0,0 +1,84 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_POOL_H_ +#define TENSORFLOW_COMPILER_XLA_POOL_H_ + +#include +#include + +#include "tensorflow/compiler/xla/ptr_util.h" +#include "tensorflow/core/platform/mutex.h" + +namespace xla { + +// Pool of values, which are created as needed and destroyed when the `Pool` is +// destroyed +template +class Pool { + public: + struct Deleter { + void operator()(T* ptr) { pool->Deallocate(ptr); } + + Pool* pool; + }; + + // A pointer to a taken element of a `Pool` which returns it to the pool on + // destruction + using SmartPtr = std::unique_ptr; + + // Constructs a `Pool` with given factory function, which need not be + // thread-safe. + explicit Pool(std::function()> factory) + : factory_(factory) {} + + explicit Pool() : Pool([]() { return MakeUnique(); }) {} + + // Returns a pointer to a value in the pool, creating a new value if none is + // free. The returned smart pointer returns the element to the pool on + // destruction. + // + // This method is thread-safe. + SmartPtr Allocate() { + tensorflow::mutex_lock lock(mu_); + T* ptr; + if (!xs_.empty()) { + ptr = std::move(xs_.back()).release(); + xs_.pop_back(); + } else { + ptr = factory_().release(); + } + Deleter del = {this}; + return std::unique_ptr(ptr, del); + } + + private: + // Puts a pointer to a value back into the pool, leaving it free for future + // use. + // + // This method is thread-safe. + void Deallocate(T* ptr) { + tensorflow::mutex_lock lock(mu_); + xs_.push_back(std::unique_ptr(ptr)); + } + + const std::function()> factory_ GUARDED_BY(mu_); + std::vector> xs_ GUARDED_BY(mu_); + tensorflow::mutex mu_; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_POOL_H_ diff --git a/tensorflow/compiler/xla/service/pool_test.cc b/tensorflow/compiler/xla/service/pool_test.cc new file mode 100644 index 00000000000..8c4fe258e38 --- /dev/null +++ b/tensorflow/compiler/xla/service/pool_test.cc @@ -0,0 +1,40 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/pool.h"
+
+#include "tensorflow/compiler/xla/test_helpers.h"
+
+namespace xla {
+namespace {
+
+using PoolTest = ::testing::Test;
+
+TEST_F(PoolTest, Test) {
+  Pool<int> pool;
+
+  {
+    auto ptr = pool.Allocate();
+    EXPECT_NE(nullptr, ptr.get());
+    *ptr = 5;
+  }
+
+  auto ptr = pool.Allocate();
+  EXPECT_NE(nullptr, ptr.get());
+  EXPECT_EQ(5, *ptr);
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/reshape_mover.cc b/tensorflow/compiler/xla/service/reshape_mover.cc
index f8023f1c375..2d35ba5e548 100644
--- a/tensorflow/compiler/xla/service/reshape_mover.cc
+++ b/tensorflow/compiler/xla/service/reshape_mover.cc
@@ -13,16 +13,98 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+// Implementation note:
+//
+// The general idea behind this pass is that we're converting from this:
+//   %param.A = OldShape
+//   %param.B = OldShape
+//   %reshape.A = NewShape reshape(%param.A)
+//   %reshape.B = NewShape reshape(%param.B)
+//   %instruction = NewShape instruction(%reshape.A, %reshape.B)
+// To this:
+//   %param.A = OldShape
+//   %param.B = OldShape
+//   %instruction = OldShape instruction(%param.A, %param.B)
+//   %reshape = NewShape reshape(%instruction)
+//
+// Where the instruction must be elementwise, and both reshapes and transposes
+// are moved.
+//
+// Most elementwise instructions support implicit broadcast of scalar operands,
+// but select is a special-case. The signature is Select(Pred, A, B), and the
+// only implicit scalar broadcast is on Pred, not on A or B. Since reshapes or
+// transposes to a scalar should be cheap, we simply never move them.
+
 #include "tensorflow/compiler/xla/service/reshape_mover.h"
 
 #include 
 
+#include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/core/errors.h"
 
 namespace xla {
 
 namespace {
 
+// Checks if an instruction can change its shape simply by adjusting metadata.
+// This is the case if it is:
+//
+// - an instruction that has no producers, such as a Constant or Rng
+//   instruction, or is a scalar.
+//
+// Or
+//
+// - a reshape/transpose instruction with an operand that can trivially change
+//   its shape.
+bool InstructionCanTriviallyChangeShape(const HloInstruction* instruction) {
+  // Reshape/Transposes are only trivial if their operand is trivial.
+  if (instruction->opcode() == HloOpcode::kReshape ||
+      instruction->opcode() == HloOpcode::kTranspose) {
+    CHECK_EQ(instruction->operand_count(), 1);
+    return InstructionCanTriviallyChangeShape(instruction->operand(0));
+  }
+
+  // Scalars can operate with any shape.
+  if (ShapeUtil::IsScalar(instruction->shape())) {
+    return true;
+  }
+
+  // A constant can trivially reshape the literal it holds.
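(Returning briefly to the `Pool` class added above before the reshape-mover code continues below: its test exercises the default factory, and the sketch here shows the custom-factory form together with the RAII return-to-pool behavior. The string payload and the helper function are illustrative.)

```c++
// Sketch only: Allocate() hands out a free object or builds one via the
// factory; the SmartPtr's deleter returns it to the pool rather than
// destroying it.
void PoolSketch() {
  Pool<string> pool([] { return MakeUnique<string>("fresh"); });
  {
    Pool<string>::SmartPtr s = pool.Allocate();  // invokes the factory
    *s = "warm";
  }  // `s` goes back into the pool here, not destroyed
  auto reused = pool.Allocate();  // reuses the pooled object: *reused == "warm"
}
```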
+ if (instruction->opcode() == HloOpcode::kConstant) { + return true; + } + + // An Rng instruction can be any shape as long as it has one user. Two copies + // of the same Rng would be problematic if an Rng of a different shape would + // produce random numbers in a different order. + if (instruction->opcode() == HloOpcode::kRng && + instruction->user_count() == 1) { + return true; + } + return false; +} + +// Finds the first non-scalar operand of an instruction that is a non-trivial +// reshape or transpose. Returns the operand if it is found or nullptr if not +// found. +HloInstruction* FirstNonScalarAndNonTrivialReshapeOperand( + const HloInstruction* hlo) { + for (HloInstruction* operand : hlo->operands()) { + if (!ShapeUtil::IsScalar(operand->shape()) && + ((operand->opcode() == HloOpcode::kReshape || + operand->opcode() == HloOpcode::kTranspose) && + !InstructionCanTriviallyChangeShape(operand->operand(0)))) { + VLOG(5) << "Found first non-scalar and non-trivial reshape operand of " + << hlo->ToStringNoMetadata() << ":\n\t" + << operand->ToStringNoMetadata(); + return operand; + } + } + return nullptr; +} + // Returns whether `a` and `b` are equivalent for the purposes of this pass. bool AreEquivalentReshapes(const HloInstruction* a, const HloInstruction* b) { if (a->opcode() != b->opcode() || @@ -40,82 +122,204 @@ bool AreEquivalentReshapes(const HloInstruction* a, const HloInstruction* b) { } } +// Returns true if an elementwise operation has all operands that can easily +// change shape. Operands can easily change shape if they are all +// reshapes/transposes to and from the same shape. Additionally, operands like +// constant, rng, and any scalar change shape with only an adjustment of +// metadata. bool IsElementwiseOfEquivalentReshapesOrTransposes( const HloInstruction* instruction) { const std::vector& operands = instruction->operands(); - return instruction->IsElementwise() && instruction->operand_count() > 0 && - std::all_of(operands.begin(), operands.end(), - [](const HloInstruction* instruction) { - // We require operand have no other users as otherwise - // this is not a clear win. - return 1 == instruction->users().size(); - }) && - // Check whether each operand beyond the first is equivalent to the - // first. - std::all_of(operands.begin(), operands.end(), - [&operands](const HloInstruction* operand) { - return AreEquivalentReshapes(operands[0], operand); - }); + HloInstruction* first_reshape_operand = + FirstNonScalarAndNonTrivialReshapeOperand(instruction); + // If there are no non-trivial reshapes or transposes, then there is nothing + // to sink below the elementwise operation. + if (!first_reshape_operand) { + return false; + } + VLOG(3) << "** Checking whether instruction is an elementwise operation of " + "equivalent reshapes/transposes: " + << instruction->ToStringNoMetadata(); + bool result = (instruction->user_count() > 0 || + instruction == instruction->parent()->root_instruction()) && + instruction->IsElementwise() && !operands.empty(); + + // Check whether all operands: + // 0. Have the same dimensions as the output -- if not, it may be + // implicitly broadcast, which can confound the movement's + // correctness. + // + // And one of the following: + // 1. Are reshapes or transposes that have the same input and + // output shapes as all other reshaped or transposed operands. + // or + // 2. 
Are one of kConstant, kRng, and scalars that can change shape
+  //      trivially.
+  if (result) {
+    for (auto& operand : operands) {
+      if (!ShapeUtil::SameDimensions(operand->shape(), instruction->shape())) {
+        VLOG(5) << "Operand shape differs from output shape; may be "
+                   "implicitly broadcast, so preventing "
+                   "movement\n\toperand: "
+                << operand->ToStringNoMetadata()
+                << "\n\tinstruction: " << instruction->ToStringNoMetadata();
+        result = false;
+        break;
+      }
+
+      if (AreEquivalentReshapes(first_reshape_operand, operand)) {
+        VLOG(5) << "Are equivalent reshapes:\n\tfirst_reshape_operand: "
+                << first_reshape_operand->ToStringNoMetadata()
+                << "\n\toperand: " << operand->ToStringNoMetadata();
+        continue;
+      }
+
+      if (InstructionCanTriviallyChangeShape(operand)) {
+        VLOG(5) << "Operand can trivially change shape: "
+                << operand->ToStringNoMetadata();
+        continue;
+      }
+
+      // TODO(someone): Look into supporting general ops for the operands as
+      // well.
+      VLOG(5) << "Operand is neither equivalent to the first Reshape operand "
+                 "nor can trivially change shape: "
+              << operand->ToStringNoMetadata();
+      result = false;
+      break;
+    }
+  }
+
+  VLOG(3) << "ElementwiseOfEquivalentReshapesOrTransposes result for "
+          << instruction->ToStringNoMetadata() << ": " << result;
+  return result;
 }
 
 // Try to sink any reshape or transpose operands of `instruction` across it. We
-// do so if `instruction` is elementwise and all operands are equivalent
-// reshapes or transposes.
-bool TrySinkReshapeOrTranspose(HloComputation* computation,
-                               HloInstruction* instruction) {
-  if (IsElementwiseOfEquivalentReshapesOrTransposes(instruction)) {
-    std::vector operands = instruction->operands();
-    auto old_reshape = operands[0];
-    for (size_t i = 0; i < operands.size(); ++i) {
-      operands[i] = operands[i]->mutable_operand(0);
+// do so if `instruction` is elementwise and all operands are either equivalent
+// reshapes/transposes or are trivially reshapable. Note that no move is
+// performed if there are no nontrivial reshapes/transposes.
+StatusOr<bool> TrySinkReshapeOrTranspose(HloComputation* computation,
+                                         HloInstruction* instruction) {
+  if (!IsElementwiseOfEquivalentReshapesOrTransposes(instruction)) {
+    return false;
+  }
+
+  HloInstruction* old_reshape =
+      FirstNonScalarAndNonTrivialReshapeOperand(instruction);
+  TF_RET_CHECK(old_reshape != nullptr);
+  Shape new_elementwise_shape = old_reshape->operand(0)->shape();
+
+  VLOG(3) << "** Trying to sink reshape or transpose: "
+          << instruction->ToStringNoMetadata()
+          << "\n\told reshape: " << old_reshape->ToStringNoMetadata()
+          << "\n\tnew elementwise shape: "
+          << ShapeUtil::HumanString(new_elementwise_shape);
+
+  std::vector<HloInstruction*> operands = instruction->operands();
+  for (size_t i = 0; i < operands.size(); ++i) {
+    // All scalar operands remain as-is, even if they're reshape or transpose,
+    // to simplify handling wrt special scalar broadcast rules for ops like
+    // Select. Scalar reshapes should be cheap anyway.
+    if (ShapeUtil::IsScalar(operands[i]->shape())) {
+      continue;
     }
-    auto new_elementwise =
-        computation->AddInstruction(instruction->CloneWithNewOperands(
-            // `instruction` may change the element type, e.g., from
-            // operands[0] -> reshape -> convert (`instruction`)
-            // to
-            // operands[0] -> convert' -> reshape'
-            //
-            // In this case, convert' should have the same element type as
-            // `convert` and the same dimensions as operands[0].
-            ShapeUtil::MakeShape(
-                instruction->shape().element_type(),
-                AsInt64Slice(operands[0]->shape().dimensions())),
-            operands));
-    std::unique_ptr new_reshape;
-    switch (old_reshape->opcode()) {
-      case HloOpcode::kReshape:
-        new_reshape = HloInstruction::CreateReshape(instruction->shape(),
-                                                    new_elementwise);
+    PrimitiveType element_type = operands[i]->shape().element_type();
+    switch (operands[i]->opcode()) {
+      case HloOpcode::kConstant: {
+        if (old_reshape->opcode() == HloOpcode::kReshape) {
+          VLOG(3) << "Creating reshape for kConstant operand " << i << ": "
+                  << operands[i]->ToStringNoMetadata();
+          operands[i] = instruction->parent()->AddInstruction(
+              HloInstruction::CreateReshape(
+                  ShapeUtil::ChangeElementType(new_elementwise_shape,
+                                               element_type),
+                  operands[i]));
+        } else {
+          TF_RET_CHECK(old_reshape->opcode() == HloOpcode::kTranspose);
+          std::vector<int64> inverse_permutation =
+              InversePermutation(old_reshape->dimensions());
+          operands[i] = instruction->parent()->AddInstruction(
+              HloInstruction::CreateTranspose(
+                  ShapeUtil::ChangeElementType(new_elementwise_shape,
+                                               element_type),
+                  operands[i], inverse_permutation));
+        }
         break;
+      }
+      case HloOpcode::kRng: {
+        CHECK_EQ(operands[i]->user_count(), 1);
+        operands[i] = instruction->parent()->AddInstruction(
+            operands[i]->CloneWithNewOperands(
+                ShapeUtil::ChangeElementType(new_elementwise_shape,
+                                             element_type),
+                operands[i]->operands()));
+        break;
+      }
+      case HloOpcode::kReshape:
       case HloOpcode::kTranspose:
-        new_reshape = HloInstruction::CreateTranspose(
-            instruction->shape(), new_elementwise, old_reshape->dimensions());
+        operands[i] = operands[i]->mutable_operand(0);
         break;
       default:
-        LOG(FATAL) << "Bad opcode";
+        LOG(FATAL) << "Unexpected opcode while trying to sink reshapes or "
+                      "transposes.";
     }
-    TF_CHECK_OK(computation->ReplaceWithNewInstruction(instruction,
-                                                       std::move(new_reshape)));
-    return true;
   }
-  return false;
+  if (HloOpcode::kFusion == instruction->opcode()) {
+    // Here we already know `instruction` is elementwise, and no operand is
+    // implicitly broadcast (if one were, the operands would not be equivalent
+    // reshapes), so all the fused instructions have the same dimensions.
+    for (const auto& fused_instruction : instruction->fused_instructions()) {
+      Shape* shape = fused_instruction->mutable_shape();
+      *shape->mutable_dimensions() = new_elementwise_shape.dimensions();
+      *shape->mutable_layout() = new_elementwise_shape.layout();
+    }
+  }
+  HloInstruction* new_elementwise =
+      computation->AddInstruction(instruction->CloneWithNewOperands(
+          // `instruction` may change the element type, e.g., from
+          // operands[0] -> reshape -> convert (`instruction`)
+          // to
+          // operands[0] -> convert' -> reshape'
+          //
+          // In this case, convert' should have the same element type as
+          // `convert` and the same dimensions as operands[0].
+ ShapeUtil::ChangeElementType(new_elementwise_shape, + instruction->shape().element_type()), + operands)); + + std::unique_ptr new_reshape; + switch (old_reshape->opcode()) { + case HloOpcode::kReshape: + VLOG(3) << "Creating new reshape for new elementwise op: " + << new_elementwise->ToStringNoMetadata(); + new_reshape = + HloInstruction::CreateReshape(instruction->shape(), new_elementwise); + break; + case HloOpcode::kTranspose: + new_reshape = HloInstruction::CreateTranspose( + instruction->shape(), new_elementwise, old_reshape->dimensions()); + break; + default: + LOG(FATAL) << "Bad opcode"; + } + TF_RETURN_IF_ERROR(computation->ReplaceWithNewInstruction( + instruction, std::move(new_reshape))); + return true; } } // namespace StatusOr ReshapeMover::Run(HloModule* module) { - return std::any_of( - module->computations().begin(), module->computations().end(), - [](const std::unique_ptr& computation) { - std::list postorder = - computation->MakeInstructionPostOrder(); - return std::any_of(postorder.begin(), postorder.end(), - [&computation](HloInstruction* instruction) { - return TrySinkReshapeOrTranspose(computation.get(), - instruction); - }); - }); + bool changed = false; + for (const auto& comp : module->computations()) { + for (HloInstruction* instruction : comp->MakeInstructionPostOrder()) { + TF_ASSIGN_OR_RETURN(bool did_change, + TrySinkReshapeOrTranspose(comp.get(), instruction)); + changed |= did_change; + } + } + return changed; } } // namespace xla diff --git a/tensorflow/compiler/xla/service/reshape_mover_test.cc b/tensorflow/compiler/xla/service/reshape_mover_test.cc index 850295c7261..9becdb2bed4 100644 --- a/tensorflow/compiler/xla/service/reshape_mover_test.cc +++ b/tensorflow/compiler/xla/service/reshape_mover_test.cc @@ -20,38 +20,523 @@ limitations under the License. 
#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_matchers.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/strings/str_util.h" +namespace op = xla::testing::opcode_matchers; + namespace xla { namespace { using ReshapeMoverTest = HloTestBase; -TEST_F(ReshapeMoverTest, ReshapesWithNonSameInputShapesNotMoved) { - auto root_shape = ShapeUtil::MakeShape(F32, {8, 7}); +TEST_F(ReshapeMoverTest, ReshapesWithDifferentInputShapesNotMoved) { HloComputation::Builder builder(TestName()); + auto root_shape = ShapeUtil::MakeShape(F32, {8, 7}); auto param0 = builder.AddInstruction(HloInstruction::CreateParameter( 0, ShapeUtil::MakeShape(F32, {1, 8, 1, 7}), "param0")); auto param1 = builder.AddInstruction(HloInstruction::CreateParameter( - 1, ShapeUtil::MakeShape(F32, {1, 8, 7, 1}), "param0")); - auto reshape2 = + 1, ShapeUtil::MakeShape(F32, {1, 8, 7, 1}), "param1")); + auto reshape0 = builder.AddInstruction(HloInstruction::CreateReshape(root_shape, param0)); - auto reshape3 = + auto reshape1 = builder.AddInstruction(HloInstruction::CreateReshape(root_shape, param1)); - auto add4 = builder.AddInstruction(HloInstruction::CreateBinary( - root_shape, HloOpcode::kAdd, reshape2, reshape3)); + builder.AddInstruction(HloInstruction::CreateBinary( + root_shape, HloOpcode::kAdd, reshape0, reshape1)); - auto module = MakeUnique(TestName()); + auto module = CreateNewModule(); auto computation = module->AddEntryComputation(builder.Build()); - EXPECT_EQ(add4, computation->root_instruction()); + + EXPECT_THAT(computation->root_instruction(), + op::Add(op::Reshape(param0), op::Reshape(param1))); + EXPECT_FALSE(ReshapeMover().Run(module.get()).ValueOrDie()); - EXPECT_EQ(add4, computation->root_instruction()); + + EXPECT_THAT(computation->root_instruction(), + op::Add(op::Reshape(param0), op::Reshape(param1))); +} + +// For a graph that looks like: +// +// +- reshape0 - rng0 +// | +// +- const1 +// | +// add +// +// where rng0 has a different shape than reshape0. +// +// Verifies that the reshape is not moved, since rng0 is trivially reshapable +// and therefore there is no nontrivial reshapes to move. 
+TEST_F(ReshapeMoverTest, 1ConstantAnd1ReshapesOnRngNotMoved) { + HloComputation::Builder builder(TestName()); + auto root_shape = ShapeUtil::MakeShape(F32, {8, 7}); + auto rng0 = builder.AddInstruction( + HloInstruction::CreateRng(ShapeUtil::MakeShape(F32, {1, 8, 1, 7, 1}), + RandomDistribution::RNG_UNIFORM, {})); + auto reshape0 = + builder.AddInstruction(HloInstruction::CreateReshape(root_shape, rng0)); + + auto const1 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateFromShape(root_shape))); + + builder.AddInstruction(HloInstruction::CreateBinary( + root_shape, HloOpcode::kAdd, reshape0, const1)); + + auto module = CreateNewModule(); + auto computation = module->AddEntryComputation(builder.Build()); + + EXPECT_THAT(computation->root_instruction(), + op::Add(op::Reshape(rng0), const1)); + + EXPECT_FALSE(ReshapeMover().Run(module.get()).ValueOrDie()); + + EXPECT_THAT(computation->root_instruction(), + op::Add(op::Reshape(rng0), const1)); +} + +TEST_F(ReshapeMoverTest, ScalarReshapesNotMoved) { + HloComputation::Builder builder(TestName()); + auto root_shape = ShapeUtil::MakeShape(F32, {}); + auto param0 = builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShape(F32, {1, 1, 1}), "param0")); + auto param1 = builder.AddInstruction(HloInstruction::CreateParameter( + 1, ShapeUtil::MakeShape(F32, {1, 1, 1}), "param1")); + auto reshape0 = + builder.AddInstruction(HloInstruction::CreateReshape(root_shape, param0)); + auto reshape1 = + builder.AddInstruction(HloInstruction::CreateReshape(root_shape, param1)); + builder.AddInstruction(HloInstruction::CreateBinary( + root_shape, HloOpcode::kAdd, reshape0, reshape1)); + + auto module = CreateNewModule(); + auto computation = module->AddEntryComputation(builder.Build()); + + EXPECT_THAT(computation->root_instruction(), + op::Add(op::Reshape(param0), op::Reshape(param1))); + + EXPECT_FALSE(ReshapeMover().Run(module.get()).ValueOrDie()); + + EXPECT_THAT( + computation->root_instruction(), + op::Add(op::Reshape(op::Parameter()), op::Reshape(op::Parameter()))); +} + +TEST_F(ReshapeMoverTest, EquivalentReshapesMoved) { + HloComputation::Builder builder(TestName()); + auto root_shape = ShapeUtil::MakeShape(F32, {8, 7}); + auto param0 = builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShape(F32, {1, 8, 1, 7}), "param0")); + auto param1 = builder.AddInstruction(HloInstruction::CreateParameter( + 1, ShapeUtil::MakeShape(F32, {1, 8, 1, 7}), "param1")); + auto reshape0 = + builder.AddInstruction(HloInstruction::CreateReshape(root_shape, param0)); + auto reshape1 = + builder.AddInstruction(HloInstruction::CreateReshape(root_shape, param1)); + builder.AddInstruction(HloInstruction::CreateBinary( + root_shape, HloOpcode::kAdd, reshape0, reshape1)); + + auto module = CreateNewModule(); + auto computation = module->AddEntryComputation(builder.Build()); + + EXPECT_THAT(computation->root_instruction(), + op::Add(op::Reshape(param0), op::Reshape(param1))); + EXPECT_TRUE(ReshapeMover().Run(module.get()).ValueOrDie()); + + EXPECT_THAT(computation->root_instruction(), + op::Reshape(op::Add(param0, param1))); + EXPECT_EQ(root_shape.DebugString(), + computation->root_instruction()->shape().DebugString()); +} + +// For a graph that looks like: +// +// +- reshape2 - param2 +// | +// +- reshape1 - param1 +// | +// +- constant0 +// | +// select +// +// Verifies that the reshape1 and reshape2 sink past select: +// +// +- param2 +// | +// +- param1 +// | +// +- reshape3(constant0) +// | +// select +// 
| +// reshape4 +TEST_F(ReshapeMoverTest, 1ConstantAnd2ReshapesMoved) { + HloComputation::Builder builder(TestName()); + auto root_shape = ShapeUtil::MakeShape(F32, {2, 3}); + auto const0 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR2( + {{true, true, false}, {false, false, true}}))); + + auto param1 = builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShape(F32, {1, 3, 1, 2}), "param1")); + auto reshape1 = + builder.AddInstruction(HloInstruction::CreateReshape(root_shape, param1)); + + auto param2 = builder.AddInstruction(HloInstruction::CreateParameter( + 1, ShapeUtil::MakeShape(F32, {1, 3, 1, 2}), "param2")); + auto reshape2 = + builder.AddInstruction(HloInstruction::CreateReshape(root_shape, param2)); + + builder.AddInstruction(HloInstruction::CreateTernary( + ShapeUtil::MakeShape(PRED, {2, 3}), HloOpcode::kSelect, const0, reshape1, + reshape2)); + + auto module = CreateNewModule(); + auto computation = module->AddEntryComputation(builder.Build()); + + EXPECT_THAT(computation->root_instruction(), + op::Select(const0, reshape1, reshape2)); + + EXPECT_TRUE(ReshapeMover().Run(module.get()).ValueOrDie()); + + EXPECT_THAT(computation->root_instruction(), + op::Reshape(op::Select(op::Reshape(const0), param1, param2))); + + EXPECT_EQ(const0->shape().DebugString(), + computation->root_instruction()->shape().DebugString()); +} + +// For a graph that looks like: +// +// +- reshape0 - param0 +// | +// +- param1 +// | +// add +// +// Verifies that the reshape0 does not sink below add, because param1 is not +// trivially reshapable nor is a Reshape/Transpose. +TEST_F(ReshapeMoverTest, 1ParameterAnd1ReshapeNotMoved) { + HloComputation::Builder builder(TestName()); + auto root_shape = ShapeUtil::MakeShape(F32, {8, 7}); + auto param0 = builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShape(F32, {1, 8, 1, 7}), "param0")); + auto reshape0 = + builder.AddInstruction(HloInstruction::CreateReshape(root_shape, param0)); + auto param1 = builder.AddInstruction(HloInstruction::CreateParameter( + 1, ShapeUtil::MakeShape(F32, {1, 8, 1, 7}), "param1")); + builder.AddInstruction(HloInstruction::CreateBinary( + root_shape, HloOpcode::kAdd, reshape0, param1)); + + auto module = CreateNewModule(); + auto computation = module->AddEntryComputation(builder.Build()); + + EXPECT_THAT(computation->root_instruction(), + op::Add(op::Reshape(param0), param1)); + EXPECT_FALSE(ReshapeMover().Run(module.get()).ValueOrDie()); + + EXPECT_THAT(computation->root_instruction(), + op::Add(op::Reshape(param0), param1)); + EXPECT_EQ(root_shape.DebugString(), + computation->root_instruction()->shape().DebugString()); +} + +// For a graph that looks like: +// +// +- pred +// | +// +- reshape0 - const0 +// | +// +- reshape1 - const1 +// | +// select +// +// Verifies that we don't unnecessarily sink reshapes, which are in fact +// trivial reshapes. 
+TEST_F(ReshapeMoverTest, 2TrivialConstantReshapeNotMoved) { + HloComputation::Builder builder(TestName()); + auto root_shape = ShapeUtil::MakeShape(F32, {2, 3}); + auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}}))); + auto reshape0 = + builder.AddInstruction(HloInstruction::CreateReshape(root_shape, const0)); + + auto const1 = builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}}))); + auto reshape1 = + builder.AddInstruction(HloInstruction::CreateReshape(root_shape, const1)); + + auto pred = builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShape(PRED, {1, 3, 1, 2}), "pred")); + + builder.AddInstruction(HloInstruction::CreateTernary( + root_shape, HloOpcode::kSelect, pred, reshape0, reshape1)); + + auto module = CreateNewModule(); + auto computation = module->AddEntryComputation(builder.Build()); + + EXPECT_THAT(computation->root_instruction(), + op::Select(pred, op::Reshape(const0), op::Reshape(const1))); + + EXPECT_FALSE(ReshapeMover().Run(module.get()).ValueOrDie()); + + EXPECT_THAT(computation->root_instruction(), + op::Select(pred, op::Reshape(const0), op::Reshape(const1))); + EXPECT_EQ(root_shape.DebugString(), + computation->root_instruction()->shape().DebugString()); +} + +// For a graph that looks like: +// +// +- reshape0 - param0 +// | +// +- const1 +// | +// add +// +// where there is only 1 non-trivial reshape (reshape0), we sink the reshape +// here for canonicalization benefit: +// +// +- param0 +// | +// +- reshape1 - const1 +// | +// add +// | +// reshape2 +// +// (note that reshape1 here is trivial). +TEST_F(ReshapeMoverTest, 1NonTrivialReshapeMoved) { + HloComputation::Builder builder(TestName()); + auto root_shape = ShapeUtil::MakeShape(F32, {2, 3}); + auto param0 = builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShape(F32, {1, 3, 1, 2}), "param0")); + auto const1 = builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}}))); + auto reshape0 = + builder.AddInstruction(HloInstruction::CreateReshape(root_shape, param0)); + builder.AddInstruction(HloInstruction::CreateBinary( + root_shape, HloOpcode::kAdd, reshape0, const1)); + + auto module = CreateNewModule(); + auto computation = module->AddEntryComputation(builder.Build()); + + EXPECT_THAT(computation->root_instruction(), + op::Add(op::Reshape(param0), const1)); + + EXPECT_TRUE(ReshapeMover().Run(module.get()).ValueOrDie()); + + EXPECT_THAT(computation->root_instruction(), + op::Reshape(op::Add(param0, op::Reshape(const1)))); + EXPECT_EQ(root_shape.DebugString(), + computation->root_instruction()->shape().DebugString()); +} + +TEST_F(ReshapeMoverTest, EquivalentReshapesMovedAcrossFusion) { + HloComputation::Builder builder(TestName()); + auto root_shape = ShapeUtil::MakeShape(F32, {8, 7}); + auto param0 = builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShape(F32, {1, 8, 1, 7}), "param0")); + auto param1 = builder.AddInstruction(HloInstruction::CreateParameter( + 1, ShapeUtil::MakeShape(F32, {1, 8, 1, 7}), "param1")); + auto reshape0 = + builder.AddInstruction(HloInstruction::CreateReshape(root_shape, param0)); + auto reshape1 = + builder.AddInstruction(HloInstruction::CreateReshape(root_shape, param1)); + auto add = builder.AddInstruction(HloInstruction::CreateBinary( + root_shape, HloOpcode::kAdd, reshape0, reshape1)); + + auto module = CreateNewModule(); + auto computation = 
module->AddEntryComputation(builder.Build()); + auto fusion = computation->AddInstruction(HloInstruction::CreateFusion( + add->shape(), HloInstruction::FusionKind::kLoop, add)); + TF_CHECK_OK(computation->ReplaceInstruction(add, fusion)); + + EXPECT_THAT(computation->root_instruction(), + op::Fusion(op::Reshape(param0), op::Reshape(param1))); + + EXPECT_TRUE(ReshapeMover().Run(module.get()).ValueOrDie()); + + EXPECT_THAT(computation->root_instruction(), + op::Reshape(op::Fusion(param0, param1))); + EXPECT_EQ(root_shape.DebugString(), + computation->root_instruction()->shape().DebugString()); +} + +TEST_F(ReshapeMoverTest, EquivalentReshapesMovedAcrossSelect) { + HloComputation::Builder builder(TestName()); + auto root_shape = ShapeUtil::MakeShape(F32, {8, 7}); + auto pred_shape = ShapeUtil::MakeShape(PRED, {8, 7}); + auto param0 = builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShape(F32, {1, 8, 1, 7}), "param0")); + auto param1 = builder.AddInstruction(HloInstruction::CreateParameter( + 1, ShapeUtil::MakeShape(F32, {1, 8, 1, 7}), "param1")); + auto pred = builder.AddInstruction(HloInstruction::CreateParameter( + 2, ShapeUtil::MakeShape(PRED, {1, 8, 1, 7}), "pred")); + auto reshape0 = + builder.AddInstruction(HloInstruction::CreateReshape(root_shape, param0)); + auto reshape1 = + builder.AddInstruction(HloInstruction::CreateReshape(root_shape, param1)); + auto reshape_pred = + builder.AddInstruction(HloInstruction::CreateReshape(pred_shape, pred)); + builder.AddInstruction(HloInstruction::CreateTernary( + root_shape, HloOpcode::kSelect, reshape_pred, reshape0, reshape1)); + + auto module = CreateNewModule(); + auto computation = module->AddEntryComputation(builder.Build()); + + EXPECT_THAT( + computation->root_instruction(), + op::Select(op::Reshape(pred), op::Reshape(param0), op::Reshape(param1))); + + EXPECT_TRUE(ReshapeMover().Run(module.get()).ValueOrDie()); + + EXPECT_THAT(computation->root_instruction(), + op::Reshape(op::Select(pred, param0, param1))); + EXPECT_EQ(root_shape.DebugString(), + computation->root_instruction()->shape().DebugString()); +} + +TEST_F(ReshapeMoverTest, ScalarReshapeNotMovedAcrossSelect) { + HloComputation::Builder builder(TestName()); + auto root_shape = ShapeUtil::MakeShape(F32, {}); + auto pred_shape = ShapeUtil::MakeShape(PRED, {}); + auto param0 = builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShape(F32, {}), "param0")); + auto param1 = builder.AddInstruction(HloInstruction::CreateParameter( + 1, ShapeUtil::MakeShape(F32, {}), "param1")); + auto pred = builder.AddInstruction(HloInstruction::CreateParameter( + 2, ShapeUtil::MakeShape(PRED, {1, 1, 1}), "pred")); + auto reshape_pred = + builder.AddInstruction(HloInstruction::CreateReshape(pred_shape, pred)); + auto select = builder.AddInstruction(HloInstruction::CreateTernary( + root_shape, HloOpcode::kSelect, reshape_pred, param0, param1)); + + auto module = CreateNewModule(); + auto computation = module->AddEntryComputation(builder.Build()); + EXPECT_THAT(computation->root_instruction(), + op::Select(op::Reshape(pred), param0, param1)); + + EXPECT_FALSE(ReshapeMover().Run(module.get()).ValueOrDie()); + + EXPECT_THAT(computation->root_instruction(), + op::Select(op::Reshape(pred), param0, param1)); + EXPECT_EQ(select, computation->root_instruction()); +} + +// Tree looks like: +// +// param0 [1,128,1] +// | +// reshape [128,1] constant [128,1024] +// \ / +// multiply w/implicit broadcast [128,1024] +// +// The reshape mover would like to sink the 
+//
+// Previously we would attempt to insert a reshape of the constant to [1,128,1]
+// (which is unsound, because it has a different number of elements) as
+// preparation for sinking the reshape.
+//
+// To eliminate the unsoundness, we outlaw reshape sinking when one of the
+// operands is implicitly broadcast in the elementwise consumer.
+//
+// TODO(b/37799338) However, it would be possible in this case to do a more
+// in-depth analysis to get reshape movement to occur:
+//
+// 1. Note that the broadcast dimension (logical dimension 1) in the operands
+//    would map back to logical dimension 2 in the param0 node.
+// 2. Match rank of the constant to the param0 node (by prepending a trivial 1
+//    dimension).
+// 3. Reshape to [128,1024] at the root.
+//
+// But this is not currently done.
+TEST_F(ReshapeMoverTest, ImplicitlyBroadcastReshapeIsNotMovedBug37787999) {
+  HloComputation::Builder builder(TestName());
+  auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeShape(F32, {1, 128, 1}), "param0"));
+  auto reshape = builder.AddInstruction(HloInstruction::CreateReshape(
+      ShapeUtil::MakeShape(F32, {128, 1}), param0));
+  Array2D<float> a(128, 1024);
+  auto literal = LiteralUtil::CreateR2FromArray2D<float>(a);
+  auto constant = builder.AddInstruction(
+      HloInstruction::CreateConstant(std::move(literal)));
+  auto multiply = builder.AddInstruction(HloInstruction::CreateBinary(
+      constant->shape(), HloOpcode::kMultiply, constant, reshape));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+  EXPECT_THAT(computation->root_instruction(),
+              op::Multiply(op::Constant(), op::Reshape(param0)));
+
+  EXPECT_FALSE(ReshapeMover().Run(module.get()).ValueOrDie());
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Multiply(op::Constant(), op::Reshape(param0)));
+  EXPECT_EQ(multiply, computation->root_instruction());
+}
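The unsoundness described in the comment above reduces to an element-count invariant: a reshape can only be re-applied to a sibling operand when both operands cover the same number of elements, and an implicitly broadcast operand does not. A standalone sketch of that check, using bare dimension vectors rather than XLA's Shape type (`CanSinkReshape` is an illustrative stand-in, not the ReshapeMover's actual predicate):

```cpp
#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

// Number of elements covered by a dense shape.
int64_t ElementCount(const std::vector<int64_t>& dims) {
  return std::accumulate(dims.begin(), dims.end(), int64_t{1},
                         std::multiplies<int64_t>());
}

// A reshape of `from` can only be re-applied to a sibling operand of shape
// `to` when both describe the same number of elements; an implicitly
// broadcast operand (e.g. [128,1] against [128,1024]) fails this test.
bool CanSinkReshape(const std::vector<int64_t>& from,
                    const std::vector<int64_t>& to) {
  return ElementCount(from) == ElementCount(to);
}

int main() {
  // The case from the test: constant [128,1024] vs. reshape operand [1,128,1].
  std::cout << CanSinkReshape({128, 1024}, {1, 128, 1}) << "\n";  // 0: unsound
  std::cout << CanSinkReshape({1, 8, 1, 7}, {8, 7}) << "\n";      // 1: sound
  return 0;
}
```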
+
+// Tree looks like this:
+//
+// add1
+// |
+// +- reshape2 - param2
+// |
+// +- reshape3 - add0
+//               |
+//               +- reshape0 - param0
+//               |
+//               +- reshape1 - param1
+//
+// We expect reshape{0,1} AND reshape{2,3} to be lifted.
+TEST_F(ReshapeMoverTest, MultiplePasses) {
+  auto shape1 = ShapeUtil::MakeShape(F32, {1, 8, 1, 7});
+  auto shape2 = ShapeUtil::MakeShape(F32, {8, 7, 1});
+  auto shape3 = ShapeUtil::MakeShape(F32, {8, 7});
+  HloComputation::Builder builder(TestName());
+  auto param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, shape1, "param0"));
+  auto param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, shape1, "param1"));
+  auto param2 = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, shape2, "param2"));
+  auto reshape0 =
+      builder.AddInstruction(HloInstruction::CreateReshape(shape2, param0));
+  auto reshape1 =
+      builder.AddInstruction(HloInstruction::CreateReshape(shape2, param1));
+  auto add0 = builder.AddInstruction(HloInstruction::CreateBinary(
+      shape2, HloOpcode::kAdd, reshape0, reshape1));
+  auto reshape2 =
+      builder.AddInstruction(HloInstruction::CreateReshape(shape3, param2));
+  auto reshape3 =
+      builder.AddInstruction(HloInstruction::CreateReshape(shape3, add0));
+  builder.AddInstruction(HloInstruction::CreateBinary(shape3, HloOpcode::kAdd,
+                                                      reshape2, reshape3));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_THAT(
+      computation->root_instruction(),
+      op::Add(op::Reshape(param2),
+              op::Reshape(op::Add(op::Reshape(param0), op::Reshape(param1)))));
+
+  EXPECT_TRUE(ReshapeMover().Run(module.get()).ValueOrDie());
+
+  EXPECT_THAT(
+      computation->root_instruction(),
+      op::Reshape(op::Add(param2, op::Reshape(op::Add(param0, param1)))));
+}
 
 }  // namespace
 }  // namespace xla
+
+int main(int argc, char** argv) {
+  return xla::ParseDebugOptionsFlagsAndRunTests(argc, argv);
+}
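The MultiplePasses test above relies on a single `Run()` call lifting both the inner and the outer layer of reshapes, i.e. the pass keeps rewriting until nothing changes. A minimal sketch of that run-until-fixed-point contract, with a stand-in pass type instead of XLA's HLO pass interface (`CountdownPass` is hypothetical, used only to make the loop observable):

```cpp
#include <iostream>

// Stand-in for a rewriting pass: Run() mutates some state and reports whether
// anything changed, mirroring the bool returned by ReshapeMover().Run(module).
struct CountdownPass {
  int layers_left;
  bool Run() {
    if (layers_left == 0) return false;
    --layers_left;  // "lift" one layer of reshapes
    return true;
  }
};

int main() {
  CountdownPass pass{2};  // two nested layers, as in the MultiplePasses test
  int iterations = 0;
  // Re-run the pass until it reaches a fixed point.
  while (pass.Run()) ++iterations;
  std::cout << "converged after " << iterations << " iterations\n";  // 2
  return 0;
}
```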
#include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/protobuf.h" -#include "tensorflow/core/platform/regexp.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/types.h" namespace se = ::perftools::gputools; +using ::tensorflow::strings::Printf; +using ::tensorflow::strings::StrCat; + namespace xla { namespace { @@ -76,8 +77,10 @@ tensorflow::Status RecordArguments( SessionModule* module) { module->clear_arguments(); for (const Allocation* allocation : arg_allocations) { - TF_RETURN_IF_ERROR(LiteralFromAllocation(allocation, allocation->shape(), - module->add_arguments())); + Literal argument; + TF_RETURN_IF_ERROR( + LiteralFromAllocation(allocation, allocation->shape(), &argument)); + *module->add_arguments() = argument.ToProto(); } return tensorflow::Status::OK(); } @@ -86,8 +89,11 @@ tensorflow::Status RecordArguments( tensorflow::Status RecordResult(const Allocation* result_allocation, SessionModule* module) { module->clear_result(); - return LiteralFromAllocation(result_allocation, result_allocation->shape(), - module->mutable_result()); + Literal result; + TF_RETURN_IF_ERROR(LiteralFromAllocation( + result_allocation, result_allocation->shape(), &result)); + *module->mutable_result() = result.ToProto(); + return tensorflow::Status::OK(); } } // namespace @@ -109,6 +115,16 @@ ServiceOptions& ServiceOptions::set_number_of_replicas(int number_of_replicas) { int ServiceOptions::number_of_replicas() const { return number_of_replicas_; } +ServiceOptions& ServiceOptions::set_intra_op_parallelism_threads( + int num_threads) { + intra_op_parallelism_threads_ = num_threads; + return *this; +} + +int ServiceOptions::intra_op_parallelism_threads() const { + return intra_op_parallelism_threads_; +} + /* static */ StatusOr> Service::NewService( perftools::gputools::Platform* platform) { ServiceOptions default_options; @@ -123,9 +139,10 @@ int ServiceOptions::number_of_replicas() const { return number_of_replicas_; } if (platform == nullptr) { TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform()); } - TF_ASSIGN_OR_RETURN( - execute_backend, - Backend::CreateBackend(platform, options.number_of_replicas())); + BackendOptions backend_options; + backend_options.set_platform(platform); + backend_options.set_number_of_replicas(options.number_of_replicas()); + TF_ASSIGN_OR_RETURN(execute_backend, Backend::CreateBackend(backend_options)); TF_ASSIGN_OR_RETURN(std::unique_ptr compute_constant_backend, CreateComputeConstantBackend()); std::unique_ptr service(new Service( @@ -139,37 +156,18 @@ Service::CreateComputeConstantBackend() { PlatformUtil::GetSupportedPlatforms()); for (auto* platform : platforms) { if (platform->id() == se::host::kHostPlatformId) { - return Backend::CreateBackend(platform, /*replica_count=*/1); + BackendOptions backend_options; + backend_options.set_platform(platform); + backend_options.set_number_of_replicas(1); + return Backend::CreateBackend(backend_options); } } return NotFound("CPU platform not found"); } -/* static */ void Service::DumpExecutedHlo(const HloModule& module, - const string& label, - const HloExecutionProfile* profile) { - VLOG(2) << "module name = " << module.name(); - legacy_flags::ServiceFlags* flags = legacy_flags::GetServiceFlags(); - if (!flags->xla_generate_hlo_graph.empty() && - RE2::PartialMatch(module.name(), flags->xla_generate_hlo_graph)) { - hlo_graph_dumper::DumpGraph(*module.entry_computation(), label, - 
-/* static */ void Service::DumpExecutedHlo(const HloModule& module,
-                                           const string& label,
-                                           const HloExecutionProfile* profile) {
-  VLOG(2) << "module name = " << module.name();
-  legacy_flags::ServiceFlags* flags = legacy_flags::GetServiceFlags();
-  if (!flags->xla_generate_hlo_graph.empty() &&
-      RE2::PartialMatch(module.name(), flags->xla_generate_hlo_graph)) {
-    hlo_graph_dumper::DumpGraph(*module.entry_computation(), label,
-                                flags->xla_hlo_graph_addresses,
-                                flags->xla_hlo_graph_layout, profile);
-  }
-  if (!flags->xla_log_hlo_text.empty() &&
-      RE2::PartialMatch(module.name(), flags->xla_log_hlo_text)) {
-    LOG(INFO) << "HLO for module " << module.name();
-    LOG(INFO) << "Label: " << label;
-    XLA_LOG_LINES(2, module.ToString());
-  }
-  if (!flags->xla_dump_hlo_text_to.empty()) {
-    hlo_graph_dumper::DumpText(module, label, flags->xla_dump_hlo_text_to);
-  }
-}
-
 /* static */ Compiler::HloDumper Service::MakeHloDumper() {
   return [](const HloModule& module, const string& label) {
-    return DumpExecutedHlo(module, label, /*profile=*/nullptr);
+    return Executable::DumpExecutedHlo(module, label, /*profile=*/nullptr);
   };
 }
 
@@ -177,20 +175,24 @@ Service::Service(std::unique_ptr<Backend> execute_backend,
                  std::unique_ptr<Backend> compute_constant_backend)
     : execute_backend_(std::move(execute_backend)),
       compute_constant_backend_(std::move(compute_constant_backend)) {
-  LOG(INFO) << "XLA service executing computations on platform "
-            << execute_backend_->platform()->Name() << ". Devices:";
-  for (int i = 0; i < execute_backend_->device_count(); ++i) {
-    if (execute_backend_->device_ordinal_supported(i)) {
-      se::StreamExecutor* executor =
-          execute_backend_->stream_executor(i).ValueOrDie();
-      const auto& description = executor->GetDeviceDescription();
-      LOG(INFO) << tensorflow::strings::Printf(
-          "  StreamExecutor device (%d): %s, %s", i, description.name().c_str(),
-          description.platform_version().c_str());
-    } else {
-      LOG(INFO) << tensorflow::strings::Printf(
-          "  StreamExecutor device (%d) not supported", i);
+  if (execute_backend_) {
+    LOG(INFO) << Printf(
+        "XLA service %p executing computations on platform %s. Devices:", this,
+        execute_backend_->platform()->Name().c_str());
+    for (int i = 0; i < execute_backend_->device_count(); ++i) {
+      if (execute_backend_->device_ordinal_supported(i)) {
+        se::StreamExecutor* executor =
+            execute_backend_->stream_executor(i).ValueOrDie();
+        const auto& description = executor->GetDeviceDescription();
+        LOG(INFO) << Printf("  StreamExecutor device (%d): %s, %s", i,
+                            description.name().c_str(),
+                            description.platform_version().c_str());
+      } else {
+        LOG(INFO) << Printf("  StreamExecutor device (%d) not supported", i);
+      }
     }
+  } else {
+    VLOG(1) << "XLA compile-only service constructed";
   }
 }
 
@@ -202,6 +204,8 @@ tensorflow::Status Service::Computation(const ComputationRequest* arg,
 
   *result->mutable_computation() =
       computation_tracker_.NewComputation(arg->name());
+  VLOG(1) << Printf("Created new computation %s on service %p",
+                    result->computation().ShortDebugString().c_str(), this);
   return tensorflow::Status::OK();
 }
 
@@ -251,13 +255,12 @@ StatusOr<std::vector<const Allocation*>> Service::ResolveAndValidateArguments(
     tensorflow::gtl::ArraySlice<const GlobalDataHandle*> arguments,
     const Backend* backend, int device_ordinal) {
   std::vector<const Allocation*> allocations;
-  for (int i = 0; i < arguments.size(); ++i) {
+  for (size_t i = 0; i < arguments.size(); ++i) {
     auto allocation_status = allocation_tracker_.Resolve(*arguments[i]);
     if (!allocation_status.ok()) {
       return Status(allocation_status.status().code(),
-                    tensorflow::strings::StrCat(
-                        allocation_status.status().error_message(), ", ",
-                        "failed to resolve allocation for parameter ", i));
+                    StrCat(allocation_status.status().error_message(), ", ",
+                           "failed to resolve allocation for parameter ", i));
     }
     const Allocation* allocation = allocation_status.ValueOrDie();
 
@@ -265,7 +268,7 @@ StatusOr<std::vector<const Allocation*>> Service::ResolveAndValidateArguments(
     if (allocation->backend() != backend ||
         allocation->device_ordinal() != device_ordinal) {
       return InvalidArgument(
-          "argument %d is on device %s but computation will be executed "
+          "argument %lu is on device %s but computation will be executed "
           "on device %s",
           i,
           allocation->backend()
@@ -282,7 +285,7 @@ StatusOr<std::vector<const Allocation*>> Service::ResolveAndValidateArguments(
 
 StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
     const ProgramShape& program_shape,
     tensorflow::gtl::ArraySlice<const Allocation*> arguments,
-    const ExecutionOptions& execution_options) {
+    const ExecutionOptions& execution_options, Backend* backend) {
   auto module_config = MakeUnique<HloModuleConfig>(program_shape);
   auto* computation_layout = module_config->mutable_entry_computation_layout();
 
@@ -291,13 +294,13 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
         program_shape.parameters_size(), arguments.size());
   }
-  for (int i = 0; i < arguments.size(); ++i) {
+  for (size_t i = 0; i < arguments.size(); ++i) {
     // Verify that shape of arguments matches the shape of the arguments in the
     // ProgramShape.
     if (!ShapeUtil::Compatible(arguments[i]->shape(),
                                program_shape.parameters(i))) {
       return InvalidArgument(
-          "computation expects parameter %d to have shape %s, given shape %s",
+          "computation expects parameter %lu to have shape %s, given shape %s",
           i, ShapeUtil::HumanString(program_shape.parameters(i)).c_str(),
           ShapeUtil::HumanString(arguments[i]->shape()).c_str());
     }
@@ -322,9 +325,9 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
     module_config->enable_hlo_profiling(true);
   }
 
-  module_config->set_replica_count(execute_backend_->Replicas().size());
-  module_config->set_fast_math_disabled(execution_options.disable_fast_math());
+  module_config->set_replica_count(backend->Replicas().size());
   module_config->set_seed(execution_options.seed());
+  module_config->set_debug_options(execution_options.debug_options());
 
   return std::move(module_config);
 }
@@ -334,6 +337,8 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
     std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
     Backend* backend,
     std::vector<perftools::gputools::StreamExecutor*> executors) {
+  VLOG(1) << Printf("BuildExecutable on service %p", this);
+
   // Dump computation proto state if flag is set.
   std::vector<std::unique_ptr<SessionModule>> session_modules;
   legacy_flags::ServiceFlags* flags = legacy_flags::GetServiceFlags();
@@ -345,11 +350,10 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
           computation_tracker_.SnapshotComputation(
               versioned_handles[i].handle));
       if (!directory_path.empty()) {
-        string filename =
-            tensorflow::strings::Printf("computation_%lld__%s__version_%lld",
-                                        versioned_handles[i].handle.handle(),
-                                        session_module->entry().name().c_str(),
-                                        versioned_handles[i].version);
+        string filename = Printf("computation_%lld__%s__version_%lld",
+                                 versioned_handles[i].handle.handle(),
+                                 session_module->entry().name().c_str(),
+                                 versioned_handles[i].version);
         TF_RETURN_IF_ERROR(Executable::DumpToDirectory(directory_path, filename,
                                                        *session_module));
         session_modules.push_back(std::move(session_module));
@@ -357,29 +361,31 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
     }
   }
 
-  VLOG(1) << "building executables from:";
+  VLOG(1) << "Computation handles:";
   for (const VersionedComputationHandle& versioned_handle :
        versioned_handles) {
-    VLOG(1) << versioned_handle.handle.handle() << "@v"
-            << versioned_handle.version;
+    VLOG(1) << versioned_handle;
   }
 
+  CHECK_EQ(versioned_handles.size(), module_configs.size());
   std::vector<std::unique_ptr<HloModule>> modules;
-  for (const VersionedComputationHandle& versioned_handle :
-       versioned_handles) {
+  for (int64 i = 0; i < versioned_handles.size(); ++i) {
+    const VersionedComputationHandle& versioned_handle = versioned_handles[i];
+    const HloModuleConfig& config = *module_configs[i];
     TF_ASSIGN_OR_RETURN(auto module, computation_tracker_.BuildHloModule(
-                                         versioned_handle,
-                                         /*include_unused_parameters=*/true));
+                                         versioned_handle, config,
+                                         /*include_unreachable_instructions=*/true));
     modules.push_back(std::move(module));
   }
 
   Compiler::HloDumper hlo_dumper = MakeHloDumper();
-  TF_ASSIGN_OR_RETURN(std::vector<std::unique_ptr<Executable>> executables,
-                      backend->compiler()->Compile(
-                          std::move(modules), std::move(module_configs),
-                          hlo_dumper, std::move(executors)));
+  TF_ASSIGN_OR_RETURN(
+      std::vector<std::unique_ptr<Executable>> executables,
+      backend->compiler()->Compile(std::move(modules), hlo_dumper,
+                                   std::move(executors)));
 
   if (!other_directory_path.empty()) {
-    for (int64 i = 0; i < versioned_handles.size(); ++i) {
+    for (size_t i = 0; i < versioned_handles.size(); ++i) {
       executables[i]->set_session_module(std::move(session_modules[i]));
     }
   }
@@ -394,6 +400,9 @@ StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
     const tensorflow::gtl::ArraySlice<const Allocation*> arguments,
     Backend* backend, se::StreamExecutor* executor) {
+  VLOG(1) << Printf("BuildExecutable on service %p with handle %s", this,
+                    versioned_handle.ToString().c_str());
+
   // Dump computation proto state if flag is set.
   std::unique_ptr<SessionModule> session_module;
   legacy_flags::ServiceFlags* flags = legacy_flags::GetServiceFlags();
@@ -405,24 +414,20 @@ StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
         session_module,
         computation_tracker_.SnapshotComputation(versioned_handle.handle));
     if (!directory_path.empty()) {
-      string filename = tensorflow::strings::Printf(
-          "computation_%lld__%s__version_%lld",
-          versioned_handle.handle.handle(),
-          session_module->entry().name().c_str(), versioned_handle.version);
+      string filename = Printf("computation_%lld__%s__version_%lld",
+                               versioned_handle.handle.handle(),
+                               session_module->entry().name().c_str(),
+                               versioned_handle.version);
       TF_RETURN_IF_ERROR(Executable::DumpToDirectory(directory_path, filename,
                                                      *session_module));
     }
   }
 
-  VLOG(1) << tensorflow::strings::Printf("building executable %lld@v%lld",
-                                         versioned_handle.handle.handle(),
-                                         versioned_handle.version);
-
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<HloModule> module,
-      computation_tracker_.BuildHloModule(
-          versioned_handle,
-          /*include_unused_parameters=*/!executable_for_compute_constant));
+      computation_tracker_.BuildHloModule(versioned_handle, *module_config,
+                                          /*include_unreachable_instructions=*/
+                                          !executable_for_compute_constant));
 
   Compiler::HloDumper hlo_dumper = MakeHloDumper();
   if (executable_for_compute_constant &&
@@ -432,8 +437,7 @@ StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
 
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<Executable> executable,
-      backend->compiler()->Compile(std::move(module), std::move(module_config),
-                                   hlo_dumper, executor));
+      backend->compiler()->Compile(std::move(module), hlo_dumper, executor));
 
   if (!other_directory_path.empty()) {
     executable->set_session_module(std::move(session_module));
@@ -471,7 +475,7 @@ StatusOr<std::shared_ptr<Executable>> Service::BuildAndCacheExecutable(
       std::unique_ptr<Executable> executable_unique_ptr,
       BuildExecutable(versioned_handle, std::move(module_config),
                       /*executable_for_compute_constant=*/false, arguments,
-                      execute_backend_.get(), executor));
+                      backend, executor));
 
   if (profile != nullptr) {
     uint64 end_micros = tensorflow::Env::Default()->NowMicros();
@@ -498,36 +502,30 @@ Service::ExecuteParallelAndRegisterResult(
   TF_RET_CHECK(backend->Replicas().size() == 1);
 
   // Set up streams.
-  std::vector<std::unique_ptr<se::Stream>> streams;
-
-  auto stream_releaser = ::tensorflow::gtl::MakeCleanup([backend, &streams]() {
-    for (std::unique_ptr<se::Stream>& stream : streams) {
-      backend->ReleaseStream(std::move(stream));
-    }
-  });
+  std::vector<Pool<se::Stream>::SmartPtr> streams;
 
   for (se::StreamExecutor* executor : executors) {
-    TF_ASSIGN_OR_RETURN(std::unique_ptr<se::Stream> stream,
-                        backend->AcquireStream(executor));
-    // Push back after so that the releaser only sees real streams.
+    TF_ASSIGN_OR_RETURN(Pool<se::Stream>::SmartPtr stream,
+                        backend->BorrowStream(executor));
    streams.push_back(std::move(stream));
   }
 
   // Set up run options.
-  std::vector<ExecutableRunOptions> run_options;
-  for (const std::unique_ptr<se::Stream>& stream : streams) {
-    run_options.emplace_back();
-    auto& options = run_options.back();
+  std::vector<ServiceExecutableRunOptions> run_options;
+  for (const Pool<se::Stream>::SmartPtr& stream : streams) {
+    ExecutableRunOptions options;
     options.set_stream(stream.get());
     options.set_allocator(backend->memory_allocator());
     options.set_inter_op_thread_pool(backend->inter_op_thread_pool());
     options.set_intra_op_thread_pool(
         backend->eigen_intra_op_thread_pool_device());
+    run_options.emplace_back(options, backend->StreamBorrower());
   }
 
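`BorrowStream` together with `Pool<se::Stream>::SmartPtr` replaces the earlier Acquire/Release pair plus `MakeCleanup`: the borrowed pointer's deleter returns the stream to the pool when it goes out of scope. A self-contained sketch of that borrow/return pattern (a generic pool, not XLA's actual `Pool`; note the pool must outlive every borrowed pointer, since the deleter captures it):

```cpp
#include <functional>
#include <iostream>
#include <memory>
#include <vector>

// Minimal object pool whose Borrow() hands out a smart pointer that returns
// the object to the pool on destruction, instead of destroying it.
template <typename T>
class Pool {
 public:
  using SmartPtr = std::unique_ptr<T, std::function<void(T*)>>;

  SmartPtr Borrow() {
    T* object;
    if (free_list_.empty()) {
      object = new T();  // pool empty: allocate a fresh object
    } else {
      object = free_list_.back().release();
      free_list_.pop_back();
    }
    // The deleter puts the object back rather than deleting it.
    return SmartPtr(object,
                    [this](T* released) { free_list_.emplace_back(released); });
  }

  size_t available() const { return free_list_.size(); }

 private:
  std::vector<std::unique_ptr<T>> free_list_;
};

int main() {
  Pool<int> pool;
  {
    Pool<int>::SmartPtr borrowed = pool.Borrow();
    *borrowed = 42;  // use the pooled object
  }                  // scope exit returns it to the pool automatically
  std::cout << "available after return: " << pool.available() << "\n";  // 1
  return 0;
}
```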
   // Asynchronously launch all executables.
   std::vector<GlobalDataHandle> result_handles;
-  for (int64 i = 0; i < executables.size(); i++) {
+  for (tensorflow::gtl::ArraySlice<Executable*>::size_type i = 0;
+       i < executables.size(); i++) {
     TF_ASSIGN_OR_RETURN(
         perftools::gputools::DeviceMemoryBase result,
         executables[i]->ExecuteAsyncOnStream(&run_options[i], arguments[i]));
@@ -555,52 +553,39 @@ StatusOr<GlobalDataHandle> Service::ExecuteAndRegisterResult(
   TF_RET_CHECK(!backend->Replicas().empty());
 
   // Set up streams.
-  std::vector<std::unique_ptr<se::Stream>> streams;
-
-  auto stream_releaser = ::tensorflow::gtl::MakeCleanup([backend, &streams]() {
-    for (std::unique_ptr<se::Stream>& stream : streams) {
-      backend->ReleaseStream(std::move(stream));
-    }
-  });
+  std::vector<Pool<se::Stream>::SmartPtr> streams;
 
   for (se::StreamExecutor* executor : backend->Replicas()) {
-    TF_ASSIGN_OR_RETURN(std::unique_ptr<se::Stream> stream,
-                        backend->AcquireStream(executor));
-    // Push back after so that the releaser only sees real streams.
+    TF_ASSIGN_OR_RETURN(Pool<se::Stream>::SmartPtr stream,
+                        backend->BorrowStream(executor));
     streams.push_back(std::move(stream));
   }
 
   // Set up run options.
-  std::vector<ExecutableRunOptions> run_options;
-  for (const std::unique_ptr<se::Stream>& stream : streams) {
-    run_options.emplace_back();
-    auto& options = run_options.back();
+  std::vector<ServiceExecutableRunOptions> run_options;
+  for (const Pool<se::Stream>::SmartPtr& stream : streams) {
+    ExecutableRunOptions options;
     options.set_stream(stream.get());
     options.set_allocator(backend->memory_allocator());
     options.set_inter_op_thread_pool(backend->inter_op_thread_pool());
     options.set_intra_op_thread_pool(
         backend->eigen_intra_op_thread_pool_device());
+    run_options.emplace_back(options, backend->StreamBorrower(),
+                             backend->inter_op_thread_pool());
   }
 
   perftools::gputools::DeviceMemoryBase result;
   if (backend->Replicas().size() == 1) {
     TF_ASSIGN_OR_RETURN(
-        result, ExecuteOnStreamWrapper<StatusOr<se::DeviceMemoryBase>>(
-                    executable, &run_options[0], profile,
-                    [&arguments](Executable* executable,
-                                 const ExecutableRunOptions* run_options,
-                                 HloExecutionProfile* hlo_execution_profile) {
-                      return executable->ExecuteOnStream(run_options, arguments,
-                                                         hlo_execution_profile);
-                    }));
+        result,
+        executable->ExecuteOnStreamWrapper<StatusOr<se::DeviceMemoryBase>>(
+            &run_options[0], profile, arguments));
   } else {
     std::vector<
         tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>>
         repeated_arguments(backend->Replicas().size(), arguments);
 
-    TF_ASSIGN_OR_RETURN(
-        auto results,
-        executable->ExecuteOnStreams(run_options, repeated_arguments));
+    TF_ASSIGN_OR_RETURN(auto results, executable->ExecuteOnStreams(
+                                          run_options, repeated_arguments));
     TF_RET_CHECK(!results.empty());
     result = results[0];
   }
@@ -668,6 +653,7 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
         ResolveAndValidateArguments(request.arguments(), execute_backend_.get(),
                                     executor->device_ordinal()));
     std::vector<perftools::gputools::DeviceMemoryBase> arguments;
+    arguments.reserve(arg_allocations.size());
     for (const Allocation* allocation : arg_allocations) {
       arguments.push_back(allocation->device_memory());
     }
 
@@ -676,7 +662,8 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
     // the program and the argument allocations.
     TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
                         CreateModuleConfig(*program_shape, arg_allocations,
-                                           request.execution_options()));
+                                           request.execution_options(),
+                                           execute_backend_.get()));
 
     VLOG(3) << "ExecuteParallel created HloModuleConfig computation layout: "
             << module_config->entry_computation_layout().ToString();
 
@@ -695,6 +682,7 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
       BuildExecutables(versioned_handles, std::move(module_configs),
                        execute_backend_.get(), executors));
   std::vector<Executable*> executable_ptrs;
+  executable_ptrs.reserve(executables.size());
   for (const auto& executable : executables) {
     executable_ptrs.push_back(executable.get());
   }
@@ -761,14 +749,16 @@ tensorflow::Status Service::Execute(const ExecuteRequest* arg,
       ResolveAndValidateArguments(arg->arguments(), execute_backend_.get(),
                                   execute_backend_->default_device_ordinal()));
 
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
-                      CreateModuleConfig(*program_shape, arg_allocations,
-                                         arg->execution_options()));
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<HloModuleConfig> module_config,
+      CreateModuleConfig(*program_shape, arg_allocations,
+                         arg->execution_options(), execute_backend_.get()));
 
   VLOG(3) << "Execute created HloModuleConfig computation layout: "
           << module_config->entry_computation_layout().ToString();
 
   std::vector<perftools::gputools::DeviceMemoryBase> arguments;
+  arguments.reserve(arg_allocations.size());
   for (const Allocation* allocation : arg_allocations) {
     arguments.push_back(allocation->device_memory());
   }
@@ -828,14 +818,16 @@ tensorflow::Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg,
       ResolveAndValidateArguments(arg->arguments(), execute_backend_.get(),
                                   execute_backend_->default_device_ordinal()));
 
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
-                      CreateModuleConfig(*program_shape, arg_allocations,
-                                         arg->execution_options()));
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<HloModuleConfig> module_config,
+      CreateModuleConfig(*program_shape, arg_allocations,
+                         arg->execution_options(), execute_backend_.get()));
 
   VLOG(3) << "ExecuteAsync created HloModuleConfig computation layout: "
           << module_config->entry_computation_layout().ToString();
 
   std::vector<perftools::gputools::DeviceMemoryBase> arguments;
+  arguments.reserve(arg_allocations.size());
   for (const Allocation* allocation : arg_allocations) {
     arguments.push_back(allocation->device_memory());
   }
@@ -851,23 +843,16 @@ tensorflow::Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg,
   TF_RET_CHECK(!execute_backend_->Replicas().empty());
 
   // Set up streams.
-  std::vector<std::unique_ptr<se::Stream>> streams;
-
-  auto stream_releaser = ::tensorflow::gtl::MakeCleanup([this, &streams]() {
-    for (std::unique_ptr<se::Stream>& stream : streams) {
-      execute_backend_->ReleaseStream(std::move(stream));
-    }
-  });
+  std::vector<Pool<se::Stream>::SmartPtr> streams;
 
   for (se::StreamExecutor* executor : execute_backend_->Replicas()) {
-    TF_ASSIGN_OR_RETURN(std::unique_ptr<se::Stream> stream,
-                        execute_backend_->AcquireStream(executor));
-    // Push back after so that the releaser only sees real streams.
+    TF_ASSIGN_OR_RETURN(Pool<se::Stream>::SmartPtr stream,
+                        execute_backend_->BorrowStream(executor));
     streams.push_back(std::move(stream));
   }
 
   perftools::gputools::DeviceMemoryBase result_data;
-  for (const std::unique_ptr<se::Stream>& stream : streams) {
+  for (const Pool<se::Stream>::SmartPtr& stream : streams) {
     ExecutableRunOptions options;
     options.set_stream(stream.get());
     options.set_allocator(execute_backend_->memory_allocator());
@@ -875,8 +860,12 @@ tensorflow::Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg,
     options.set_intra_op_thread_pool(
         execute_backend_->eigen_intra_op_thread_pool_device());
 
-    TF_ASSIGN_OR_RETURN(perftools::gputools::DeviceMemoryBase this_result_data,
-                        executable->ExecuteAsyncOnStream(&options, arguments));
+    ServiceExecutableRunOptions service_options(
+        options, execute_backend_->StreamBorrower());
+
+    TF_ASSIGN_OR_RETURN(
+        perftools::gputools::DeviceMemoryBase this_result_data,
+        executable->ExecuteAsyncOnStream(&service_options, arguments));
 
     // Take the first result.
     if (result_data == nullptr) {
@@ -927,13 +916,15 @@ tensorflow::Status Service::TransferToClient(const TransferToClientRequest* arg,
     literal_shape = &allocation->shape();
   }
 
-  return LiteralFromAllocation(allocation, *literal_shape,
-                               result->mutable_literal());
+  Literal literal;
+  auto status = LiteralFromAllocation(allocation, *literal_shape, &literal);
+  *result->mutable_literal() = literal.ToProto();
+  return status;
 }
 
 tensorflow::Status Service::TransferToServer(const TransferToServerRequest* arg,
                                              TransferToServerResponse* result) {
-  const Literal& literal = arg->literal();
+  Literal literal = Literal(arg->literal());
   const Shape& shape = literal.shape();
 
   if (ShapeUtil::IsTuple(shape) && execute_backend_->Replicas().size() > 1) {
@@ -945,9 +936,8 @@ tensorflow::Status Service::TransferToServer(const TransferToServerRequest* arg,
 
   se::StreamExecutor* stream_executor;
   if (arg->has_device_handle()) {
-    TF_ASSIGN_OR_RETURN(
-        stream_executor,
-        execute_backend_->stream_executor(arg->device_handle().handle()));
+    TF_ASSIGN_OR_RETURN(stream_executor, execute_backend_->stream_executor(
+                                             arg->device_handle().handle()));
   } else {
     stream_executor = execute_backend_->default_stream_executor();
   }
@@ -964,12 +954,10 @@ tensorflow::Status Service::TransferToServer(const TransferToServerRequest* arg,
 
   *result->mutable_data() = allocation_tracker_.Register(
       execute_backend_.get(), stream_executor->device_ordinal(), allocation,
-      shape, tensorflow::strings::StrCat("TransferToServer literal of size ",
-                                         allocation_size));
+      shape, StrCat("TransferToServer literal of size ", allocation_size));
 
-  TF_ASSIGN_OR_RETURN(
-      auto replicas,
-      execute_backend_->Replicas(stream_executor->device_ordinal()));
+  TF_ASSIGN_OR_RETURN(auto replicas, execute_backend_->Replicas(
+                                         stream_executor->device_ordinal()));
   for (se::StreamExecutor* executor : replicas) {
     TF_RETURN_IF_ERROR(
         execute_backend_->transfer_manager()->TransferLiteralToDevice(
@@ -984,25 +972,51 @@ tensorflow::Status Service::TransferToInfeed(const TransferToInfeedRequest* arg,
   if (arg->replica_id() < 0 || arg->replica_id() >= replica_count) {
     return FailedPrecondition(
         "%s",
-        tensorflow::strings::StrCat(
-            "The replica_id=", arg->replica_id(),
-            " on TransferToInfeedRequest not in range [0, replica_count=",
-            replica_count, ").")
+        StrCat("The replica_id=", arg->replica_id(),
+               " on TransferToInfeedRequest not in range [0, replica_count=",
+               replica_count, ").")
             .c_str());
   }
 
   se::StreamExecutor* executor;
   if (arg->has_device_handle()) {
-    TF_ASSIGN_OR_RETURN(
-        auto replicas,
-        execute_backend_->Replicas(arg->device_handle().handle()));
+    TF_ASSIGN_OR_RETURN(auto replicas, execute_backend_->Replicas(
+                                           arg->device_handle().handle()));
     executor = replicas[arg->replica_id()];
   } else {
     executor = execute_backend_->Replicas()[arg->replica_id()];
   }
 
   return execute_backend_->transfer_manager()->TransferLiteralToInfeed(
-      executor, arg->literal());
+      executor, Literal(arg->literal()));
+}
+
+tensorflow::Status Service::TransferFromOutfeed(
+    const TransferFromOutfeedRequest* arg,
+    TransferFromOutfeedResponse* result) {
+  const int64 replica_count = execute_backend_->Replicas().size();
+  if (arg->replica_id() < 0 || arg->replica_id() >= replica_count) {
+    return FailedPrecondition(
+        "The replica_id=%lld on TransferFromOutfeedRequest not in range [0, "
+        "%lld)",
+        arg->replica_id(), replica_count);
+  }
+
+  se::StreamExecutor* executor;
+  if (arg->has_device_handle()) {
+    TF_ASSIGN_OR_RETURN(auto replicas, execute_backend_->Replicas(
+                                           arg->device_handle().handle()));
+    executor = replicas[arg->replica_id()];
+  } else {
+    executor = execute_backend_->Replicas()[arg->replica_id()];
+  }
+
+  Literal literal;
+  TF_RETURN_IF_ERROR(
+      execute_backend_->transfer_manager()->TransferLiteralFromOutfeed(
+          executor, arg->shape_with_layout(), &literal));
+  *result->mutable_literal() = literal.ToProto();
+  return tensorflow::Status::OK();
 }
 
 tensorflow::Status Service::ResetDevice(const ResetDeviceRequest* arg,
@@ -1010,71 +1024,6 @@ tensorflow::Status Service::ResetDevice(const ResetDeviceRequest* arg,
   return execute_backend_->ResetDevices();
 }
 
-tensorflow::Status Service::TransferToClientInProcess(
-    const TransferToClientInProcessRequest* arg,
-    TransferToClientInProcessResponse* result) {
-  TF_RETURN_IF_ERROR(CheckRunsInClientProcess("TransferToClientInProcess"));
-
-  TF_ASSIGN_OR_RETURN(const Allocation* allocation,
-                      allocation_tracker_.Resolve(arg->data()));
-
-  void* buffer = reinterpret_cast<void*>(arg->buffer());
-  int64 size = ShapeUtil::ByteSizeOf(allocation->shape());
-  TF_ASSIGN_OR_RETURN(
-      se::StreamExecutor * executor,
-      allocation->backend()->stream_executor(allocation->device_ordinal()));
-
-  return allocation->backend()->transfer_manager()->TransferBufferFromDevice(
-      executor, allocation->device_memory(), size, buffer);
-}
-
-tensorflow::Status Service::TransferToServerInProcess(
-    const TransferToServerInProcessRequest* arg,
-    TransferToServerInProcessResponse* result) {
-  TF_RETURN_IF_ERROR(CheckRunsInClientProcess("TransferToServerInProcess"));
-
-  const Shape& shape = arg->shape();
-
-  if (ShapeUtil::IsTuple(shape) && execute_backend_->Replicas().size() > 1) {
-    // TODO(b/32990684): Tuple transfers to host end up allocating further
-    // buffers - implement that correctly.
-    return Unimplemented(
-        "Tuple transfers to the device not supported with replication.");
-  }
-
-  if (!LayoutUtil::HasLayout(shape)) {
-    return InvalidArgument("shape must have layout");
-  }
-
-  TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(shape));
-
-  const void* buffer = reinterpret_cast<const void*>(arg->buffer());
-
-  // Allocate memory on the device, using the stream executor. The size of the
-  // allocation is obtained by examining the shape of the literal passed from
-  // the client. An allocation handle is returned in the response.
-  int64 allocation_size =
-      execute_backend_->transfer_manager()->GetByteSizeRequirement(shape);
-
-  se::StreamExecutor* stream_executor =
-      execute_backend_->default_stream_executor();
-
-  TF_ASSIGN_OR_RETURN(se::DeviceMemoryBase allocation,
-                      execute_backend_->memory_allocator()->Allocate(
-                          stream_executor->device_ordinal(), allocation_size));
-
-  *result->mutable_data() = allocation_tracker_.Register(
-      execute_backend_.get(), stream_executor->device_ordinal(), allocation,
-      shape, tensorflow::strings::StrCat("TransferToServer literal of size ",
-                                         allocation_size));
-
-  for (se::StreamExecutor* executor : execute_backend_->Replicas()) {
-    TF_RETURN_IF_ERROR(
-        execute_backend_->transfer_manager()->TransferBufferToDevice(
-            executor, allocation_size, buffer, &allocation));
-  }
-  return tensorflow::Status::OK();
-}
-
 tensorflow::Status Service::IsConstant(const IsConstantRequest* arg,
                                        IsConstantResponse* result) {
   TF_ASSIGN_OR_RETURN(UserComputation * user_computation,
@@ -1123,7 +1072,7 @@ tensorflow::Status Service::ComputeConstant(const ComputeConstantRequest* arg,
   TF_DCHECK_OK(ShapeUtil::ValidateShape(program_shape.result()));
 
   ExecutionOptions execution_options;
-  execution_options.set_disable_fast_math(true);
+  execution_options.mutable_debug_options()->set_xla_enable_fast_math(false);
   *execution_options.mutable_shape_with_output_layout() =
       program_shape.result();
@@ -1136,7 +1085,8 @@ tensorflow::Status Service::ComputeConstant(const ComputeConstantRequest* arg,
   }
 
   TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
-                      CreateModuleConfig(program_shape, {}, execution_options));
+                      CreateModuleConfig(program_shape, {}, execution_options,
+                                         compute_constant_backend_.get()));
 
   TF_ASSIGN_OR_RETURN(
       std::shared_ptr<Executable> executable,
@@ -1172,9 +1122,8 @@ tensorflow::Status Service::GetComputationShape(
   VersionedComputationHandle versioned_handle =
       computation->GetVersionedHandle();
 
-  TF_ASSIGN_OR_RETURN(
-      auto program_shape,
-      computation->ComputeProgramShape(versioned_handle.version));
+  TF_ASSIGN_OR_RETURN(auto program_shape, computation->ComputeProgramShape(
+                                              versioned_handle.version));
   *result->mutable_program_shape() = *program_shape;
   return tensorflow::Status::OK();
 }
@@ -1197,13 +1146,15 @@ tensorflow::Status Service::GetComputationStats(
   VersionedComputationHandle versioned_handle =
       user_computation->GetVersionedHandle();
 
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
-                      computation_tracker_.BuildHloModule(versioned_handle));
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<HloModule> module,
+      computation_tracker_.BuildHloModule(versioned_handle, HloModuleConfig()));
 
   MakeHloDumper()(*module, "computation statistics subject");
 
   // Run HLO analysis to get the computation statistics.
-  HloCostAnalysis analysis;
+  HloCostAnalysis analysis(
+      execute_backend_->compiler()->ShapeSizeBytesFunction());
 
   TF_RETURN_IF_ERROR(
       module->entry_computation()->root_instruction()->Accept(&analysis));
 
@@ -1241,57 +1192,63 @@ tensorflow::Status Service::AddInstruction(
 
 tensorflow::Status Service::Op(const OpRequest* arg, OpResponse* result) {
   TF_ASSIGN_OR_RETURN(UserComputation * computation,
                       computation_tracker_.Resolve(arg->computation()));
-  StatusOr<ComputationDataHandle> handle;
+  StatusOr<ComputationDataHandle> handle_status;
 
   switch (arg->op_case()) {
     case OpRequest::kBinaryOpRequest:
-      handle = computation->AddBinaryInstruction(arg->binary_op_request());
+      handle_status =
+          computation->AddBinaryInstruction(arg->binary_op_request());
       break;
     case OpRequest::kBroadcastRequest:
-      handle = computation->AddBroadcastInstruction(arg->broadcast_request());
+      handle_status =
+          computation->AddBroadcastInstruction(arg->broadcast_request());
      break;
    case OpRequest::kCallRequest: {
      TF_ASSIGN_OR_RETURN(
          UserComputation * to_apply,
          computation_tracker_.Resolve(arg->call_request().to_apply()));
-      handle = computation->AddCallInstruction(arg->call_request(), *to_apply);
+      handle_status =
+          computation->AddCallInstruction(arg->call_request(), *to_apply);
      break;
    }
    case OpRequest::kConcatenateRequest:
-      handle =
+      handle_status =
          computation->AddConcatenateInstruction(arg->concatenate_request());
      break;
    case OpRequest::kConstantRequest:
-      handle = computation->AddConstantInstruction(arg->constant_request());
+      handle_status =
+          computation->AddConstantInstruction(arg->constant_request());
      break;
    case OpRequest::kConvertRequest:
-      handle = computation->AddConvertInstruction(arg->convert_request());
+      handle_status =
+          computation->AddConvertInstruction(arg->convert_request());
      break;
    case OpRequest::kConvolveRequest:
-      handle = computation->AddConvolveInstruction(arg->convolve_request());
+      handle_status =
+          computation->AddConvolveInstruction(arg->convolve_request());
      break;
    case OpRequest::kCrossReplicaSumRequest:
-      handle = computation->AddCrossReplicaSumInstruction(
+      handle_status = computation->AddCrossReplicaSumInstruction(
          arg->cross_replica_sum_request());
      break;
    case OpRequest::kCustomCallRequest:
-      handle =
+      handle_status =
          computation->AddCustomCallInstruction(arg->custom_call_request());
      break;
    case OpRequest::kDynamicSliceRequest:
-      handle =
+      handle_status =
          computation->AddDynamicSliceInstruction(arg->dynamic_slice_request());
      break;
    case OpRequest::kDynamicUpdateSliceRequest:
-      handle = computation->AddDynamicUpdateSliceInstruction(
+      handle_status = computation->AddDynamicUpdateSliceInstruction(
          arg->dynamic_update_slice_request());
      break;
    case OpRequest::kGetTupleElementRequest:
-      handle = computation->AddGetTupleElementInstruction(
+      handle_status = computation->AddGetTupleElementInstruction(
          arg->get_tuple_element_request());
      break;
    case OpRequest::kInfeedRequest:
-      handle = computation->AddInfeedInstruction(arg->infeed_request());
+      handle_status = computation->AddInfeedInstruction(arg->infeed_request());
      break;
    case OpRequest::kOutfeedRequest:
      TF_RETURN_IF_ERROR(
@@ -1301,20 +1258,22 @@ tensorflow::Status Service::Op(const OpRequest* arg, OpResponse* result) {
      TF_ASSIGN_OR_RETURN(
          UserComputation * to_apply,
          computation_tracker_.Resolve(arg->map_request().to_apply()));
-      handle = computation->AddMapInstruction(arg->map_request(), *to_apply);
+      handle_status =
+          computation->AddMapInstruction(arg->map_request(), *to_apply);
      break;
    }
    case OpRequest::kPadRequest:
-      handle = computation->AddPadInstruction(arg->pad_request());
+      handle_status = computation->AddPadInstruction(arg->pad_request());
       break;
     case OpRequest::kParameterRequest:
-      handle = computation->AddParameterInstruction(arg->parameter_request());
+      handle_status =
+          computation->AddParameterInstruction(arg->parameter_request());
       break;
     case OpRequest::kReduceRequest: {
       TF_ASSIGN_OR_RETURN(
           UserComputation * to_apply,
           computation_tracker_.Resolve(arg->reduce_request().to_apply()));
-      handle =
+      handle_status =
           computation->AddReduceInstruction(arg->reduce_request(), *to_apply);
       break;
     }
@@ -1322,18 +1281,20 @@ tensorflow::Status Service::Op(const OpRequest* arg, OpResponse* result) {
       TF_ASSIGN_OR_RETURN(UserComputation * to_apply,
                           computation_tracker_.Resolve(
                               arg->reduce_window_request().to_apply()));
-      handle = computation->AddReduceWindowInstruction(
+      handle_status = computation->AddReduceWindowInstruction(
           arg->reduce_window_request(), *to_apply);
       break;
     }
     case OpRequest::kReshapeRequest:
-      handle = computation->AddReshapeInstruction(arg->reshape_request());
+      handle_status =
+          computation->AddReshapeInstruction(arg->reshape_request());
       break;
     case OpRequest::kReverseRequest:
-      handle = computation->AddReverseInstruction(arg->reverse_request());
+      handle_status =
+          computation->AddReverseInstruction(arg->reverse_request());
      break;
    case OpRequest::kRngRequest:
-      handle = computation->AddRngInstruction(arg->rng_request());
+      handle_status = computation->AddRngInstruction(arg->rng_request());
      break;
    case OpRequest::kSelectAndScatterRequest: {
      TF_ASSIGN_OR_RETURN(UserComputation * select,
@@ -1342,23 +1303,29 @@ tensorflow::Status Service::Op(const OpRequest* arg, OpResponse* result) {
      TF_ASSIGN_OR_RETURN(UserComputation * scatter,
                          computation_tracker_.Resolve(
                              arg->select_and_scatter_request().scatter()));
-      handle = computation->AddSelectAndScatterInstruction(
+      handle_status = computation->AddSelectAndScatterInstruction(
          arg->select_and_scatter_request(), *select, *scatter);
      break;
    }
    case OpRequest::kSliceRequest:
-      handle = computation->AddSliceInstruction(arg->slice_request());
+      handle_status = computation->AddSliceInstruction(arg->slice_request());
      break;
    case OpRequest::kTernaryOpRequest:
-      handle = computation->AddTernaryInstruction(arg->ternary_op_request());
+      handle_status =
+          computation->AddTernaryInstruction(arg->ternary_op_request());
      break;
    case OpRequest::kTraceRequest:
      return computation->AddTraceInstruction(arg->trace_request());
+    case OpRequest::kTransposeRequest:
+      handle_status =
+          computation->AddTransposeInstruction(arg->transpose_request());
+      break;
    case OpRequest::kUnaryOpRequest:
-      handle = computation->AddUnaryInstruction(arg->unary_op_request());
+      handle_status = computation->AddUnaryInstruction(arg->unary_op_request());
      break;
    case OpRequest::kVariadicOpRequest:
-      handle = computation->AddVariadicInstruction(arg->variadic_op_request());
+      handle_status =
+          computation->AddVariadicInstruction(arg->variadic_op_request());
      break;
    case OpRequest::kWhileRequest: {
      TF_ASSIGN_OR_RETURN(
@@ -1367,8 +1334,8 @@ tensorflow::Status Service::Op(const OpRequest* arg, OpResponse* result) {
      TF_ASSIGN_OR_RETURN(
          UserComputation * body,
          computation_tracker_.Resolve(arg->while_request().body()));
-      handle = computation->AddWhileInstruction(arg->while_request(),
-                                                *condition, *body);
+      handle_status = computation->AddWhileInstruction(arg->while_request(),
+                                                       *condition, *body);
      break;
    }
    case OpRequest::kSendRequest: {
@@ -1380,13 +1347,19 @@ tensorflow::Status Service::Op(const OpRequest* arg, OpResponse* result) {
    case OpRequest::kRecvRequest: {
       TF_RETURN_IF_ERROR(
           channel_tracker_.RegisterRecv(arg->recv_request().channel_handle()));
-      handle = computation->AddRecvInstruction(arg->recv_request());
+      handle_status = computation->AddRecvInstruction(arg->recv_request());
       break;
     }
     default:
       return InvalidArgument("Unsupported operation");
   }
-  TF_ASSIGN_OR_RETURN(*result->mutable_output(), handle);
+  TF_ASSIGN_OR_RETURN(*result->mutable_output(), handle_status);
+
+  // We set the debug metadata here, because we slice off part of the OpRequest
+  // proto in the above switch statement.
+  TF_ASSIGN_OR_RETURN(ComputationDataHandle handle, handle_status);
+  TF_RETURN_IF_ERROR(computation->SetOpMetadata(handle, arg->metadata()));
+
   return tensorflow::Status::OK();
 }
 
diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h
index 9c4b0f44c82..abd1281bdd0 100644
--- a/tensorflow/compiler/xla/service/service.h
+++ b/tensorflow/compiler/xla/service/service.h
@@ -63,9 +63,14 @@ class ServiceOptions {
   ServiceOptions& set_number_of_replicas(int number_of_replicas);
   int number_of_replicas() const;
 
+  // Sets the thread pool size for parallel execution of an individual
+  // operator.
+  ServiceOptions& set_intra_op_parallelism_threads(int num_threads);
+  int intra_op_parallelism_threads() const;
+
  private:
   perftools::gputools::Platform* platform_ = nullptr;
   int number_of_replicas_ = -1;
+  int intra_op_parallelism_threads_ = -1;
 };
 
 // The XLA service object, which is the same across all
@@ -146,11 +151,6 @@ class Service : public ServiceInterface {
       const TransferToClientRequest* arg,
       TransferToClientResponse* result) override;
 
-  // Requests that global data be copied into a buffer supplied by the client.
-  tensorflow::Status TransferToClientInProcess(
-      const TransferToClientInProcessRequest* arg,
-      TransferToClientInProcessResponse* result) override;
-
   // Transfers data from a literal provided by the client, into device memory.
   tensorflow::Status TransferToServer(
       const TransferToServerRequest* arg,
@@ -162,6 +162,12 @@ class Service : public ServiceInterface {
       const TransferToInfeedRequest* arg,
       TransferToInfeedResponse* result) override;
 
+  // Transfers data from the Outfeed of the device to the literal provided by
+  // the client.
+  tensorflow::Status TransferFromOutfeed(
+      const TransferFromOutfeedRequest* arg,
+      TransferFromOutfeedResponse* result) override;
+
   // Resets devices, clearing all existing state on all the devices associated
   // with this service (including memory allocated on the devices).
   //
@@ -174,11 +180,6 @@ class Service : public ServiceInterface {
   tensorflow::Status ResetDevice(const ResetDeviceRequest* arg,
                                  ResetDeviceResponse* result) override;
 
-  // Transfers data from a buffer provided by the client, into device memory.
-  tensorflow::Status TransferToServerInProcess(
-      const TransferToServerInProcessRequest* arg,
-      TransferToServerInProcessResponse* result) override;
-
   // Tests if an expression is a compile-time constant.
   tensorflow::Status IsConstant(const IsConstantRequest* arg,
                                 IsConstantResponse* result) override;
@@ -243,6 +244,8 @@ class Service : public ServiceInterface {
   Backend* mutable_backend() { return execute_backend_.get(); }
 
  protected:
+  friend class LocalExecutable;
+
   // The constructor is private. Use the NewService factory to create new
   // service objects.
   Service(std::unique_ptr<Backend> backend,
@@ -257,11 +260,11 @@ class Service : public ServiceInterface {
       tensorflow::gtl::ArraySlice<const GlobalDataHandle*> arguments,
       const Backend* backend, int device_ordinal);
 
-  // Create a Hlo module config foe the given program shape and arguments.
+  // Create a Hlo module config for the given program shape and arguments.
   StatusOr<std::unique_ptr<HloModuleConfig>> CreateModuleConfig(
       const ProgramShape& program_shape,
       tensorflow::gtl::ArraySlice<const Allocation*> arguments,
-      const ExecutionOptions& execution_options);
+      const ExecutionOptions& execution_options, Backend* backend);
 
   // Builds an Executable for the given parameters. If
   // executable_for_compute_constant is true, then the executable is intended to
@@ -320,10 +323,6 @@ class Service : public ServiceInterface {
           executors,
       tensorflow::gtl::ArraySlice<string> result_tags);
 
-  // Dumps the executed HLO according to service-associated flags.
-  static void DumpExecutedHlo(const HloModule& module, const string& label,
-                              const HloExecutionProfile* profile);
-
   // Returns an HLO dumper for use in the compiler (it refers to flags
   // associated with the service).
   static Compiler::HloDumper MakeHloDumper();
@@ -347,21 +346,6 @@ class Service : public ServiceInterface {
   tensorflow::Status ValidateResultShapeWithLayout(
       const Shape& shape_with_layout, const Shape& result_shape) const;
 
-  // Convenience wrapper for calling Executable::ExecuteOnStream. Sets up a
-  // timer for the execution, sets up HLO profiling if enabled, and fills in the
-  // given ExecutionProfile if non-null. The given execute_func should be a
-  // function which calls the desired ExecuteOnStream overload with the supplied
-  // arguments. The ExecuteOnStream overloads return different types so this
-  // method is templated on return-type of the execute function.
-  template <typename ReturnT>
-  ReturnT ExecuteOnStreamWrapper(
-      Executable* executable, const ExecutableRunOptions* run_options,
-      ExecutionProfile* profile,
-      std::function<ReturnT(Executable*, const ExecutableRunOptions*,
-                            HloExecutionProfile*)>
-          execute_func);
-
   // Tracks computations built via the API.
   ComputationTracker computation_tracker_;
 
@@ -391,73 +375,6 @@ class Service : public ServiceInterface {
   TF_DISALLOW_COPY_AND_ASSIGN(Service);
 };
 
-template <typename ReturnT>
-ReturnT Service::ExecuteOnStreamWrapper(
-    Executable* executable, const ExecutableRunOptions* run_options,
-    ExecutionProfile* profile,
-    std::function<ReturnT(Executable*, const ExecutableRunOptions*,
-                          HloExecutionProfile*)>
-        execute_func) {
-  perftools::gputools::Stream* stream = run_options->stream();
-  std::unique_ptr<perftools::gputools::Timer> timer;
-  if (profile != nullptr) {
-    timer.reset(new perftools::gputools::Timer(stream->parent()));
-    stream->InitTimer(timer.get()).ThenStartTimer(timer.get());
-  }
-
-  VLOG(1) << "enqueueing executable on stream...";
-  // If the profiling flag isn't enabled, we pass nullptr as the profile to
-  // indicate profiling is not requested.
-  HloExecutionProfile hlo_execution_profile;
-  legacy_flags::ServiceFlags* flags = legacy_flags::GetServiceFlags();
-  HloExecutionProfile* profile_ptr =
-      flags->xla_hlo_profile && executable->hlo_profiling_enabled()
-          ? &hlo_execution_profile
-          : nullptr;
-
-  auto return_value = execute_func(executable, run_options, profile_ptr);
-
-  if (profile != nullptr) {
-    VLOG(1) << "enqueueing 'stop timer' and blocking host until done...";
-    stream->ThenStopTimer(timer.get()).BlockHostUntilDone();
-    VLOG(1) << "done with block-host-until-done";
-
-    // Merge in run time profile information from the executable.
-    profile->MergeFrom(executable->execution_profile());
-
-    // Overall execution time (in nanoseconds) from the executor timer.
-    profile->set_compute_and_transfer_time_ns(timer->Nanoseconds());
-
-    // TODO(b/28123297): On GPU we end up including transfer time in
-    // the compute time this way. Instead, we should get the correct
-    // value by measuring it. Setting the field here at least lets
-    // benchmarks provide *some* value for GPU computations.
-    //
-    // TODO(b/28447609): The value in compute_and_transfer_time_ns is actually
-    // the compute time without the transfer time, so this way we get the
-    // correct compute time. We should instead have the correct value for
-    // compute_and_transfer_time and set compute_time to the compute time.
-    if (profile->compute_time_ns() == 0) {
-      profile->set_compute_time_ns(profile->compute_and_transfer_time_ns());
-    }
-  }
-
-  if (profile_ptr != nullptr) {
-    HloCostAnalysis analysis;
-    tensorflow::Status analysis_status =
-        executable->module().entry_computation()->root_instruction()->Accept(
-            &analysis);
-    if (analysis_status.ok()) {
-      XLA_LOG_LINES(tensorflow::INFO,
-                    profile_ptr->ToString(
-                        stream->parent()->GetDeviceDescription(), analysis));
-    }
-    DumpExecutedHlo(executable->module(), "Service::Execute", profile_ptr);
-  }
-
-  return return_value;
-}
 
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_SERVICE_H_
diff --git a/tensorflow/compiler/xla/service/service_executable_run_options.h b/tensorflow/compiler/xla/service/service_executable_run_options.h
new file mode 100644
index 00000000000..017e5ef09ed
--- /dev/null
+++ b/tensorflow/compiler/xla/service/service_executable_run_options.h
@@ -0,0 +1,71 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_SERVICE_EXECUTABLE_RUN_OPTIONS_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_SERVICE_EXECUTABLE_RUN_OPTIONS_H_
+
+#include "tensorflow/compiler/xla/executable_run_options.h"
+#include "tensorflow/compiler/xla/service/pool.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/stream_executor/stream_executor.h"
+
+namespace xla {
+
+// Class containing options for running a LocalExecutable and other auxiliary
+// data, now only a stream cache for GPU backend.
+class ServiceExecutableRunOptions {
+ public:
+  using StreamBorrower =
+      std::function<StatusOr<Pool<perftools::gputools::Stream>::SmartPtr>(int)>;
+
+  explicit ServiceExecutableRunOptions(
+      ExecutableRunOptions run_options, StreamBorrower borrow_stream = nullptr,
+      tensorflow::thread::ThreadPool* xla_intra_op_thread_pool = nullptr)
+      : run_options_(std::move(run_options)),
+        borrow_stream_(std::move(borrow_stream)),
+        xla_intra_op_thread_pool_(xla_intra_op_thread_pool) {}
+
+  // Returns reference or pointer to `ExecutableRunOptions` member.
+  const ExecutableRunOptions& run_options() const { return run_options_; }
+  ExecutableRunOptions* mutable_run_options() { return &run_options_; }
+
+  // Delegate to `ExecutableRunOptions` member.
+  perftools::gputools::Stream* stream() const { return run_options_.stream(); }
+  DeviceMemoryAllocator* allocator() const { return run_options_.allocator(); }
+  int device_ordinal() const { return run_options_.device_ordinal(); }
+
+  // Borrows a stream and returns a smart pointer which returns the stream on
+  // destruction.
+  StatusOr<Pool<perftools::gputools::Stream>::SmartPtr> BorrowStream(
+      int device_ordinal) const {
+    return borrow_stream_
+               ? borrow_stream_(device_ordinal)
+               : Status(tensorflow::error::UNIMPLEMENTED, "No stream cache");
+  }
+
+  // Returns reference to thread pool for execution of XLA ops on CPU backend.
+  tensorflow::thread::ThreadPool* xla_intra_op_thread_pool() const {
+    return xla_intra_op_thread_pool_;
+  }
+
+ private:
+  ExecutableRunOptions run_options_;
+  StreamBorrower borrow_stream_;
+  tensorflow::thread::ThreadPool* xla_intra_op_thread_pool_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_SERVICE_EXECUTABLE_RUN_OPTIONS_H_
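The `BorrowStream` accessor above illustrates the pattern this header is built around: an optional capability is carried as a possibly-empty `std::function`, and the accessor degrades to an error rather than crashing when the capability was not supplied. A standalone sketch of the same idea, with a simplified result struct standing in for XLA's `StatusOr` and `Pool` types:

```cpp
#include <functional>
#include <iostream>
#include <string>

// Tiny stand-in for StatusOr<T>: either a value or an error message.
struct BorrowResult {
  int stream_id = -1;
  std::string error;
  bool ok() const { return error.empty(); }
};

// Mirrors the ServiceExecutableRunOptions::BorrowStream pattern: the
// capability may be absent, and the accessor returns an error in that case.
class RunOptions {
 public:
  using StreamBorrower = std::function<BorrowResult(int device_ordinal)>;

  explicit RunOptions(StreamBorrower borrow_stream = nullptr)
      : borrow_stream_(std::move(borrow_stream)) {}

  BorrowResult BorrowStream(int device_ordinal) const {
    return borrow_stream_ ? borrow_stream_(device_ordinal)
                          : BorrowResult{-1, "No stream cache"};
  }

 private:
  StreamBorrower borrow_stream_;
};

int main() {
  RunOptions without_cache;
  std::cout << without_cache.BorrowStream(0).error << "\n";  // No stream cache

  RunOptions with_cache([](int ordinal) { return BorrowResult{ordinal}; });
  std::cout << with_cache.BorrowStream(3).stream_id << "\n";  // 3
  return 0;
}
```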
diff --git a/tensorflow/compiler/xla/service/session.proto b/tensorflow/compiler/xla/service/session.proto
index fa4aa7b0a5f..bb8d1cd2a10 100644
--- a/tensorflow/compiler/xla/service/session.proto
+++ b/tensorflow/compiler/xla/service/session.proto
@@ -57,15 +57,6 @@ message SessionComputation {
   // Map from ComputationDataHandle value to operation request. The highest
   // ComputationDataHandle value corresponds to the root of the computation.
   map<int64, OperationRequest> requests = 3;
-
-  // The list of Trace requests in this SessionComputation.
-  repeated TraceRequest trace_requests = 4;
-
-  // The list of Send requests in this SessionComputation.
-  repeated SendRequest send_requests = 5;
-
-  // The list of Outfeed requests in this SessionComputation.
-  repeated OutfeedRequest outfeed_requests = 6;
 }
 
 // Describes a group of SessionComputations with an "entry point" computation
@@ -84,10 +75,10 @@ message SessionModule {
   repeated SessionComputation embedded_computations = 2;
 
   // The arguments passed to the computation.
-  repeated Literal arguments = 3;
+  repeated LiteralProto arguments = 3;
 
   // The result of the computation.
-  Literal result = 4;
+  LiteralProto result = 4;
 
   // The name of the platform used to run the computation.
   string execution_platform = 5;
diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc
index fbab2dfd4af..d6436cf988d 100644
--- a/tensorflow/compiler/xla/service/shape_inference.cc
+++ b/tensorflow/compiler/xla/service/shape_inference.cc
@@ -208,6 +208,16 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
             PrimitiveType_Name(arg.element_type()).c_str());
       }
       return arg;
+
+    case UNOP_IS_FINITE:
+      if (!ShapeUtil::ElementIsFloating(arg)) {
+        return InvalidArgument(
+            "expected element type in shape to be floating point for IsFinite "
+            "operation; got %s",
+            PrimitiveType_Name(arg.element_type()).c_str());
+      }
+      return ShapeUtil::ChangeElementType(arg, PRED);
+
     default:
       return InvalidArgument("unknown operation %s",
                              UnaryOperation_Name(operation).c_str());
@@ -217,7 +227,7 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
 /* static */ StatusOr<Shape> ShapeInference::InferConcatOpShape(
     tensorflow::gtl::ArraySlice<const Shape*> arg_shapes,
     const int64 dimension) {
-  if (arg_shapes.size() == 0) {
+  if (arg_shapes.empty()) {
     return InvalidArgument("Concatenate expects at least one argument");
   }
   if (dimension < 0 || dimension >= ShapeUtil::Rank(*arg_shapes[0])) {
@@ -234,8 +244,11 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
     }
     if (ShapeUtil::Rank(*arg_shape) != ShapeUtil::Rank(*shape)) {
       return InvalidArgument(
-          "cannot concatenate arrays with different ranks: %lld vs %lld",
-          ShapeUtil::Rank(*arg_shape), ShapeUtil::Rank(*shape));
+          "Cannot concatenate arrays with different ranks: %lld (%s) vs %lld "
+          "(%s)",
+          ShapeUtil::Rank(*arg_shape),
+          ShapeUtil::HumanString(*arg_shape).c_str(), ShapeUtil::Rank(*shape),
+          ShapeUtil::HumanString(*shape).c_str());
     }
     if (arg_shape->element_type() != shape->element_type()) {
       return InvalidArgument(
@@ -299,6 +312,10 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
     return InvalidArgument(
         "the rank of the operand and the padding configuration do not match.");
   }
+  if (operand_shape.element_type() != padding_value_shape.element_type()) {
+    return InvalidArgument(
+        "the element types of the operands to pad do not match");
+  }
   std::vector<int64> dimensions(ShapeUtil::Rank(operand_shape));
   for (int64 i = 0; i < operand_shape.dimensions_size(); ++i) {
     dimensions[i] = operand_shape.dimensions(i) +
@@ -328,7 +345,7 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
 
   // Check if both element types are the same.
   if (lhs.element_type() != rhs.element_type()) {
-    return fail("element types mismatch");
+    return fail("element types do not match");
   }
 
   if (ShapeUtil::Rank(lhs) < 1 || ShapeUtil::Rank(lhs) > 2 ||
@@ -530,7 +547,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     return InferDegenerateDimensionBroadcastShape(operation, lhs, rhs);
   } else {
     // Ranks do not match, so perform InDim broadcasting using
-    // broadcast_dimensions. Scalar broadcasting is a special case of this).
+    // broadcast_dimensions. Scalar broadcasting is a special case of this.
     const Shape& larger_shape =
         ShapeUtil::Rank(lhs) > ShapeUtil::Rank(rhs) ? lhs : rhs;
     const Shape& smaller_shape =
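The InDim broadcast mentioned in the comment above maps each dimension `i` of the lower-rank operand to dimension `broadcast_dimensions[i]` of the higher-rank operand, and the mapped extents have to agree. A rough standalone sketch of that compatibility rule (illustrative only, not XLA's actual inference code, which splits the degenerate size-1 case into a separate path):

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Checks that the lower-rank shape `smaller` can be InDim-broadcast into
// `larger`: dimension i of `smaller` maps to dimension
// broadcast_dimensions[i] of `larger`, and the extents must match.
bool InDimBroadcastCompatible(
    const std::vector<int64_t>& smaller, const std::vector<int64_t>& larger,
    const std::vector<int64_t>& broadcast_dimensions) {
  if (broadcast_dimensions.size() != smaller.size()) return false;
  for (size_t i = 0; i < smaller.size(); ++i) {
    int64_t dim = broadcast_dimensions[i];
    if (dim < 0 || dim >= static_cast<int64_t>(larger.size())) return false;
    if (smaller[i] != larger[dim]) return false;
  }
  return true;
}

int main() {
  // [3] maps into dimension 1 of [2,3,4].
  std::cout << InDimBroadcastCompatible({3}, {2, 3, 4}, {1}) << "\n";  // 1
  // Scalar broadcasting: an empty dimension list is trivially compatible.
  std::cout << InDimBroadcastCompatible({}, {2, 3, 4}, {}) << "\n";    // 1
  // Mismatched extent.
  std::cout << InDimBroadcastCompatible({5}, {2, 3, 4}, {1}) << "\n";  // 0
  return 0;
}
```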
lhs : rhs; const Shape& smaller_shape = @@ -623,26 +640,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( TF_DCHECK_OK(ShapeUtil::ValidateShape(ehs)); switch (operation) { case TRIOP_CLAMP: - TF_RETURN_IF_ERROR( - ExpectNotTupleOrOpaque(lhs, "lhs of ternary operation")); - TF_RETURN_IF_ERROR( - ExpectNotTupleOrOpaque(rhs, "rhs of ternary operation")); - TF_RETURN_IF_ERROR( - ExpectNotTupleOrOpaque(ehs, "ehs of ternary operation")); - if (((ShapeUtil::Compatible(lhs, rhs) || ShapeUtil::Rank(lhs) == 0) && - (ShapeUtil::Compatible(rhs, ehs) || ShapeUtil::Rank(ehs) == 0))) { - return rhs; - } - if (ShapeUtil::Rank(rhs) == 0) { - if (ShapeUtil::Compatible(lhs, ehs)) { - return lhs; - } - return ShapeUtil::Rank(ehs) == 0 ? lhs : ehs; - } - return Unimplemented("not yet implemented: %s, %s %s", - lhs.ShortDebugString().c_str(), - ehs.ShortDebugString().c_str(), - rhs.ShortDebugString().c_str()); + return InferClampShape(lhs, rhs, ehs); case TRIOP_SELECT: return InferSelectShape(lhs, rhs, ehs); case TRIOP_UPDATE: @@ -681,7 +679,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( /* static */ StatusOr ShapeInference::InferMapShape( tensorflow::gtl::ArraySlice arg_shapes, const ProgramShape& to_apply) { - if (arg_shapes.size() == 0) { + if (arg_shapes.empty()) { return InvalidArgument("Map expects at least one argument"); } @@ -1007,7 +1005,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( /* static */ StatusOr ShapeInference::InferSliceShape( const Shape& arg, tensorflow::gtl::ArraySlice starts, - tensorflow::gtl::ArraySlice limits) { + tensorflow::gtl::ArraySlice limits, + tensorflow::gtl::ArraySlice strides) { TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(arg, "operand of slice")); VLOG(2) << tensorflow::strings::Printf( "slicing shape %s starts={%s} limits={%s}", @@ -1030,13 +1029,13 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( for (int64 dimension = 0; dimension < starts.size(); ++dimension) { int64 start_index = starts[dimension]; int64 limit_index = limits[dimension]; + int64 stride = strides[dimension]; if (start_index < 0) { return InvalidArgument("negative start index to slice: %lld", start_index); } - if (limit_index < 0) { - return InvalidArgument("negative limit index to slice: %lld", - limit_index); + if (stride == 0) { + return InvalidArgument("Zero stride"); } if (limit_index > arg.dimensions(dimension)) { return InvalidArgument( @@ -1044,18 +1043,21 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( "size (%lld)", limit_index, arg.dimensions(dimension)); } - if (start_index > limit_index) { - return InvalidArgument( - "limit index (%lld) must be greater or equal to " - "start index (%lld) in slice", - limit_index, start_index); - } VLOG(2) << tensorflow::strings::Printf("starts[%lld] = %lld", dimension, start_index); VLOG(2) << tensorflow::strings::Printf("limits[%lld] = %lld", dimension, limit_index); - - sizes.push_back(limits[dimension] - starts[dimension]); + if (stride > 0) { + if (start_index > limit_index) { + return InvalidArgument( + "limit index (%lld) must be greater or equal to " + "start index (%lld) in slice with positive stride", + limit_index, start_index); + } + sizes.push_back((limit_index - start_index + stride - 1) / stride); + } else { + return InvalidArgument("Negative strides not supported"); + } } return ShapeUtil::MakeShape(arg.element_type(), sizes); @@ -1089,9 +1091,11 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( const int64 start_num_dims = start_indices_shape.dimensions(0); if 
(ShapeUtil::Rank(operand_shape) != start_num_dims) { return InvalidArgument( - "dynamic slice start number of dimensions %lld must match rank %lld of " - "slice input", - start_num_dims, ShapeUtil::Rank(operand_shape)); + "dynamic slice start number of dimensions %lld (%s) must match rank " + "%lld of slice input (%s)", + start_num_dims, ShapeUtil::HumanString(start_indices_shape).c_str(), + ShapeUtil::Rank(operand_shape), + ShapeUtil::HumanString(operand_shape).c_str()); } if (slice_sizes.size() != ShapeUtil::Rank(operand_shape)) { @@ -1103,7 +1107,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( for (int64 dim = 0; dim < slice_sizes.size(); ++dim) { const int64 input_dim_size = operand_shape.dimensions(dim); const int64 slice_dim_size = slice_sizes[dim]; - if (slice_dim_size <= 0) { + if (slice_dim_size < 0) { return InvalidArgument("negative size index to dynamic slice: %lld", slice_dim_size); } @@ -1150,9 +1154,11 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( const int64 start_num_dims = start_indices_shape.dimensions(0); if (ShapeUtil::Rank(operand_shape) != start_num_dims) { return InvalidArgument( - "dynamic update slice start number of dimensions %lld must match " - "rank %lld of slice input", - start_num_dims, ShapeUtil::Rank(operand_shape)); + "dynamic slice start number of dimensions %lld (%s) must match rank " + "%lld of slice input (%s)", + start_num_dims, ShapeUtil::HumanString(start_indices_shape).c_str(), + ShapeUtil::Rank(operand_shape), + ShapeUtil::HumanString(operand_shape).c_str()); } if (ShapeUtil::Rank(update_shape) != ShapeUtil::Rank(operand_shape)) { @@ -1173,9 +1179,9 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( for (int64 dim = 0; dim < ShapeUtil::Rank(operand_shape); ++dim) { const int64 input_dim_size = operand_shape.dimensions(dim); const int64 update_dim_size = update_shape.dimensions(dim); - if (update_dim_size <= 0) { + if (update_dim_size < 0) { return InvalidArgument( - "size index %lld to dynamic update slice must be > 0", + "size index %lld to dynamic update slice must be >= 0", update_dim_size); } if (update_dim_size > input_dim_size) { @@ -1322,6 +1328,41 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( return ShapeUtil::PermuteDimensions(InversePermutation(dimensions), operand); } +// TODO(b/36794510): Make broadcast semantics more consistent, by supporting +// "degenerate" cases, as with binary elementwise ops. 
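+//
+// Illustrative summary (ours, not part of the change itself) of the rules the
+// function below implements: Clamp keeps the operand's shape when every
+// non-scalar input is compatible with it, so Clamp(f32[], f32[64x48],
+// f32[64x48]) -> f32[64x48]; all three inputs must share an element type, so
+// Clamp(s32[], f32[64x48], f32[64x48]) is rejected.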
+/* static */ StatusOr ShapeInference::InferClampShape( + const Shape& min, const Shape& operand, const Shape& max) { + TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(min, "clamp min")); + TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(operand, "clamp operand")); + TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(max, "clamp max")); + if (!ShapeUtil::SameElementType(min, operand) || + !ShapeUtil::SameElementType(max, operand)) { + return InvalidArgument("clamp op with different operand types: %s, %s, %s", + ShapeUtil::HumanString(min).c_str(), + ShapeUtil::HumanString(operand).c_str(), + ShapeUtil::HumanString(max).c_str()); + } + if (((ShapeUtil::Compatible(min, operand) || ShapeUtil::IsScalar(min)) && + (ShapeUtil::Compatible(max, operand) || ShapeUtil::IsScalar(max)))) { + return operand; + } + if (ShapeUtil::IsScalar(operand)) { + if (ShapeUtil::Compatible(min, max)) { + return min; + } else if (ShapeUtil::IsScalar(min)) { + return max; + } else if (ShapeUtil::IsScalar(max)) { + return min; + } + } + return Unimplemented( + "not yet implemented: %s, %s %s", min.ShortDebugString().c_str(), + max.ShortDebugString().c_str(), operand.ShortDebugString().c_str()); +} + +// TODO(b/36794510): Make broadcast semantics more consistent, by supporting +// "degenerate" cases, as with binary elementwise ops, as well as scalar +// broadcast from all operands, not just the predicate. /* static */ StatusOr ShapeInference::InferSelectShape( const Shape& pred, const Shape& on_true, const Shape& on_false) { if (!ShapeUtil::Compatible(on_true, on_false)) { diff --git a/tensorflow/compiler/xla/service/shape_inference.h b/tensorflow/compiler/xla/service/shape_inference.h index ced2f4d0017..0d270f99794 100644 --- a/tensorflow/compiler/xla/service/shape_inference.h +++ b/tensorflow/compiler/xla/service/shape_inference.h @@ -109,7 +109,8 @@ class ShapeInference { // e.g. slice f32[32x32] 0:16 0:16 -> f32[16x16] static StatusOr InferSliceShape( const Shape& arg, tensorflow::gtl::ArraySlice starts, - tensorflow::gtl::ArraySlice limits); + tensorflow::gtl::ArraySlice limits, + tensorflow::gtl::ArraySlice strides); // Infers the shape produced by a dynamic slice operation of size specified // in 'slice_sizes', with dynamic start indices shape 'start_indices_shape'. @@ -190,6 +191,10 @@ class ShapeInference { BinaryOperation operation, const Shape& lhs, const Shape& rhs, tensorflow::gtl::ArraySlice broadcast_dimensions); + // Helper for inferring the shape of Clamp ops. + static StatusOr InferClampShape(const Shape& min, const Shape& operand, + const Shape& max); + // Helper for inferring the shape of Select ops. static StatusOr InferSelectShape(const Shape& pred, const Shape& on_true, diff --git a/tensorflow/compiler/xla/service/shape_inference_test.cc b/tensorflow/compiler/xla/service/shape_inference_test.cc index 5a1ae6b0024..8c731ae2976 100644 --- a/tensorflow/compiler/xla/service/shape_inference_test.cc +++ b/tensorflow/compiler/xla/service/shape_inference_test.cc @@ -20,12 +20,16 @@ limitations under the License. #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/types.h" namespace xla { namespace { +using ::testing::ContainsRegex; +using ::testing::HasSubstr; + class ShapeInferenceTest : public ::testing::Test { protected: // Some handy scalar shapes. 
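The strided-slice change above computes each output dimension with a ceiling division. As a quick sanity check of that arithmetic, here is a minimal, standalone sketch; `StridedSliceDim` is our illustrative helper, not an XLA function, and it returns -1 where `InferSliceShape` would return `InvalidArgument`:

```c++
#include <cstdint>
#include <iostream>

// Illustrative helper (not part of XLA): output extent of one sliced
// dimension, mirroring the check-and-divide logic in InferSliceShape.
int64_t StridedSliceDim(int64_t start, int64_t limit, int64_t stride,
                        int64_t dim_size) {
  if (start < 0 || limit > dim_size || stride <= 0 || start > limit) {
    return -1;  // InferSliceShape returns InvalidArgument in these cases.
  }
  // Ceiling division: the count of indices start, start+stride, ...
  // that fall in the half-open interval [start, limit).
  return (limit - start + stride - 1) / stride;
}

int main() {
  std::cout << StridedSliceDim(32, 64, 2, 128) << "\n";  // 16
  std::cout << StridedSliceDim(0, 64, 4, 64) << "\n";    // 16
  std::cout << StridedSliceDim(15, 20, 2, 128) << "\n";  // 3
}
```

The printed values match the expectations in the `InferSliceShapeRank2WithStrides` and `InferSliceShapeRank2WithStridesNotIntegral` tests below.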
@@ -128,23 +132,21 @@ TEST_F(ShapeInferenceTest, SelectBadShapes) { auto inferred_status_error1 = ShapeInference::InferTernaryOpShape( TernaryOperation::TRIOP_SELECT, pred_, matrix_64_48_, matrix_32_64_); ASSERT_FALSE(inferred_status_error1.ok()); - ASSERT_MATCH( - inferred_status_error1.status().error_message(), - testing::ContainsRegex("operands to select must be the same shape")); + ASSERT_THAT(inferred_status_error1.status().error_message(), + HasSubstr("operands to select must be the same shape")); auto inferred_status_error2 = ShapeInference::InferTernaryOpShape( TernaryOperation::TRIOP_SELECT, s32_, matrix_64_48_, matrix_64_48_); ASSERT_FALSE(inferred_status_error2.ok()); - ASSERT_MATCH(inferred_status_error2.status().error_message(), - testing::ContainsRegex("pred operand must have PRED")); + ASSERT_THAT(inferred_status_error2.status().error_message(), + HasSubstr("pred operand must have PRED")); auto inferred_status_error3 = ShapeInference::InferTernaryOpShape( TernaryOperation::TRIOP_SELECT, ShapeUtil::MakeShape(PRED, {64}), matrix_64_48_, matrix_64_48_); ASSERT_FALSE(inferred_status_error3.ok()); - ASSERT_MATCH( - inferred_status_error3.status().error_message(), - testing::ContainsRegex("with non-scalar predicate with dimensionality")); + ASSERT_THAT(inferred_status_error3.status().error_message(), + HasSubstr("with non-scalar predicate with dimensionality")); // Tuples have a TUPLE element type and cannot be the pred of a select. auto inferred_status_error4 = ShapeInference::InferTernaryOpShape( @@ -152,9 +154,101 @@ TEST_F(ShapeInferenceTest, SelectBadShapes) { ShapeUtil::MakeTupleShape({f32_, f32_}), ShapeUtil::MakeTupleShape({f32_, f32_})); ASSERT_FALSE(inferred_status_error4.ok()); - ASSERT_MATCH( - inferred_status_error4.status().error_message(), - testing::ContainsRegex("pred operand must have PRED element type")); + ASSERT_THAT(inferred_status_error4.status().error_message(), + HasSubstr("pred operand must have PRED element type")); +} + +TEST_F(ShapeInferenceTest, ClampAllMatrix) { + auto inferred_status = ShapeInference::InferTernaryOpShape( + TernaryOperation::TRIOP_CLAMP, matrix_64_48_, matrix_64_48_, + matrix_64_48_); + ASSERT_IS_OK(inferred_status.status()); + ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, inferred_status.ValueOrDie())); +} + +TEST_F(ShapeInferenceTest, ClampAllScalar) { + auto inferred_status = ShapeInference::InferTernaryOpShape( + TernaryOperation::TRIOP_CLAMP, f32_, f32_, f32_); + ASSERT_IS_OK(inferred_status.status()); + ASSERT_TRUE(ShapeUtil::Equal(f32_, inferred_status.ValueOrDie())); +} + +TEST_F(ShapeInferenceTest, ClampMinScalar) { + auto inferred_status = ShapeInference::InferTernaryOpShape( + TernaryOperation::TRIOP_CLAMP, f32_, matrix_64_48_, matrix_64_48_); + ASSERT_IS_OK(inferred_status.status()); + ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, inferred_status.ValueOrDie())); +} + +TEST_F(ShapeInferenceTest, ClampMaxScalar) { + auto inferred_status = ShapeInference::InferTernaryOpShape( + TernaryOperation::TRIOP_CLAMP, matrix_64_48_, matrix_64_48_, f32_); + ASSERT_IS_OK(inferred_status.status()); + ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, inferred_status.ValueOrDie())); +} + +TEST_F(ShapeInferenceTest, ClampOperandScalar) { + auto inferred_status = ShapeInference::InferTernaryOpShape( + TernaryOperation::TRIOP_CLAMP, matrix_64_48_, f32_, matrix_64_48_); + ASSERT_IS_OK(inferred_status.status()); + ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, inferred_status.ValueOrDie())); +} + +TEST_F(ShapeInferenceTest, ClampMinMatrix) { + auto 
inferred_status = ShapeInference::InferTernaryOpShape( + TernaryOperation::TRIOP_CLAMP, matrix_64_48_, f32_, f32_); + ASSERT_IS_OK(inferred_status.status()); + ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, inferred_status.ValueOrDie())); +} + +TEST_F(ShapeInferenceTest, ClampMaxMatrix) { + auto inferred_status = ShapeInference::InferTernaryOpShape( + TernaryOperation::TRIOP_CLAMP, f32_, f32_, matrix_64_48_); + ASSERT_IS_OK(inferred_status.status()); + ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, inferred_status.ValueOrDie())); +} + +TEST_F(ShapeInferenceTest, ClampOperandMatrix) { + auto inferred_status = ShapeInference::InferTernaryOpShape( + TernaryOperation::TRIOP_CLAMP, f32_, matrix_64_48_, f32_); + ASSERT_IS_OK(inferred_status.status()); + ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, inferred_status.ValueOrDie())); +} + +TEST_F(ShapeInferenceTest, ClampBadShapes) { + // Type mismatch + ASSERT_FALSE(ShapeInference::InferTernaryOpShape( + TernaryOperation::TRIOP_CLAMP, s32_, f32_, f32_) + .ok()); + ASSERT_FALSE(ShapeInference::InferTernaryOpShape( + TernaryOperation::TRIOP_CLAMP, f32_, s32_, f32_) + .ok()); + ASSERT_FALSE(ShapeInference::InferTernaryOpShape( + TernaryOperation::TRIOP_CLAMP, f32_, f32_, s32_) + .ok()); + // Dimension mismatch + ASSERT_FALSE( + ShapeInference::InferTernaryOpShape(TernaryOperation::TRIOP_CLAMP, + vector_64_, vector_32_, vector_32_) + .ok()); + ASSERT_FALSE( + ShapeInference::InferTernaryOpShape(TernaryOperation::TRIOP_CLAMP, + vector_32_, vector_64_, vector_32_) + .ok()); + ASSERT_FALSE( + ShapeInference::InferTernaryOpShape(TernaryOperation::TRIOP_CLAMP, + vector_32_, vector_32_, vector_64_) + .ok()); + // Dimension mismatch, where one operand is a scalar + ASSERT_FALSE(ShapeInference::InferTernaryOpShape( + TernaryOperation::TRIOP_CLAMP, vector_64_, vector_32_, f32_) + .ok()); + ASSERT_FALSE(ShapeInference::InferTernaryOpShape( + TernaryOperation::TRIOP_CLAMP, vector_64_, f32_, vector_32_) + .ok()); + ASSERT_FALSE(ShapeInference::InferTernaryOpShape( + TernaryOperation::TRIOP_CLAMP, f32_, vector_64_, vector_32_) + .ok()); } TEST_F(ShapeInferenceTest, VariadicOpTuplify) { @@ -205,8 +299,8 @@ TEST_F(SelectAndScatterShapeInferenceTest, SelectAndScatterWrongSourceShape) { operand_shape_, select_program_shape_, window_, source_shape_fail, init_value_shape_, scatter_program_shape_); ASSERT_FALSE(inferred_status_fail.ok()); - ASSERT_MATCH(inferred_status_fail.status().error_message(), - testing::ContainsRegex("source shape does not match")); + ASSERT_THAT(inferred_status_fail.status().error_message(), + HasSubstr("source shape does not match")); } TEST_F(SelectAndScatterShapeInferenceTest, SelectAndScatterWrongSelectShape1) { @@ -216,9 +310,8 @@ TEST_F(SelectAndScatterShapeInferenceTest, SelectAndScatterWrongSelectShape1) { operand_shape_, select_program_shape_fail, window_, source_shape_, init_value_shape_, scatter_program_shape_); ASSERT_FALSE(inferred_status_fail.ok()); - ASSERT_MATCH( - inferred_status_fail.status().error_message(), - testing::ContainsRegex("select function must take 2 parameters")); + ASSERT_THAT(inferred_status_fail.status().error_message(), + HasSubstr("select function must take 2 parameters")); } TEST_F(SelectAndScatterShapeInferenceTest, SelectAndScatterWrongSelectShape2) { @@ -228,8 +321,8 @@ TEST_F(SelectAndScatterShapeInferenceTest, SelectAndScatterWrongSelectShape2) { operand_shape_, select_program_shape_fail, window_, source_shape_, init_value_shape_, scatter_program_shape_); ASSERT_FALSE(inferred_status_fail.ok()); - 
ASSERT_MATCH(inferred_status_fail.status().error_message(), - testing::ContainsRegex("select function must have rank-0 PRED")); + ASSERT_THAT(inferred_status_fail.status().error_message(), + HasSubstr("select function must have rank-0 PRED")); } TEST_F(SelectAndScatterShapeInferenceTest, SelectAndScatterWrongSelectShape3) { @@ -239,8 +332,8 @@ TEST_F(SelectAndScatterShapeInferenceTest, SelectAndScatterWrongSelectShape3) { operand_shape_, select_program_shape_fail, window_, source_shape_, init_value_shape_, scatter_program_shape_); ASSERT_FALSE(inferred_status_fail.ok()); - ASSERT_MATCH(inferred_status_fail.status().error_message(), - testing::ContainsRegex("select function's first parameter")); + ASSERT_THAT(inferred_status_fail.status().error_message(), + HasSubstr("select function's first parameter")); } TEST_F(SelectAndScatterShapeInferenceTest, SelectAndScatterWrongSelectShape4) { @@ -250,8 +343,8 @@ TEST_F(SelectAndScatterShapeInferenceTest, SelectAndScatterWrongSelectShape4) { operand_shape_, select_program_shape_fail, window_, source_shape_, init_value_shape_, scatter_program_shape_); ASSERT_FALSE(inferred_status_fail.ok()); - ASSERT_MATCH(inferred_status_fail.status().error_message(), - testing::ContainsRegex("select function's second parameter")); + ASSERT_THAT(inferred_status_fail.status().error_message(), + HasSubstr("select function's second parameter")); } TEST_F(ShapeInferenceTest, Convolve) { @@ -405,8 +498,8 @@ TEST_F(ShapeInferenceTest, ConvolveDimensionNumbersOverlapError) { auto inferred_status = ShapeInference::InferConvolveShape(lhs_shape, rhs_shape, window, dnums); ASSERT_FALSE(inferred_status.ok()); - ASSERT_MATCH(inferred_status.status().error_message(), - testing::ContainsRegex("each dimension exactly once")); + ASSERT_THAT(inferred_status.status().error_message(), + HasSubstr("each dimension exactly once")); } TEST_F(ShapeInferenceTest, MapThatChangesElementType) { @@ -443,43 +536,42 @@ TEST_F(ShapeInferenceTest, Map) { auto no_args_error = ShapeInference::InferMapShape( {}, ShapeUtil::MakeProgramShape({f32_, f32_}, f32_)); ASSERT_FALSE(no_args_error.ok()); - ASSERT_MATCH(no_args_error.status().error_message(), - testing::ContainsRegex("expects at least one argument")); + ASSERT_THAT(no_args_error.status().error_message(), + HasSubstr("expects at least one argument")); auto args_diff_shapes_error = ShapeInference::InferMapShape( {&vector_32_, &vector_64_}, ShapeUtil::MakeProgramShape({f32_, f32_}, f32_)); ASSERT_FALSE(args_diff_shapes_error.ok()); - ASSERT_MATCH( - args_diff_shapes_error.status().error_message(), - testing::ContainsRegex("requires all operands to have the same shape")); + ASSERT_THAT(args_diff_shapes_error.status().error_message(), + HasSubstr("requires all operands to have the same shape")); auto arity_error = ShapeInference::InferMapShape( {&vector_32_, &vector_32_}, ShapeUtil::MakeProgramShape({f32_}, f32_)); ASSERT_FALSE(arity_error.ok()); - ASSERT_MATCH(arity_error.status().error_message(), - testing::ContainsRegex("function arity must match")); + ASSERT_THAT(arity_error.status().error_message(), + HasSubstr("function arity must match")); auto output_shape_error = ShapeInference::InferMapShape( {&vector_32_, &vector_32_}, ShapeUtil::MakeProgramShape({f32_, f32_}, vector_32_)); ASSERT_FALSE(output_shape_error.ok()); - ASSERT_MATCH(output_shape_error.status().error_message(), - testing::ContainsRegex("result has to be a scalar")); + ASSERT_THAT(output_shape_error.status().error_message(), + HasSubstr("result has to be a scalar")); auto 
param_shape_error = ShapeInference::InferMapShape( {&vector_32_, &vector_32_}, ShapeUtil::MakeProgramShape({vector_32_, f32_}, f32_)); ASSERT_FALSE(param_shape_error.ok()); - ASSERT_MATCH(param_shape_error.status().error_message(), - testing::ContainsRegex("parameter has to be a scalar")); + ASSERT_THAT(param_shape_error.status().error_message(), + HasSubstr("parameter has to be a scalar")); auto param_element_type_error = ShapeInference::InferMapShape( {&vector_32_, &vector_32_}, ShapeUtil::MakeProgramShape({f32_, s32_}, f32_)); ASSERT_FALSE(param_element_type_error.ok()); - ASSERT_MATCH(param_element_type_error.status().error_message(), - testing::ContainsRegex("parameter type has to match argument")); + ASSERT_THAT(param_element_type_error.status().error_message(), + HasSubstr("parameter type has to match argument")); Shape arg = ShapeUtil::MakeShape(F32, {20}); ProgramShape to_apply = ShapeUtil::MakeProgramShape({f32_}, f32_); @@ -490,26 +582,26 @@ TEST_F(ShapeInferenceTest, Map) { auto inferred_status_error1 = ShapeInference::InferMapShape( {&arg}, ShapeUtil::MakeProgramShape({f32_, f32_}, f32_)); ASSERT_FALSE(inferred_status_error1.ok()); - ASSERT_MATCH(inferred_status_error1.status().error_message(), - testing::ContainsRegex("arity must match number of arguments")); + ASSERT_THAT(inferred_status_error1.status().error_message(), + HasSubstr("arity must match number of arguments")); auto inferred_status_error2 = ShapeInference::InferMapShape( {&arg}, ShapeUtil::MakeProgramShape({vector_32_}, f32_)); ASSERT_FALSE(inferred_status_error2.ok()); - ASSERT_MATCH(inferred_status_error2.status().error_message(), - testing::ContainsRegex("has to be a scalar")); + ASSERT_THAT(inferred_status_error2.status().error_message(), + HasSubstr("has to be a scalar")); auto inferred_status_error3 = ShapeInference::InferMapShape( {&arg}, ShapeUtil::MakeProgramShape({f32_}, vector_32_)); ASSERT_FALSE(inferred_status_error3.ok()); - ASSERT_MATCH(inferred_status_error3.status().error_message(), - testing::ContainsRegex("has to be a scalar")); + ASSERT_THAT(inferred_status_error3.status().error_message(), + HasSubstr("has to be a scalar")); auto inferred_status_error5 = ShapeInference::InferMapShape( {&arg}, ShapeUtil::MakeProgramShape({s32_}, s32_)); ASSERT_FALSE(inferred_status_error5.ok()); - ASSERT_MATCH(inferred_status_error5.status().error_message(), - testing::ContainsRegex("parameter type has to match argument")); + ASSERT_THAT(inferred_status_error5.status().error_message(), + HasSubstr("parameter type has to match argument")); } TEST_F(ReduceShapeInferenceTest, ReduceVectorToScalar) { @@ -563,8 +655,8 @@ TEST_F(ReduceShapeInferenceTest, ErrorOutOfBoundsDimension) { ShapeUtil::MakeShape(F32, {5, 3}), f32_, /*dimensions_to_reduce=*/{3, 4}, to_apply); EXPECT_FALSE(inferred_status.ok()); - EXPECT_MATCH(inferred_status.status().error_message(), - testing::ContainsRegex("out-of-bounds dimension")); + EXPECT_THAT(inferred_status.status().error_message(), + HasSubstr("out-of-bounds dimension")); } TEST_F(ReduceShapeInferenceTest, ErrorToApplyArity) { @@ -573,8 +665,8 @@ TEST_F(ReduceShapeInferenceTest, ErrorToApplyArity) { ShapeInference::InferReduceShape(ShapeUtil::MakeShape(F32, {5, 3}), f32_, /*dimensions_to_reduce=*/{0}, to_apply); EXPECT_FALSE(inferred_status.ok()); - EXPECT_MATCH(inferred_status.status().error_message(), - testing::ContainsRegex("take 2 parameters")); + EXPECT_THAT(inferred_status.status().error_message(), + HasSubstr("take 2 parameters")); } TEST_F(ReduceShapeInferenceTest, 
ErrorElementTypeVsApplyType) { @@ -583,23 +675,50 @@ TEST_F(ReduceShapeInferenceTest, ErrorElementTypeVsApplyType) { ShapeInference::InferReduceShape(ShapeUtil::MakeShape(F32, {5, 3}), f32_, /*dimensions_to_reduce=*/{0}, to_apply); EXPECT_FALSE(inferred_status.ok()); - EXPECT_MATCH(inferred_status.status().error_message(), - testing::ContainsRegex("first parameter shape differs")); + EXPECT_THAT(inferred_status.status().error_message(), + HasSubstr("first parameter shape differs")); } TEST_F(ShapeInferenceTest, InferSliceShapeRank2) { Shape matrix_shape = ShapeUtil::MakeShape(F32, {128, 64}); auto inferred_status = - ShapeInference::InferSliceShape(matrix_shape, {32, 0}, {64, 64}); + ShapeInference::InferSliceShape(matrix_shape, {32, 0}, {64, 64}, {1, 1}); ASSERT_IS_OK(inferred_status.status()); Shape inferred = inferred_status.ValueOrDie(); ASSERT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {32, 64}), inferred)); } +TEST_F(ShapeInferenceTest, InferSliceShapeRank2WithStrides) { + Shape matrix_shape = ShapeUtil::MakeShape(F32, {128, 64}); + auto inferred_status = + ShapeInference::InferSliceShape(matrix_shape, {32, 0}, {64, 64}, {2, 4}); + ASSERT_IS_OK(inferred_status.status()); + Shape inferred = inferred_status.ValueOrDie(); + ASSERT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {16, 16}), inferred)); +} + +TEST_F(ShapeInferenceTest, InferSliceShapeRank2WithStridesNotIntegral) { + Shape matrix_shape = ShapeUtil::MakeShape(F32, {128, 64}); + auto inferred_status = + ShapeInference::InferSliceShape(matrix_shape, {15, 0}, {20, 13}, {2, 4}); + ASSERT_IS_OK(inferred_status.status()); + Shape inferred = inferred_status.ValueOrDie(); + ASSERT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {3, 4}), inferred)); +} + +TEST_F(ShapeInferenceTest, InferInvalidStride) { + Shape matrix_shape = ShapeUtil::MakeShape(F32, {128, 64}); + auto inferred_status = + ShapeInference::InferSliceShape(matrix_shape, {127, 0}, {129, 2}, {0, 1}); + ASSERT_FALSE(inferred_status.ok()); + ASSERT_EQ(tensorflow::error::INVALID_ARGUMENT, + inferred_status.status().code()); +} + TEST_F(ShapeInferenceTest, InferOobSliceShapeRank2) { Shape matrix_shape = ShapeUtil::MakeShape(F32, {128, 64}); auto inferred_status = - ShapeInference::InferSliceShape(matrix_shape, {127, 0}, {129, 2}); + ShapeInference::InferSliceShape(matrix_shape, {127, 0}, {129, 2}, {1, 1}); ASSERT_FALSE(inferred_status.ok()); ASSERT_EQ(tensorflow::error::INVALID_ARGUMENT, inferred_status.status().code()); @@ -608,7 +727,7 @@ TEST_F(ShapeInferenceTest, InferOobSliceShapeRank2) { TEST_F(ShapeInferenceTest, InferSliceShapeRank1) { Shape vector_shape = ShapeUtil::MakeShape(F32, {17}); auto inferred_status = - ShapeInference::InferSliceShape(vector_shape, {2}, {4}); + ShapeInference::InferSliceShape(vector_shape, {2}, {4}, {1}); ASSERT_TRUE(inferred_status.ok()); Shape inferred = inferred_status.ValueOrDie(); ASSERT_TRUE(ShapeUtil::Equal(inferred, ShapeUtil::MakeShape(F32, {2}))); @@ -726,8 +845,8 @@ TEST_F(ShapeInferenceTest, ScalarDotVector) { auto inferred_status = ShapeInference::InferBinaryOpShape(BINOP_DOT, f32_, vector_32_, {}); ASSERT_FALSE(inferred_status.ok()); - ASSERT_MATCH(inferred_status.status().error_message(), - testing::ContainsRegex("dot only supports rank")); + ASSERT_THAT(inferred_status.status().error_message(), + HasSubstr("dot only supports rank")); } // 3D 2D: error @@ -735,8 +854,8 @@ TEST_F(ShapeInferenceTest, DotWithRankHigherThanTwo) { auto inferred_status = ShapeInference::InferBinaryOpShape( BINOP_DOT, ShapeUtil::MakeShape(F32, 
{32, 32, 32}), matrix_32_64_, {}); ASSERT_FALSE(inferred_status.ok()); - ASSERT_MATCH(inferred_status.status().error_message(), - testing::ContainsRegex("dot only supports rank")); + ASSERT_THAT(inferred_status.status().error_message(), + HasSubstr("dot only supports rank")); } // vector vector -> scalar @@ -848,46 +967,43 @@ TEST_F(ShapeInferenceTest, BinOpBroadcastBadDimension) { auto inferred_status_error1 = ShapeInference::InferBinaryOpShape( BinaryOperation::BINOP_ADD, tensor, vec8, {}); ASSERT_FALSE(inferred_status_error1.ok()); - ASSERT_MATCH(inferred_status_error1.status().error_message(), - testing::ContainsRegex("automatic")); + ASSERT_THAT(inferred_status_error1.status().error_message(), + HasSubstr("automatic")); // broadcast_dimension out of bounds for tensor's rank auto inferred_status_error2 = ShapeInference::InferBinaryOpShape( BinaryOperation::BINOP_ADD, tensor, vec8, {3}); ASSERT_FALSE(inferred_status_error2.ok()); - ASSERT_MATCH( - inferred_status_error2.status().error_message(), - testing::ContainsRegex("broadcast dimension number .* too large")); + ASSERT_THAT(inferred_status_error2.status().error_message(), + ContainsRegex("broadcast dimension number .* too large")); // broadcast_dimension doesn't match corresponding dimension auto inferred_status_error3 = ShapeInference::InferBinaryOpShape( BinaryOperation::BINOP_ADD, tensor, vec8, {0}); ASSERT_FALSE(inferred_status_error3.ok()); - ASSERT_MATCH(inferred_status_error3.status().error_message(), - testing::ContainsRegex("broadcast dimension 0 mismatch")); + ASSERT_THAT(inferred_status_error3.status().error_message(), + HasSubstr("broadcast dimension 0 mismatch")); // broadcast_dimensions list too long auto inferred_status_error4 = ShapeInference::InferBinaryOpShape( BinaryOperation::BINOP_ADD, tensor, matrix8_4, {0, 1, 2}); ASSERT_FALSE(inferred_status_error4.ok()); - ASSERT_MATCH( - inferred_status_error4.status().error_message(), - testing::ContainsRegex("size of broadcast_dimensions has to match")); + ASSERT_THAT(inferred_status_error4.status().error_message(), + HasSubstr("size of broadcast_dimensions has to match")); // there's a dimension above the rank of the tensor auto inferred_status_error5 = ShapeInference::InferBinaryOpShape( BinaryOperation::BINOP_ADD, tensor, matrix8_4, {3, 0}); ASSERT_FALSE(inferred_status_error5.ok()); - ASSERT_MATCH( - inferred_status_error5.status().error_message(), - testing::ContainsRegex("broadcast dimension number .* too large")); + ASSERT_THAT(inferred_status_error5.status().error_message(), + ContainsRegex("broadcast dimension number .* too large")); // broadcasting dimensions don't match in this order auto inferred_status_error6 = ShapeInference::InferBinaryOpShape( BinaryOperation::BINOP_ADD, tensor, matrix8_4, {2, 1}); ASSERT_FALSE(inferred_status_error6.ok()); - ASSERT_MATCH(inferred_status_error6.status().error_message(), - testing::ContainsRegex("broadcast dimension 0 mismatch")); + ASSERT_THAT(inferred_status_error6.status().error_message(), + HasSubstr("broadcast dimension 0 mismatch")); // The following two tests make sure that broadcasting dimensions are listed // in a proper (strictly increasing) order, even if the lower-rank array @@ -895,14 +1011,14 @@ TEST_F(ShapeInferenceTest, BinOpBroadcastBadDimension) { auto inferred_status_error7 = ShapeInference::InferBinaryOpShape( BinaryOperation::BINOP_ADD, tensor8_8_8, matrix8_8, {0, 0}); ASSERT_FALSE(inferred_status_error7.ok()); - ASSERT_MATCH(inferred_status_error7.status().error_message(), - 
testing::ContainsRegex("broadcast dimensions order is wrong")); + ASSERT_THAT(inferred_status_error7.status().error_message(), + HasSubstr("broadcast dimensions order is wrong")); auto inferred_status_error8 = ShapeInference::InferBinaryOpShape( BinaryOperation::BINOP_ADD, tensor8_8_8, matrix8_8, {1, 0}); ASSERT_FALSE(inferred_status_error8.ok()); - ASSERT_MATCH(inferred_status_error8.status().error_message(), - testing::ContainsRegex("broadcast dimensions order is wrong")); + ASSERT_THAT(inferred_status_error8.status().error_message(), + HasSubstr("broadcast dimensions order is wrong")); } // Tests for the while instruction with proper shapes. @@ -927,30 +1043,30 @@ TEST_F(ShapeInferenceTest, WhileWithBadShapes) { auto inferred_status_error1 = ShapeInference::InferWhileShape(bad_shape_1, body, result_shape); ASSERT_FALSE(inferred_status_error1.ok()); - ASSERT_MATCH(inferred_status_error1.status().error_message(), - testing::ContainsRegex("condition must take 1 arguments")); + ASSERT_THAT(inferred_status_error1.status().error_message(), + HasSubstr("condition must take 1 arguments")); auto bad_shape_2 = ShapeUtil::MakeProgramShape({s32_, result_shape}, result_shape); auto inferred_status_error2 = ShapeInference::InferWhileShape(cond, bad_shape_2, result_shape); ASSERT_FALSE(inferred_status_error2.ok()); - ASSERT_MATCH(inferred_status_error2.status().error_message(), - testing::ContainsRegex("body must take 1 arguments")); + ASSERT_THAT(inferred_status_error2.status().error_message(), + HasSubstr("body must take 1 arguments")); auto bad_shape_3 = ShapeUtil::MakeProgramShape({result_shape}, s32_); auto inferred_status_error3 = ShapeInference::InferWhileShape(bad_shape_3, body, result_shape); ASSERT_FALSE(inferred_status_error3.ok()); - ASSERT_MATCH(inferred_status_error3.status().error_message(), - testing::ContainsRegex("condition must return a boolean")); + ASSERT_THAT(inferred_status_error3.status().error_message(), + HasSubstr("condition must return a boolean")); auto bad_shape_4 = ShapeUtil::MakeProgramShape({result_shape}, vector_32_); auto inferred_status_error4 = ShapeInference::InferWhileShape(cond, bad_shape_4, result_shape); ASSERT_FALSE(inferred_status_error4.ok()); - ASSERT_MATCH(inferred_status_error4.status().error_message(), - testing::ContainsRegex("parameter of condition and body")); + ASSERT_THAT(inferred_status_error4.status().error_message(), + HasSubstr("parameter of condition and body")); } // Tests for the concatenate instruction with proper shapes. 
@@ -980,49 +1096,44 @@ TEST_F(ShapeInferenceTest, ConcatenateWithBadShapes) { auto inferred_status_error1 = ShapeInference::InferConcatOpShape({}, /*dimension=*/0); ASSERT_FALSE(inferred_status_error1.ok()); - ASSERT_MATCH( - inferred_status_error1.status().error_message(), - testing::ContainsRegex("Concatenate expects at least one argument")); + ASSERT_THAT(inferred_status_error1.status().error_message(), + HasSubstr("Concatenate expects at least one argument")); auto inferred_status_error2 = ShapeInference::InferConcatOpShape({&vector_32_}, /*dimension=*/-1); ASSERT_FALSE(inferred_status_error2.ok()); - ASSERT_MATCH(inferred_status_error2.status().error_message(), - testing::ContainsRegex( - "dimension to concatenate along out of bounds: -1")); + ASSERT_THAT(inferred_status_error2.status().error_message(), + HasSubstr("dimension to concatenate along out of bounds: -1")); auto inferred_status_error3 = ShapeInference::InferConcatOpShape({&vector_32_}, /*dimension=*/1); ASSERT_FALSE(inferred_status_error3.ok()); - ASSERT_MATCH(inferred_status_error3.status().error_message(), - testing::ContainsRegex( - "dimension to concatenate along out of bounds: 1")); + ASSERT_THAT(inferred_status_error3.status().error_message(), + HasSubstr("dimension to concatenate along out of bounds: 1")); Shape tuple = ShapeUtil::MakeTupleShape({vector_32_}); auto inferred_status_error4 = ShapeInference::InferConcatOpShape( {&vector_32_, &tuple}, /*dimension=*/0); ASSERT_FALSE(inferred_status_error4.ok()); - ASSERT_MATCH( + ASSERT_THAT( inferred_status_error4.status().error_message(), - testing::ContainsRegex( - "Expected non-tuple argument for operand of concatenation.")); + HasSubstr("Expected non-tuple argument for operand of concatenation.")); const Shape vector_s32 = ShapeUtil::MakeShape(S32, {32}); auto inferred_status_error5 = ShapeInference::InferConcatOpShape( {&vector_32_, &vector_s32}, /*dimension=*/0); ASSERT_FALSE(inferred_status_error5.ok()); - ASSERT_MATCH(inferred_status_error5.status().error_message(), - testing::ContainsRegex( - "cannot concatenate arrays with different element types")); + ASSERT_THAT( + inferred_status_error5.status().error_message(), + HasSubstr("cannot concatenate arrays with different element types")); auto inferred_status_error6 = ShapeInference::InferConcatOpShape( {&matrix_32_48_, &matrix_32_64_}, /*dimension=*/0); ASSERT_FALSE(inferred_status_error6.ok()); - ASSERT_MATCH( - inferred_status_error6.status().error_message(), - testing::ContainsRegex("cannot concatenate arrays that differ in " - "dimensions other than the one being " - "concatenated")); + ASSERT_THAT(inferred_status_error6.status().error_message(), + HasSubstr("cannot concatenate arrays that differ in " + "dimensions other than the one being " + "concatenated")); } TEST_F(ShapeInferenceTest, Pad) { @@ -1063,27 +1174,27 @@ TEST_F(ShapeInferenceTest, ReverseInvalidDimension) { auto inferred_status_error0 = ShapeInference::InferReverseShape(input_shape, {0, 2}); ASSERT_FALSE(inferred_status_error0.ok()); - ASSERT_MATCH(inferred_status_error0.status().error_message(), - testing::ContainsRegex("out-of-bounds")); + ASSERT_THAT(inferred_status_error0.status().error_message(), + HasSubstr("out-of-bounds")); auto inferred_status_error1 = ShapeInference::InferReverseShape(input_shape, {0, -1}); ASSERT_FALSE(inferred_status_error1.ok()); - ASSERT_MATCH(inferred_status_error1.status().error_message(), - testing::ContainsRegex("out-of-bounds")); + ASSERT_THAT(inferred_status_error1.status().error_message(), + 
HasSubstr("out-of-bounds")); auto inferred_status_error2 = ShapeInference::InferReverseShape(input_shape, {0, 0}); ASSERT_FALSE(inferred_status_error2.ok()); - ASSERT_MATCH(inferred_status_error2.status().error_message(), - testing::ContainsRegex("duplicated")); + ASSERT_THAT(inferred_status_error2.status().error_message(), + HasSubstr("duplicated")); Shape tuple_shape = ShapeUtil::MakeTupleShape({input_shape, input_shape}); auto inferred_status_error3 = ShapeInference::InferReverseShape(tuple_shape, {0}); ASSERT_FALSE(inferred_status_error3.ok()); - ASSERT_MATCH(inferred_status_error3.status().error_message(), - testing::ContainsRegex("Expected non-tuple argument")); + ASSERT_THAT(inferred_status_error3.status().error_message(), + HasSubstr("Expected non-tuple argument")); } TEST_F(ShapeInferenceTest, Call) { @@ -1103,20 +1214,20 @@ TEST_F(ShapeInferenceTest, Call) { auto inferred_status_error0 = ShapeInference::InferCallShape( {}, ShapeUtil::MakeProgramShape({f32_}, f32_)); EXPECT_FALSE(inferred_status_error0.ok()); - EXPECT_MATCH(inferred_status_error0.status().error_message(), - testing::ContainsRegex("arity must match")); + EXPECT_THAT(inferred_status_error0.status().error_message(), + HasSubstr("arity must match")); auto inferred_status_error1 = ShapeInference::InferCallShape( {&f32_}, ShapeUtil::MakeProgramShape({}, f32_)); EXPECT_FALSE(inferred_status_error1.ok()); - EXPECT_MATCH(inferred_status_error1.status().error_message(), - testing::ContainsRegex("arity must match")); + EXPECT_THAT(inferred_status_error1.status().error_message(), + HasSubstr("arity must match")); auto inferred_status_error2 = ShapeInference::InferCallShape( {&f32_}, ShapeUtil::MakeProgramShape({s32_}, f32_)); EXPECT_FALSE(inferred_status_error2.ok()); - EXPECT_MATCH(inferred_status_error2.status().error_message(), - testing::ContainsRegex("parameter must match argument")); + EXPECT_THAT(inferred_status_error2.status().error_message(), + HasSubstr("parameter must match argument")); } TEST_F(ShapeInferenceTest, Transpose) { diff --git a/tensorflow/compiler/xla/service/shaped_buffer.cc b/tensorflow/compiler/xla/service/shaped_buffer.cc index cf49fd72b7d..865be1b84f2 100644 --- a/tensorflow/compiler/xla/service/shaped_buffer.cc +++ b/tensorflow/compiler/xla/service/shaped_buffer.cc @@ -73,16 +73,13 @@ ShapedBuffer::MakeUnnestedTupleShapedBuffer( } TF_ASSIGN_OR_RETURN(std::unique_ptr shaped_buffer, MakeShapedBuffer(shape, platform, device_ordinal)); - TF_CHECK_OK(shaped_buffer->mutable_shape_index_to_buffer_entry() - ->ForEachMutableElement( - [](const ShapeIndex& index, bool is_leaf, - size_t* buffer_element) -> tensorflow::Status { - if (is_leaf) { - CHECK_EQ(index.size(), 1); - *buffer_element = index[0]; - } - return tensorflow::Status::OK(); - })); + shaped_buffer->mutable_shape_index_to_buffer_entry()->ForEachMutableElement( + [&shaped_buffer](const ShapeIndex& index, size_t* buffer_element) { + if (ShapeUtil::IsLeafIndex(shaped_buffer->shape(), index)) { + CHECK_EQ(index.size(), 1); + *buffer_element = index[0]; + } + }); shaped_buffer->mutable_buffers()->reserve(buffers.size()); for (const perftools::gputools::DeviceMemoryBase& memory_base : buffers) { shaped_buffer->mutable_buffers()->push_back(memory_base); @@ -126,10 +123,12 @@ ScopedShapedBuffer::MakeScopedShapedBuffer(const Shape& shape, // Allocate an appropriate sized buffer for each array element in the shape. 
TF_RETURN_IF_ERROR( - shaped_buffer->shape_index_to_buffer_entry_.ForEachMutableElement( - [&shaped_buffer](const ShapeIndex& index, bool is_leaf, - size_t* buffer_entry) -> tensorflow::Status { - if (is_leaf) { + shaped_buffer->shape_index_to_buffer_entry_ + .ForEachMutableElementWithStatus([&shaped_buffer]( + const ShapeIndex& index, + size_t* buffer_entry) + -> tensorflow::Status { + if (ShapeUtil::IsLeafIndex(shaped_buffer->shape(), index)) { TF_ASSIGN_OR_RETURN( perftools::gputools::DeviceMemoryBase memory_base, shaped_buffer->allocator_->Allocate( diff --git a/tensorflow/compiler/xla/service/transfer_manager.cc b/tensorflow/compiler/xla/service/transfer_manager.cc index c7f6a13023d..4da0a0d3684 100644 --- a/tensorflow/compiler/xla/service/transfer_manager.cc +++ b/tensorflow/compiler/xla/service/transfer_manager.cc @@ -72,7 +72,7 @@ TransferManager::GetPlatformTransferManagers() { it->second.manager = (*it->second.creation_function)(); } - return it->second.manager; + return it->second.manager.get(); } Status TransferManager::TransferBufferFromDevice( diff --git a/tensorflow/compiler/xla/service/transfer_manager.h b/tensorflow/compiler/xla/service/transfer_manager.h index 7ffce452139..15f6b7bfb4a 100644 --- a/tensorflow/compiler/xla/service/transfer_manager.h +++ b/tensorflow/compiler/xla/service/transfer_manager.h @@ -20,6 +20,7 @@ limitations under the License. #include #include +#include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -64,6 +65,12 @@ class TransferManager { perftools::gputools::StreamExecutor* executor, const Literal& literal) = 0; + // Transfers the given literal from the Outfeed interface of the device, + // using the given executor. + virtual Status TransferLiteralFromOutfeed( + perftools::gputools::StreamExecutor* executor, const Shape& literal_shape, + Literal* literal) = 0; + // Resets the devices associated with this transfer manager. virtual Status ResetDevices( tensorflow::gtl::ArraySlice @@ -110,7 +117,7 @@ class TransferManager { perftools::gputools::StreamExecutor* executor, int64 size, const void* source, perftools::gputools::DeviceMemoryBase* destination); - typedef TransferManager* (*TransferManagerCreationFunction)(); + typedef std::unique_ptr (*TransferManagerCreationFunction)(); ///// // The TransferManager class also serves as a point to register objects for @@ -140,7 +147,7 @@ class TransferManager { // set up creation_function, and then we use that to lazily create // "manager" the first time GetForPlatform is invoked for a particular id. 
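  // Illustrative note (not part of the original comment): with this change
  // creation_function returns std::unique_ptr<TransferManager>, so State owns
  // the lazily-created manager, and GetForPlatform hands out a non-owning
  // pointer via manager.get(), as in the transfer_manager.cc hunk above.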
struct State { - TransferManager* manager = nullptr; + std::unique_ptr manager; TransferManagerCreationFunction creation_function = nullptr; }; diff --git a/tensorflow/compiler/xla/service/transfer_manager_test.cc b/tensorflow/compiler/xla/service/transfer_manager_test.cc index 564111c4f2b..ca38601d919 100644 --- a/tensorflow/compiler/xla/service/transfer_manager_test.cc +++ b/tensorflow/compiler/xla/service/transfer_manager_test.cc @@ -121,7 +121,7 @@ TEST_F(CpuTransferManagerTest, TransferR1U8FromDevice) { const Shape shape = ShapeUtil::MakeShape(U8, {4}); TF_CHECK_OK(transfer_manager_.TransferLiteralFromDevice( stream_exec_, memptr, shape, shape, &literal)); - CHECK_EQ("klmn", literal.u8s()); + CHECK_EQ("klmn", literal.u8s_string()); } TEST_F(CpuTransferManagerTest, TransferBufferFromDevice) { diff --git a/tensorflow/compiler/xla/service/transpose_folding.cc b/tensorflow/compiler/xla/service/transpose_folding.cc index 07e0ce89f6a..a0c88c6bbc2 100644 --- a/tensorflow/compiler/xla/service/transpose_folding.cc +++ b/tensorflow/compiler/xla/service/transpose_folding.cc @@ -21,7 +21,9 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/logging.h" @@ -30,43 +32,55 @@ namespace xla { namespace { -bool IsOperandFoldableToDot(const HloInstruction& hlo) { - return hlo.IsRank2Transpose() && - hlo.user_count() == 1; // The dot is its only user. -} - -bool CanFoldOperandsIntoDot( +TransposeFolding::OperandIndices CanFoldOperandsIntoDot( const HloInstruction& dot, - const TransposeFolding::IsTransposableGemmFn& is_transposable_gemm) { + const TransposeFolding::TransposableGemmOperandsFn& + transposable_gemm_operands) { if (HloOpcode::kDot != dot.opcode()) { - return false; + return {}; } - if (!is_transposable_gemm(dot)) { - return false; + TransposeFolding::OperandIndices operand_set; + for (int64 i = 0; i < dot.operand_count(); ++i) { + auto& operand = *dot.operand(i); + if (operand.IsRank2Transpose() && operand.user_count() == 1) { + operand_set.push_back(i); + } } - const HloInstruction* lhs = dot.operand(0); - const HloInstruction* rhs = dot.operand(1); - bool lhs_foldable = IsOperandFoldableToDot(*lhs); - bool rhs_foldable = IsOperandFoldableToDot(*rhs); - if (!lhs_foldable && !rhs_foldable) { - return false; - } - return true; + return transposable_gemm_operands(dot, operand_set); } +TransposeFolding::OperandIndices CanFoldOperandsIntoConvolution( + const HloInstruction& convolution, + const TransposeFolding::TransposableConvOperandsFn& + transposable_conv_operands) { + if (HloOpcode::kConvolution != convolution.opcode()) { + return {}; + } + + // We only support folding the RHS. + const int64 kRhsOperandIndex = 1; + auto& operand = *convolution.operand(kRhsOperandIndex); + if (operand.opcode() == HloOpcode::kTranspose && operand.user_count() == 1) { + return transposable_conv_operands(convolution, {kRhsOperandIndex}); + } + + return {}; +} + +using InstructionOperandsPair = + std::pair; + // Folds the operands of `dot` that are foldable transposes. `computation` is -// the parent HLO computation of `dot`. `module` is the parent HloModule of -// `computation`. 
+// the parent HLO computation of `dot`. // // Returns whether the module is changed. -bool FoldTransposeIntoDot(HloInstruction* dot, HloComputation* computation) { +bool FoldTransposeIntoDot(InstructionOperandsPair pair) { + auto* dot = pair.first; std::vector instructions_to_fuse(1, dot); - for (HloInstruction* operand : dot->operands()) { - if (IsOperandFoldableToDot(*operand)) { - instructions_to_fuse.push_back(operand); - } + for (const int64 operand_index : pair.second) { + instructions_to_fuse.push_back(dot->mutable_operand(operand_index)); } // Early-exit if no operands are foldable. @@ -74,33 +88,100 @@ bool FoldTransposeIntoDot(HloInstruction* dot, HloComputation* computation) { return false; } - computation->CreateFusionInstruction( + dot->parent()->CreateFusionInstruction( instructions_to_fuse, HloInstruction::FusionKind::kTransposeDot); return true; } +// Folds the operands of `convolution` that are foldable transposes. +// `computation` is the parent HLO computation of `convolution`. +// +// Returns whether the module is changed. +bool FoldTransposeIntoConvolution(InstructionOperandsPair pair) { + auto& convolution = *pair.first; + + // We only support fusing the RHS transpose into convolution. + // + // ConvolutionDimensionNumbers doesn't make enough of a distinction between + // the output and the activations. + // + // TODO(b/37125184): Support transposing the LHS too. + if (pair.second.size() != 1 || pair.second.front() != 1) { + return false; + } + + const ConvolutionDimensionNumbers& dnums = + convolution.convolution_dimension_numbers(); + HloInstruction& transpose = *convolution.mutable_operand(1); + CHECK_EQ(transpose.opcode(), HloOpcode::kTranspose); + const auto& transpose_dimensions = transpose.dimensions(); + HloInstruction& transpose_operand = *transpose.mutable_operand(0); + + // Everything remains the same except for the kernel dimension numbers. We + // need to apply the transpose permutation to the original shape to figure out + // what the new logical dimensions are. + ConvolutionDimensionNumbers new_dnums = dnums; + new_dnums.set_kernel_input_feature_dimension( + transpose_dimensions[dnums.kernel_input_feature_dimension()]); + new_dnums.set_kernel_output_feature_dimension( + transpose_dimensions[dnums.kernel_output_feature_dimension()]); + for (auto& kernel_spatial_dimension : + *new_dnums.mutable_kernel_spatial_dimensions()) { + kernel_spatial_dimension = transpose_dimensions[kernel_spatial_dimension]; + } + + auto new_conv = HloInstruction::CreateConvolve( + convolution.shape(), convolution.mutable_operand(0), &transpose_operand, + convolution.window(), new_dnums); + TF_CHECK_OK(convolution.parent()->ReplaceWithNewInstruction( + &convolution, std::move(new_conv))); + + return true; +} + } // namespace -TransposeFolding::TransposeFolding(IsTransposableGemmFn is_transposable_gemm) - : is_transposable_gemm_(std::move(is_transposable_gemm)) {} +TransposeFolding::TransposeFolding( + TransposableGemmOperandsFn transposable_gemm_operands, + TransposableConvOperandsFn transposable_conv_operands) + : transposable_gemm_operands_(std::move(transposable_gemm_operands)), + transposable_conv_operands_(std::move(transposable_conv_operands)) {} StatusOr TransposeFolding::Run(HloModule* module) { // Modifying the graph while traversing is dangerous, so we find all folding // opportunities before actually folding them. 
- HloComputation* entry_computation = module->entry_computation(); - - std::vector foldable_dots; - auto visit_fn = [this, &foldable_dots](HloInstruction* instruction) { - if (CanFoldOperandsIntoDot(*instruction, is_transposable_gemm_)) { - foldable_dots.emplace_back(instruction); + std::vector> foldable_dots; + std::vector> foldable_convolutions; + auto visit_fn = [this, &foldable_dots, + &foldable_convolutions](HloInstruction* instruction) { + { + OperandIndices operand_indices = + CanFoldOperandsIntoDot(*instruction, transposable_gemm_operands_); + if (!operand_indices.empty()) { + foldable_dots.emplace_back(instruction, operand_indices); + } + } + { + OperandIndices operand_indices = CanFoldOperandsIntoConvolution( + *instruction, transposable_conv_operands_); + if (!operand_indices.empty()) { + foldable_convolutions.emplace_back( + std::make_pair(instruction, operand_indices)); + } } return tensorflow::Status::OK(); }; - TF_RETURN_IF_ERROR(entry_computation->root_instruction()->Accept(visit_fn)); + + for (auto& comp : module->computations()) { + TF_RETURN_IF_ERROR(comp->Accept(visit_fn)); + } bool changed = false; - for (HloInstruction* dot : foldable_dots) { - changed |= FoldTransposeIntoDot(dot, entry_computation); + for (InstructionOperandsPair& pair : foldable_dots) { + changed |= FoldTransposeIntoDot(pair); + } + for (InstructionOperandsPair& pair : foldable_convolutions) { + changed |= FoldTransposeIntoConvolution(pair); } return changed; } diff --git a/tensorflow/compiler/xla/service/transpose_folding.h b/tensorflow/compiler/xla/service/transpose_folding.h index d857c04ed8d..71e8446452f 100644 --- a/tensorflow/compiler/xla/service/transpose_folding.h +++ b/tensorflow/compiler/xla/service/transpose_folding.h @@ -25,16 +25,37 @@ namespace xla { // operator is implemented by a GEMM kernel that can transpose its inputs. class TransposeFolding : public HloPassInterface { public: - // IsTransposableGemmFn should return true iff the instruction argument is - // implemented as a GEMM kernel that supports transposing its arguments. - typedef std::function IsTransposableGemmFn; - explicit TransposeFolding(IsTransposableGemmFn is_transposable_gemm); + using OperandIndices = std::vector; + + // Returns the set of foldable operands for a given HLO and some candidate + // operands. + using FoldableOperands = std::function; + using TransposableGemmOperandsFn = FoldableOperands; + using TransposableConvOperandsFn = FoldableOperands; + + // Helper function to explicitly not fold transposes. + static OperandIndices NeverFoldTranspose(const HloInstruction&, + const OperandIndices&) { + return {}; + } + // transposable_gemm_operands returns the set of operands it wants to fold if + // the instruction argument is implemented as a GEMM kernel that supports + // transposing its arguments. + // + // transposable_conv_operands returns the set of operands it wants to fold if + // the instruction argument is implemented as a convolution that supports + // transposing its arguments. 
+ explicit TransposeFolding( + TransposableGemmOperandsFn transposable_gemm_operands, + TransposableConvOperandsFn transposable_conv_operands); tensorflow::StringPiece name() const override { return "transpose-folding"; } StatusOr Run(HloModule* module) override; private: - IsTransposableGemmFn is_transposable_gemm_; + TransposableGemmOperandsFn transposable_gemm_operands_; + TransposableConvOperandsFn transposable_conv_operands_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/transpose_folding_test.cc b/tensorflow/compiler/xla/service/transpose_folding_test.cc index 09f932e29e6..c72d127ea86 100644 --- a/tensorflow/compiler/xla/service/transpose_folding_test.cc +++ b/tensorflow/compiler/xla/service/transpose_folding_test.cc @@ -16,16 +16,19 @@ limitations under the License. #include "tensorflow/compiler/xla/service/transpose_folding.h" #include -#include +#include #include +#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/shape_inference.h" #include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/logging.h" @@ -35,12 +38,20 @@ namespace xla { class TransposeFoldingTest : public ::testing::Test { protected: void FoldTranspose(HloModule* module) { - TransposeFolding transpose_folding(gpu::ImplementedAsGemm); + TransposeFolding transpose_folding( + [](const HloInstruction& dot, + const TransposeFolding::OperandIndices& candidate_operands) { + return candidate_operands; + }, + [](const HloInstruction& convolution, + const TransposeFolding::OperandIndices& candidate_operands) { + return candidate_operands; + }); EXPECT_IS_OK(transpose_folding.Run(module).status()); } }; -TEST_F(TransposeFoldingTest, FoldTranspose) { +TEST_F(TransposeFoldingTest, FoldDotTranspose) { auto builder = HloComputation::Builder("entry_computation"); HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter( /*parameter_number=*/0, ShapeUtil::MakeShape(F32, {2, 3}), @@ -61,7 +72,7 @@ TEST_F(TransposeFoldingTest, FoldTranspose) { FoldTranspose(&module); // Instructions after folding: x, y, and the fusion. 
- std::set instruction_set; + std::unordered_set instruction_set; for (auto& instruction : entry_computation->instructions()) { instruction_set.insert(instruction.get()); } @@ -77,7 +88,7 @@ TEST_F(TransposeFoldingTest, FoldTranspose) { EXPECT_EQ(4, fusion->fused_instructions().size()); } -TEST_F(TransposeFoldingTest, FoldTransposeConstant) { +TEST_F(TransposeFoldingTest, FoldDotTransposeConstant) { auto builder = HloComputation::Builder("entry_computation"); // 2x1 HloInstruction* const0 = builder.AddInstruction( @@ -115,7 +126,7 @@ TEST_F(TransposeFoldingTest, FoldTransposeConstant) { entry_computation->root_instruction()->fused_instructions().size()); } -TEST_F(TransposeFoldingTest, FuseWithConstantOperands) { +TEST_F(TransposeFoldingTest, FuseDotWithConstantOperands) { auto builder = HloComputation::Builder("entry"); // (1.0 + 2.0) * (2.0 - 3.0) HloInstruction* const1 = builder.AddInstruction( @@ -139,11 +150,219 @@ TEST_F(TransposeFoldingTest, FuseWithConstantOperands) { EXPECT_EQ(call, entry_computation->root_instruction()); HloComputation* callee_computation = call->to_apply(); // The arguments to the call should be const1, const2, and const3. - EXPECT_MATCH(call->operands(), testing::UnorderedMatcher( - const1, const2, const3)); + EXPECT_THAT(call->operands(), + ::testing::UnorderedElementsAre(const1, const2, const3)); // The callee should contain 3 parameters and 3 binary operators. EXPECT_EQ(6, callee_computation->instructions().size()); } +TEST_F(TransposeFoldingTest, FoldDotTransposeInWhile) { + auto builder = HloComputation::Builder("entry_computation"); + HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/0, ShapeUtil::MakeShape(F32, {2, 3}), + /*name=*/"x")); + HloInstruction* y = builder.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/1, ShapeUtil::MakeShape(F32, {2, 3}), + /*name=*/"y")); + HloInstruction* transpose_y = + builder.AddInstruction(HloInstruction::CreateTranspose( + ShapeUtil::MakeShape(F32, {3, 2}), y, {1, 0})); + HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateBinary( + ShapeUtil::MakeShape(F32, {2, 2}), /*opcode=*/HloOpcode::kDot, + /*lhs=*/x, /*rhs=*/transpose_y)); + + HloModule module("test_module"); + HloComputation* entry_computation = + module.AddEntryComputation(builder.Build(dot)); + + HloInstruction* call = module.OutlineExpressionFromComputation( + {transpose_y, dot}, "outlined", entry_computation); + + FoldTranspose(&module); + + // Instructions after folding: x, y, and the fusion. + std::unordered_set instruction_set; + for (auto& instruction : entry_computation->instructions()) { + instruction_set.insert(instruction.get()); + } + CHECK_EQ(1, instruction_set.erase(x)) << "x is not in entry_computation."; + CHECK_EQ(1, instruction_set.erase(y)) << "y is not in entry_computation."; + CHECK_EQ(1, instruction_set.erase(call)) + << "call is not in entry_computation."; + CHECK(instruction_set.empty()) + << "entry_computation should contain exactly 3 instructions."; + HloInstruction* fusion = + call->called_computations().front()->root_instruction(); + EXPECT_EQ(HloOpcode::kFusion, fusion->opcode()); + + // The fusion instruction should contain two parameters, one transpose and + // one dot. + EXPECT_EQ(4, fusion->fused_instructions().size()); +} + +// Test that a two dimension swap of the kernel gets folded into convolution. 
+TEST_F(TransposeFoldingTest, FoldConvDimSwapTransposeRhs) { + auto builder = HloComputation::Builder("entry_computation"); + HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/0, ShapeUtil::MakeShape(F32, {2, 3, 1, 1}), + /*name=*/"x")); + HloInstruction* y = builder.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/1, ShapeUtil::MakeShape(F32, {3, 2, 1, 1}), + /*name=*/"y")); + HloInstruction* transpose_y = + builder.AddInstruction(HloInstruction::CreateTranspose( + ShapeUtil::MakeShape(F32, {2, 3, 1, 1}), y, {1, 0, 2, 3})); + auto dnums = ComputationBuilder::CreateDefaultConvDimensionNumbers(); + Window window; + for (int i = 0; i < 2; ++i) { + WindowDimension* dim = window.add_dimensions(); + dim->set_padding_low(0); + dim->set_padding_high(0); + dim->set_base_dilation(1); + dim->set_window_dilation(1); + dim->set_stride(1); + dim->set_size( + transpose_y->shape().dimensions(dnums.kernel_spatial_dimensions(i))); + } + StatusOr<Shape> conv_shape = ShapeInference::InferConvolveShape( + x->shape(), transpose_y->shape(), window, dnums); + EXPECT_IS_OK(conv_shape); + HloInstruction* conv = builder.AddInstruction(HloInstruction::CreateConvolve( + conv_shape.ValueOrDie(), x, transpose_y, window, dnums)); + + HloModule module("test_module"); + HloComputation* entry_computation = + module.AddEntryComputation(builder.Build(conv)); + FoldTranspose(&module); + + // Instructions after folding: x, y, and the convolution. + std::unordered_set<HloInstruction*> instruction_set; + for (auto& instruction : entry_computation->instructions()) { + instruction_set.insert(instruction.get()); + } + CHECK_EQ(1, instruction_set.erase(x)) << "x is not in entry_computation."; + CHECK_EQ(1, instruction_set.erase(y)) << "y is not in entry_computation."; + CHECK_EQ(1, instruction_set.size()) + << "entry_computation should contain exactly 3 instructions."; + HloInstruction* new_conv = *instruction_set.begin(); + EXPECT_EQ(HloOpcode::kConvolution, new_conv->opcode()); + EXPECT_EQ(dnums.kernel_input_feature_dimension(), + new_conv->convolution_dimension_numbers() + .kernel_output_feature_dimension()); + EXPECT_EQ(dnums.kernel_output_feature_dimension(), + new_conv->convolution_dimension_numbers() + .kernel_input_feature_dimension()); +} + +// Test that a complex transpose of the kernel gets folded into convolution.
+TEST_F(TransposeFoldingTest, FoldConvComplexTransposeRhs) { + auto builder = HloComputation::Builder("entry_computation"); + HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/0, ShapeUtil::MakeShape(F32, {2, 3, 1, 1}), + /*name=*/"x")); + HloInstruction* y = builder.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/1, ShapeUtil::MakeShape(F32, {1, 2, 1, 3}), + /*name=*/"y")); + HloInstruction* transpose_y = + builder.AddInstruction(HloInstruction::CreateTranspose( + ShapeUtil::MakeShape(F32, {2, 3, 1, 1}), y, {1, 3, 0, 2})); + auto dnums = ComputationBuilder::CreateDefaultConvDimensionNumbers(); + Window window; + for (int i = 0; i < 2; ++i) { + WindowDimension* dim = window.add_dimensions(); + dim->set_padding_low(0); + dim->set_padding_high(0); + dim->set_base_dilation(1); + dim->set_window_dilation(1); + dim->set_stride(1); + dim->set_size( + transpose_y->shape().dimensions(dnums.kernel_spatial_dimensions(i))); + } + StatusOr<Shape> conv_shape = ShapeInference::InferConvolveShape( + x->shape(), transpose_y->shape(), window, dnums); + EXPECT_IS_OK(conv_shape); + HloInstruction* conv = builder.AddInstruction(HloInstruction::CreateConvolve( + conv_shape.ValueOrDie(), x, transpose_y, window, dnums)); + + HloModule module("test_module"); + HloComputation* entry_computation = + module.AddEntryComputation(builder.Build(conv)); + FoldTranspose(&module); + + // Instructions after folding: x, y, and the convolution. + std::unordered_set<HloInstruction*> instruction_set; + for (auto& instruction : entry_computation->instructions()) { + instruction_set.insert(instruction.get()); + } + CHECK_EQ(1, instruction_set.erase(x)) << "x is not in entry_computation."; + CHECK_EQ(1, instruction_set.erase(y)) << "y is not in entry_computation."; + CHECK_EQ(1, instruction_set.size()) + << "entry_computation should contain exactly 3 instructions."; + HloInstruction* new_conv = *instruction_set.begin(); + EXPECT_EQ(HloOpcode::kConvolution, new_conv->opcode()); + EXPECT_EQ(dnums.kernel_input_feature_dimension(), + new_conv->convolution_dimension_numbers() + .kernel_output_feature_dimension()); + EXPECT_EQ(dnums.kernel_spatial_dimensions(1), + new_conv->convolution_dimension_numbers() + .kernel_input_feature_dimension()); + EXPECT_EQ( + dnums.kernel_output_feature_dimension(), + new_conv->convolution_dimension_numbers().kernel_spatial_dimensions(0)); + EXPECT_EQ( + dnums.kernel_spatial_dimensions(0), + new_conv->convolution_dimension_numbers().kernel_spatial_dimensions(1)); +} + +// Test that a transpose of the activations does not get folded into +// convolution.
+TEST_F(TransposeFoldingTest, FoldConvTransposeLhs) { + auto builder = HloComputation::Builder("entry_computation"); + HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/0, ShapeUtil::MakeShape(F32, {3, 2, 1, 1}), + /*name=*/"x")); + HloInstruction* y = builder.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/1, ShapeUtil::MakeShape(F32, {2, 3, 1, 1}), + /*name=*/"y")); + HloInstruction* transpose_x = + builder.AddInstruction(HloInstruction::CreateTranspose( + ShapeUtil::MakeShape(F32, {2, 3, 1, 1}), x, {1, 0, 2, 3})); + auto dnums = ComputationBuilder::CreateDefaultConvDimensionNumbers(); + Window window; + for (int i = 0; i < 2; ++i) { + WindowDimension* dim = window.add_dimensions(); + dim->set_padding_low(0); + dim->set_padding_high(0); + dim->set_base_dilation(1); + dim->set_window_dilation(1); + dim->set_stride(1); + dim->set_size(y->shape().dimensions(dnums.kernel_spatial_dimensions(i))); + } + StatusOr<Shape> conv_shape = ShapeInference::InferConvolveShape( + transpose_x->shape(), y->shape(), window, dnums); + EXPECT_IS_OK(conv_shape); + HloInstruction* conv = builder.AddInstruction(HloInstruction::CreateConvolve( + conv_shape.ValueOrDie(), transpose_x, y, window, dnums)); + + HloModule module("test_module"); + HloComputation* entry_computation = + module.AddEntryComputation(builder.Build(conv)); + FoldTranspose(&module); + + // Instructions after folding: x, y, transpose_x, and the convolution. + std::unordered_set<HloInstruction*> instruction_set; + for (auto& instruction : entry_computation->instructions()) { + instruction_set.insert(instruction.get()); + } + CHECK_EQ(1, instruction_set.erase(x)) << "x is not in entry_computation."; + CHECK_EQ(1, instruction_set.erase(y)) << "y is not in entry_computation."; + CHECK_EQ(1, instruction_set.erase(transpose_x)) + << "transpose_x is not in entry_computation."; + CHECK_EQ(1, instruction_set.erase(conv)) + << "conv is not in entry_computation."; + CHECK_EQ(0, instruction_set.size()) + << "entry_computation should contain exactly 4 instructions."; +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc index 0e0c0b02e3b..ad6f015c70e 100644 --- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc +++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc @@ -33,9 +33,9 @@ limitations under the License.
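The transpose-folding tests above all build the pass from a pair of callbacks, one for dots and one for convolutions. For readers following the new API, here is a minimal usage sketch inferred from that test fixture; it assumes, as the fixture does, that `TransposeFolding::OperandIndices` is both the parameter and return type of each callback, so returning the candidates unchanged folds every transposed operand while returning an empty set folds none.

```c++
#include "tensorflow/compiler/xla/service/transpose_folding.h"

namespace xla {

// Sketch: fold transposes feeding dots unconditionally, never fold them into
// convolutions. Mirrors the lambdas used by TransposeFoldingTest above.
StatusOr<bool> FoldDotTransposesOnly(HloModule* module) {
  TransposeFolding fold(
      [](const HloInstruction& dot,
         const TransposeFolding::OperandIndices& candidate_operands) {
        return candidate_operands;  // Accept every candidate dot operand.
      },
      [](const HloInstruction& convolution,
         const TransposeFolding::OperandIndices& candidate_operands) {
        return TransposeFolding::OperandIndices();  // Decline all candidates.
      });
  return fold.Run(module);
}

}  // namespace xla
```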
namespace xla { string BufferAlias::ToString() const { - return tensorflow::strings::StrCat("BufferAlias(", instruction_->name(), "[", - tensorflow::str_util::Join(index_, ","), - "] => ", buffer_->ToString(), ")"); + return tensorflow::strings::StrCat( + "BufferAlias(", instruction_->FullyQualifiedName(), "[", + tensorflow::str_util::Join(index_, ","), "])"); } std::ostream& operator<<(std::ostream& out, const BufferAlias& buffer_alias) { @@ -45,29 +45,27 @@ std::ostream& operator<<(std::ostream& out, const BufferAlias& buffer_alias) { bool PointsToSet::IsAmbiguous() const { bool ambiguous = false; - TF_CHECK_OK(ForEachElement( - [&ambiguous](const ShapeIndex& /*index*/, bool /*is_leaf*/, + ForEachElement( + [&ambiguous](const ShapeIndex& /*index*/, const std::vector<const LogicalBuffer*>& points_to) { ambiguous |= points_to.size() > 1; - return Status::OK(); - })); + }); return ambiguous; } bool PointsToSet::IsDistinct() const { bool distinct = true; std::set<const LogicalBuffer*> all_points_to; - TF_CHECK_OK(ForEachElement([&distinct, &all_points_to]( - const ShapeIndex& /*index*/, bool /*is_leaf*/, - const std::vector<const LogicalBuffer*>& points_to) { + ForEachElement([&distinct, &all_points_to]( + const ShapeIndex& /*index*/, + const std::vector<const LogicalBuffer*>& points_to) { for (auto& buffer : points_to) { if (all_points_to.count(buffer) != 0) { distinct = false; } all_points_to.insert(buffer); } - return Status::OK(); - })); + }); return distinct; } @@ -77,29 +75,27 @@ size_t PointsToSet::size() const { return CreateFlattenedSet().size(); } -std::set<const LogicalBuffer*> PointsToSet::CreateFlattenedSet() const { - std::set<const LogicalBuffer*> flat_set; - TF_CHECK_OK(ForEachElement( - [&flat_set](const ShapeIndex& /*index*/, bool /*is_leaf*/, - const std::vector<const LogicalBuffer*>& buffers) { - flat_set.insert(buffers.begin(), buffers.end()); - return Status::OK(); - })); +tensorflow::gtl::FlatSet<const LogicalBuffer*> PointsToSet::CreateFlattenedSet() + const { + tensorflow::gtl::FlatSet<const LogicalBuffer*> flat_set; + ForEachElement([&flat_set](const ShapeIndex& /*index*/, + const std::vector<const LogicalBuffer*>& buffers) { + flat_set.insert(buffers.begin(), buffers.end()); + }); return flat_set; } bool PointsToSet::ContainsBuffer(const LogicalBuffer& buffer) const { bool found = false; - TF_CHECK_OK(ForEachElement([&found, &buffer]( - const ShapeIndex& /*index*/, bool /*is_leaf*/, - const std::vector<const LogicalBuffer*>& pointed_to_buffers) { + ForEachElement([&found, &buffer](const ShapeIndex& /*index*/, + const std::vector<const LogicalBuffer*>& + pointed_to_buffers) { if (!found && std::find(pointed_to_buffers.begin(), pointed_to_buffers.end(), &buffer) != pointed_to_buffers.end()) { found = true; } - return Status::OK(); - })); + }); return found; } @@ -129,34 +125,32 @@ void PointsToSet::add_tuple_source(const ShapeIndex& index, } /* static */ StatusOr<std::unique_ptr<TuplePointsToAnalysis>> -TuplePointsToAnalysis::Run(const HloModule* module) { +TuplePointsToAnalysis::Run(const HloModule* module, Colorer colorer) { std::unique_ptr<TuplePointsToAnalysis> analysis( - new TuplePointsToAnalysis(module)); + new TuplePointsToAnalysis(module, std::move(colorer))); TF_RETURN_IF_ERROR(analysis->Analyze()); return std::move(analysis); } +/* static */ StatusOr<std::unique_ptr<TuplePointsToAnalysis>> +TuplePointsToAnalysis::Run(const HloModule* module) { + return Run(module, DefaultColorer()); +} + Status TuplePointsToAnalysis::Analyze() { points_to_.clear(); for (auto& computation : module_->computations()) { TF_RETURN_IF_ERROR(computation->Accept(this)); + TF_RETURN_IF_ERROR( + PopulateDefinedBuffersAndAliases(computation->instructions())); + // Run points-to analysis on fusion instructions in 'computation'.
for (auto& instruction : computation->instructions()) { - TF_RETURN_IF_ERROR(GatherBuffersDefinedByInstruction( - instruction.get(), &instruction_defined_buffers_[instruction.get()])); - - const PointsToSet& points_to_set = GetPointsToSet(instruction.get()); - TF_RETURN_IF_ERROR(points_to_set.ForEachElement([this, &instruction]( - const ShapeIndex& index, bool /*is_leaf*/, - const std::vector& pointed_to_buffers) { - for (const LogicalBuffer* buffer : pointed_to_buffers) { - if (buffer_aliases_.count(buffer) == 0) { - buffer_aliases_.insert({buffer, std::vector()}); - } - buffer_aliases_[buffer].emplace_back(*buffer, instruction.get(), - index); - } - return Status::OK(); - })); + if (instruction->opcode() != HloOpcode::kFusion) { + continue; + } + TF_RETURN_IF_ERROR(instruction->fused_expression_root()->Accept(this)); + TF_RETURN_IF_ERROR( + PopulateDefinedBuffersAndAliases(instruction->fused_instructions())); } } @@ -165,11 +159,33 @@ Status TuplePointsToAnalysis::Analyze() { return Status::OK(); } +Status TuplePointsToAnalysis::PopulateDefinedBuffersAndAliases( + const std::list>& instructions) { + for (auto& instruction : instructions) { + TF_RETURN_IF_ERROR(GatherBuffersDefinedByInstruction( + instruction.get(), &instruction_defined_buffers_[instruction.get()])); + + const PointsToSet& points_to_set = GetPointsToSet(instruction.get()); + points_to_set.ForEachElement( + [this, &instruction]( + const ShapeIndex& index, + const std::vector& pointed_to_buffers) { + for (const LogicalBuffer* buffer : pointed_to_buffers) { + if (buffer_aliases_.count(buffer) == 0) { + buffer_aliases_.insert({buffer, std::vector()}); + } + buffer_aliases_[buffer].emplace_back(instruction.get(), index); + } + }); + } + return Status::OK(); +} + const LogicalBuffer& TuplePointsToAnalysis::NewLogicalBuffer( HloInstruction* instruction, const ShapeIndex& index) { CHECK_EQ(logical_buffers_.size(), next_buffer_id_); - logical_buffers_.push_back( - MakeUnique(instruction, index, next_buffer_id_)); + logical_buffers_.push_back(MakeUnique( + instruction, index, next_buffer_id_, colorer_(instruction, index))); ++next_buffer_id_; return *logical_buffers_.back(); } @@ -179,13 +195,12 @@ Status TuplePointsToAnalysis::DefaultAction(HloInstruction* hlo_instruction) { // contains a single element LogicalBuffer(hlo_instruction, i). This indicates // that this instruction is the source of all buffers in its own output. PointsToSet& points_to_set = CreateEmptyPointsToSet(hlo_instruction); - TF_RETURN_IF_ERROR(points_to_set.ForEachMutableElement( - [this, hlo_instruction](const ShapeIndex& index, bool /*is_leaf*/, + points_to_set.ForEachMutableElement( + [this, hlo_instruction](const ShapeIndex& index, std::vector* buffers) { const LogicalBuffer& buffer = NewLogicalBuffer(hlo_instruction, index); buffers->push_back(&buffer); - return Status::OK(); - })); + }); if (ShapeUtil::IsTuple(hlo_instruction->shape())) { // If the hlo instruction is a tuple-shaped, then trivially the instruction @@ -207,24 +222,23 @@ Status TuplePointsToAnalysis::HandleGetTupleElement( // Copy the points-to set (and tuple sources) at index {element_index} of the // operand to the points-to set for this GetTupleElement instruction. - TF_RETURN_IF_ERROR(points_to_set.ForEachMutableElement([&, this]( - const ShapeIndex& target_index, bool /*is_leaf*/, - std::vector* points_to) { - // Construct an index into the operand by prepending element_index to the - // index for the GetTupleElement instruction's points-to set. 
- ShapeIndex src_index; - src_index.push_back(element_index); - for (auto element : target_index) { - src_index.push_back(element); - } + points_to_set.ForEachMutableElement( + [&, this](const ShapeIndex& target_index, + std::vector* points_to) { + // Construct an index into the operand by prepending element_index to + // the index for the GetTupleElement instruction's points-to set. + ShapeIndex src_index; + src_index.push_back(element_index); + for (auto element : target_index) { + src_index.push_back(element); + } - *points_to = operand_points_to_set.element(src_index); - for (HloInstruction* tuple : - operand_points_to_set.tuple_sources(src_index)) { - points_to_set.add_tuple_source(target_index, tuple); - } - return Status::OK(); - })); + *points_to = operand_points_to_set.element(src_index); + for (HloInstruction* tuple : + operand_points_to_set.tuple_sources(src_index)) { + points_to_set.add_tuple_source(target_index, tuple); + } + }); return Status::OK(); } @@ -265,9 +279,9 @@ Status TuplePointsToAnalysis::HandleTuple( // Copy the points-to set (and tuple sources) of the operand into the // respective subtree of the tuple instructions points-to set. - TF_RETURN_IF_ERROR(operand_points_to_set.ForEachElement( + operand_points_to_set.ForEachElement( [&points_to_set, &operand_points_to_set, i]( - const ShapeIndex& src_index, bool /*is_leaf*/, + const ShapeIndex& src_index, const std::vector& points_to) { ShapeIndex target_index; target_index.push_back(i); @@ -281,8 +295,7 @@ Status TuplePointsToAnalysis::HandleTuple( operand_points_to_set.tuple_sources(src_index)) { points_to_set.add_tuple_source(target_index, tuple); } - return Status::OK(); - })); + }); } points_to_set.add_tuple_source({}, tuple); @@ -303,9 +316,8 @@ Status TuplePointsToAnalysis::HandleSelect(HloInstruction* select, // add in elements of the on_false points-to set (tuple sources). PointsToSet& points_to_set = CreateCopiedPointsToSet(select, on_true); const PointsToSet& false_points_to_set = *FindOrDie(points_to_, on_false); - TF_RETURN_IF_ERROR(points_to_set.ForEachMutableElement( - [&](const ShapeIndex& index, bool /*is_leaf*/, - std::vector* buffers) { + points_to_set.ForEachMutableElement( + [&](const ShapeIndex& index, std::vector* buffers) { for (const LogicalBuffer* false_buffer : false_points_to_set.element(index)) { points_to_set.AddPointedToBuffer(*false_buffer, index); @@ -314,8 +326,7 @@ Status TuplePointsToAnalysis::HandleSelect(HloInstruction* select, for (HloInstruction* tuple : false_points_to_set.tuple_sources(index)) { points_to_set.add_tuple_source(index, tuple); } - return Status::OK(); - })); + }); // Select creates a new (top-level) buffer to store its result, so its // respective element in the points-to set should contain only itself. @@ -325,12 +336,6 @@ Status TuplePointsToAnalysis::HandleSelect(HloInstruction* select, return Status::OK(); } -Status TuplePointsToAnalysis::HandleFusion(HloInstruction* fusion) { - return ShapeUtil::IsTuple(fusion->shape()) - ? 
Unimplemented("HandleFusion with tuple output") - : DefaultAction(fusion); -} - const PointsToSet& TuplePointsToAnalysis::GetPointsToSet( const HloInstruction* hlo_instruction) const { return *FindOrDie(points_to_, hlo_instruction); @@ -344,7 +349,7 @@ PointsToSet& TuplePointsToAnalysis::CreateEmptyPointsToSet( } bool TuplePointsToAnalysis::InstructionDefinesBufferAtIndex( - HloInstruction* instruction, const ShapeIndex& index) const { + const HloInstruction* instruction, const ShapeIndex& index) const { const std::vector& buffers = GetPointsToSet(instruction).element(index); return (buffers.size() == 1 && buffers[0]->instruction() == instruction); @@ -407,28 +412,29 @@ TuplePointsToAnalysis::GetBuffersDefinedByInstruction( Status TuplePointsToAnalysis::GatherBuffersDefinedByInstruction( const HloInstruction* instruction, std::vector* buffers) { - return GetPointsToSet(instruction) - .ForEachElement([this, buffers, instruction]( - const ShapeIndex& index, bool /*is_leaf*/, - const std::vector& source_buffers) { - // Add buffers which 'instruction' is the source of. - CHECK(!source_buffers.empty()); - if (source_buffers.size() == 1 && - source_buffers[0]->instruction() == instruction) { - // If this instruction is the source of this buffer the - // indices must match. - DCHECK(source_buffers[0]->index() == index); - buffers->push_back(source_buffers[0]); - } else { - // If the points-to set includes more than one buffer then - // necessarily this instruction did not produce the - // buffer. - for (const LogicalBuffer* source_buffer : source_buffers) { - DCHECK(source_buffer->instruction() != instruction); - } - } - return Status::OK(); - }); + GetPointsToSet(instruction) + .ForEachElement( + [this, buffers, instruction]( + const ShapeIndex& index, + const std::vector& source_buffers) { + // Add buffers which 'instruction' is the source of. + CHECK(!source_buffers.empty()); + if (source_buffers.size() == 1 && + source_buffers[0]->instruction() == instruction) { + // If this instruction is the source of this buffer the + // indices must match. + DCHECK(source_buffers[0]->index() == index); + buffers->push_back(source_buffers[0]); + } else { + // If the points-to set includes more than one buffer then + // necessarily this instruction did not produce the + // buffer. + for (const LogicalBuffer* source_buffer : source_buffers) { + DCHECK(source_buffer->instruction() != instruction); + } + } + }); + return Status::OK(); } PointsToSet& TuplePointsToAnalysis::CreateCopiedPointsToSet( @@ -437,59 +443,67 @@ PointsToSet& TuplePointsToAnalysis::CreateCopiedPointsToSet( // from src PointsToSet. 
PointsToSet& dst_points_to_set = CreateEmptyPointsToSet(instruction); const PointsToSet& src_points_to_set = GetPointsToSet(src); - TF_CHECK_OK(dst_points_to_set.ForEachMutableElement( + dst_points_to_set.ForEachMutableElement( [this, &dst_points_to_set, &src_points_to_set]( - const ShapeIndex& index, bool /*is_leaf*/, - std::vector* buffers) { + const ShapeIndex& index, std::vector* buffers) { *buffers = src_points_to_set.element(index); for (auto& tuple_source : src_points_to_set.tuple_sources(index)) { dst_points_to_set.add_tuple_source(index, tuple_source); } - return Status::OK(); - })); + }); return *FindOrDie(points_to_, instruction); } string TuplePointsToAnalysis::ToString() const { string output = tensorflow::strings::Printf( "TuplePointsToSet for module %s:\n", module_->name().c_str()); - for (auto& computation : module_->computations()) { - tensorflow::strings::StrAppend(&output, "computation ", - computation->name().c_str(), ":\n"); + for (const auto& computation : module_->computations()) { + const char* entry = + computation.get() == module_->entry_computation() ? "entry " : ""; + tensorflow::strings::StrAppend(&output, entry, "computation ", + computation->name(), ":\n"); for (const HloInstruction* instruction : computation->MakeInstructionPostOrder()) { - tensorflow::strings::StrAppend(&output, " instruction ", - instruction->ToShortString(), ":\n"); - const PointsToSet& points_to_set = GetPointsToSet(instruction); - TF_CHECK_OK(points_to_set.ForEachElement( - [&output](const ShapeIndex& index, bool /*is_leaf*/, - const std::vector& points_to) { - tensorflow::strings::StrAppend( - &output, " {", tensorflow::str_util::Join(index, ","), "}: ", - tensorflow::str_util::Join( - points_to, ", ", - [](string* out, const LogicalBuffer* source) { - out->append(source->ToString()); - }), - "\n"); - return Status::OK(); - })); - } - for (auto& buffer : logical_buffers_) { - tensorflow::strings::StrAppend(&output, " buffer ", buffer->ToString(), - ":\n"); - for (const BufferAlias& buffer_alias : buffer_aliases_.at(buffer.get())) { - tensorflow::strings::StrAppend(&output, " alias ", - buffer_alias.ToString(), "\n"); + InstructionToString(instruction, &output); + if (instruction->opcode() == HloOpcode::kFusion) { + for (auto& fused : instruction->fused_instructions()) { + InstructionToString(fused.get(), &output); + } } } } tensorflow::strings::StrAppend(&output, "LogicalBuffers:\n"); - for (const auto& buffer : logical_buffers_) { - tensorflow::strings::StrAppend(&output, " ", buffer->ToString()); + for (auto& buffer : logical_buffers_) { + tensorflow::strings::StrAppend(&output, " buffer ", buffer->ToString(), + ":\n"); + for (const BufferAlias& buffer_alias : buffer_aliases_.at(buffer.get())) { + tensorflow::strings::StrAppend(&output, " alias ", + buffer_alias.ToString(), "\n"); + } } return output; } +void TuplePointsToAnalysis::InstructionToString( + const HloInstruction* instruction, string* output) const { + const string prefix = instruction->IsFused() ? 
" " : ""; + tensorflow::strings::StrAppend(output, prefix, " instruction ", + instruction->ToShortString(), ":\n"); + const PointsToSet& points_to_set = GetPointsToSet(instruction); + points_to_set.ForEachElement([&prefix, &output]( + const ShapeIndex& index, + const std::vector& + points_to) { + tensorflow::strings::StrAppend( + output, prefix, " {", tensorflow::str_util::Join(index, ","), "}: ", + tensorflow::str_util::Join( + points_to, ", ", + [](string* out, const LogicalBuffer* source) { + out->append(source->ToString()); + }), + "\n"); + }); +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h index 7a3eb772d6b..4d7fc7cbc9e 100644 --- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h +++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h @@ -34,6 +34,7 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/lib/gtl/flatmap.h" +#include "tensorflow/core/lib/gtl/flatset.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" @@ -65,7 +66,7 @@ class PointsToSet : public ShapeTree> { // Creates a set containing the union of all LogicalBuffers contained in the // PointsToSet. - std::set CreateFlattenedSet() const; + tensorflow::gtl::FlatSet CreateFlattenedSet() const; // Returns true if the given buffer is in the points-to set at the given // index. @@ -116,27 +117,21 @@ class PointsToSet : public ShapeTree> { // value. class BufferAlias { public: - BufferAlias(const LogicalBuffer& buffer, HloInstruction* instruction, - const ShapeIndex& index) - : buffer_(&buffer), instruction_(instruction), index_(index) {} - - // Return the logical buffer aliased at the instruction and index. - const LogicalBuffer& buffer() const { return *buffer_; } + BufferAlias(HloInstruction* instruction, const ShapeIndex& index) + : instruction_(instruction), index_(index) {} // Return the instruction/index of the subshape. HloInstruction* instruction() const { return instruction_; } const ShapeIndex& index() const { return index_; } bool operator==(const BufferAlias& other) const { - return buffer_ == other.buffer_ && instruction_ == other.instruction_ && - index_ == other.index_; + return instruction_ == other.instruction_ && index_ == other.index_; } bool operator!=(const BufferAlias& other) const { return !(*this == other); } string ToString() const; private: - const LogicalBuffer* buffer_; HloInstruction* instruction_; const ShapeIndex index_; }; @@ -147,6 +142,15 @@ std::ostream& operator<<(std::ostream& out, const BufferAlias& buffer_alias); // the potential sources of each buffer in each instruction's output. class TuplePointsToAnalysis : public DfsHloVisitorWithDefault { public: + using Colorer = std::function; + + // Runs points-to analysis on 'module' with the provided buffer color + // assigner. + static StatusOr> Run( + const HloModule* module, Colorer colorer); + + // Runs points-to analysis on 'module' with the default color assigner. static StatusOr> Run( const HloModule* module); @@ -185,7 +189,7 @@ class TuplePointsToAnalysis : public DfsHloVisitorWithDefault { const HloInstruction* instruction) const; // Returns true if the given instruction defines a buffer at the given index. 
- bool InstructionDefinesBufferAtIndex(HloInstruction* instruction, + bool InstructionDefinesBufferAtIndex(const HloInstruction* instruction, const ShapeIndex& index) const; // Returns an OK status if the given buffer is defined by instruction @@ -205,20 +209,34 @@ class TuplePointsToAnalysis : public DfsHloVisitorWithDefault { HloInstruction* operand) override; Status HandleBitcast(HloInstruction* bitcast) override; Status HandleCopy(HloInstruction* copy, HloInstruction* operand) override; - Status HandleFusion(HloInstruction* fusion) override; Status HandleSelect(HloInstruction* select, HloInstruction* pred, HloInstruction* on_true, HloInstruction* on_false) override; string ToString() const; + static Colorer DefaultColorer() { + return [](const HloInstruction* instruction, const ShapeIndex& index) { + return LogicalBuffer::Color(0); + }; + } + private: - explicit TuplePointsToAnalysis(const HloModule* module) : module_(module) {} + explicit TuplePointsToAnalysis(const HloModule* module, + Colorer colorer = DefaultColorer()) + : module_(module), colorer_(colorer) {} // Perform the analysis. Should be called immediately after constructing the // object and before calling GetPointsToSet. Status Analyze(); + // Populates instruction-defined buffers and aliases for each instruction + // in 'instructions'. The parameter 'instructions' is passed in a form + // common to how both HloComputation and fusion instructions maintain a + // list of instructions. + Status PopulateDefinedBuffersAndAliases( + const std::list<std::unique_ptr<HloInstruction>>& instructions); + // Create a new logical buffer and return a reference to it. The newly created // buffer is stored in an internal vector of LogicalBuffers and can be // accessed with GetBuffer. @@ -239,6 +257,10 @@ class TuplePointsToAnalysis : public DfsHloVisitorWithDefault { const HloInstruction* instruction, std::vector<const LogicalBuffer*>* buffers); + // Print points-to set for 'instruction' to 'output'. + void InstructionToString(const HloInstruction* instruction, + string* output) const; + // The module this analysis is performed on. const HloModule* module_; @@ -247,10 +269,11 @@ class TuplePointsToAnalysis : public DfsHloVisitorWithDefault { points_to_; // A map containing the LogicalBuffers defined by each HLO instruction. - std::unordered_map<const HloInstruction*, std::vector<const LogicalBuffer*>> + tensorflow::gtl::FlatMap<const HloInstruction*, std::vector<const LogicalBuffer*>> instruction_defined_buffers_; - std::unordered_map<const LogicalBuffer*, std::vector<BufferAlias>> + tensorflow::gtl::FlatMap<const LogicalBuffer*, std::vector<BufferAlias>> buffer_aliases_; // All logical buffers in the module, indexed by LogicalBuffer::Id. Keep as @@ -260,6 +283,9 @@ class TuplePointsToAnalysis : public DfsHloVisitorWithDefault { // The ID of the next logical buffer created. LogicalBuffer::Id next_buffer_id_ = 0; + // Used to color the created logical buffers. + Colorer colorer_; + TF_DISALLOW_COPY_AND_ASSIGN(TuplePointsToAnalysis); }; diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc index e4dd4d309e5..9909c11929d 100644 --- a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc +++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc @@ -19,24 +19,41 @@ limitations under the License.
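The `Colorer` hook declared in the header above lets callers partition logical buffers into color classes while the analysis runs. Below is a short sketch of the new two-argument `Run()` overload; it assumes only what `DefaultColorer()` shows, namely that `LogicalBuffer::Color` is constructible from an integer, and the particular coloring (top-level vs. nested buffers) is illustrative only.

```c++
// Sketch: give buffers defined at the top-level shape index color 0 and
// buffers defined inside tuples color 1. 'module' is a const HloModule* that
// is assumed to be in scope.
std::unique_ptr<xla::TuplePointsToAnalysis> analysis =
    xla::TuplePointsToAnalysis::Run(
        module,
        /*colorer=*/[](const xla::HloInstruction* instruction,
                       const xla::ShapeIndex& index) {
          return xla::LogicalBuffer::Color(index.empty() ? 0 : 1);
        })
        .ConsumeValueOrDie();
```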
#include #include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/hlo_matchers.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/instruction_fusion.h" #include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/test.h" +namespace op = xla::testing::opcode_matchers; + namespace xla { namespace { +using ::testing::UnorderedElementsAreArray; +using ::testing::UnorderedElementsAre; + class TuplePointsToAnalysisTest : public HloTestBase { protected: // Builds a module with the given entry computation and runs points to // analysis. void BuildModuleAndRunAnalysis(std::unique_ptr computation) { - module_.reset(new HloModule(TestName())); + BuildModule(std::move(computation)); + RunAnalysis(); + } + + void BuildModule(std::unique_ptr computation) { + module_ = CreateNewModule(); module_->AddEntryComputation(std::move(computation)); + } + + void RunAnalysis() { + CHECK_NOTNULL(module_.get()); points_to_analysis_ = TuplePointsToAnalysis::Run(module_.get()).ConsumeValueOrDie(); } @@ -59,7 +76,7 @@ class TuplePointsToAnalysisTest : public HloTestBase { const std::vector& points_to_set, tensorflow::gtl::ArraySlice buffers) { std::vector vec(buffers.begin(), buffers.end()); - EXPECT_MATCH(points_to_set, testing::UnorderedElementsAre(vec)); + EXPECT_THAT(points_to_set, UnorderedElementsAreArray(vec)); } // Checks that the given points-to set contains exactly (unordered) the @@ -76,7 +93,7 @@ class TuplePointsToAnalysisTest : public HloTestBase { // Overload which takes a std::set instead of a std::vector. void ExpectHasTopLevelBuffers( - const std::set& points_to_set, + const tensorflow::gtl::FlatSet& points_to_set, tensorflow::gtl::ArraySlice instructions) { ExpectHasTopLevelBuffers(std::vector( points_to_set.begin(), points_to_set.end()), @@ -94,22 +111,16 @@ class TuplePointsToAnalysisTest : public HloTestBase { .ValueOrDie(); std::vector expected_aliases; for (auto& pair : expected) { - expected_aliases.push_back(BufferAlias(*buffer, pair.first, pair.second)); + expected_aliases.push_back(BufferAlias(pair.first, pair.second)); } - EXPECT_MATCH(points_to_analysis_->GetBufferAliases(*buffer), - testing::UnorderedElementsAre(expected_aliases)); + EXPECT_THAT(points_to_analysis_->GetBufferAliases(*buffer), + UnorderedElementsAreArray(expected_aliases)); } std::unique_ptr module_; std::unique_ptr points_to_analysis_; }; -// Expect the given std::set as A contains exactly the given -// HloInstruction*s as __VA_ARGS__. -#define EXPECT_ISET(A, ...) 
\ - EXPECT_MATCH(testing::SetToVec(A), \ - testing::UnorderedMatcher(__VA_ARGS__)) - TEST_F(TuplePointsToAnalysisTest, SimpleTuple) { auto builder = HloComputation::Builder(TestName()); auto constant1 = builder.AddInstruction( @@ -135,8 +146,8 @@ TEST_F(TuplePointsToAnalysisTest, SimpleTuple) { EXPECT_EQ(3, points_to_analysis_->GetPointsToSet(tuple).size()); EXPECT_FALSE(points_to_analysis_->GetPointsToSet(tuple).IsAmbiguous()); - EXPECT_ISET(points_to_analysis_->GetPointsToSet(tuple).tuple_sources({}), - tuple); + EXPECT_THAT(points_to_analysis_->GetPointsToSet(tuple).tuple_sources({}), + UnorderedElementsAre(tuple)); ExpectHasTopLevelBuffers( points_to_analysis_->GetPointsToSet(tuple).CreateFlattenedSet(), @@ -194,9 +205,9 @@ TEST_F(TuplePointsToAnalysisTest, NestedTuple) { ExpectHasTopLevelBuffers( points_to_analysis_->GetPointsToSet(inner_tuple).element({}), {inner_tuple}); - EXPECT_ISET( + EXPECT_THAT( points_to_analysis_->GetPointsToSet(inner_tuple).tuple_sources({}), - inner_tuple); + UnorderedElementsAre(inner_tuple)); EXPECT_EQ(5, points_to_analysis_->GetPointsToSet(tuple).size()); EXPECT_FALSE(points_to_analysis_->GetPointsToSet(tuple).IsAmbiguous()); @@ -204,10 +215,10 @@ TEST_F(TuplePointsToAnalysisTest, NestedTuple) { points_to_analysis_->GetPointsToSet(tuple).CreateFlattenedSet(), {constant1, constant2, constant3, inner_tuple, tuple}); - EXPECT_ISET(points_to_analysis_->GetPointsToSet(tuple).tuple_sources({}), - tuple); - EXPECT_ISET(points_to_analysis_->GetPointsToSet(tuple).tuple_sources({0}), - inner_tuple); + EXPECT_THAT(points_to_analysis_->GetPointsToSet(tuple).tuple_sources({}), + UnorderedElementsAre(tuple)); + EXPECT_THAT(points_to_analysis_->GetPointsToSet(tuple).tuple_sources({0}), + UnorderedElementsAre(inner_tuple)); EXPECT_TRUE( points_to_analysis_->GetPointsToSet(tuple).tuple_sources({1}).empty()); @@ -251,7 +262,8 @@ TEST_F(TuplePointsToAnalysisTest, GetTupleElement) { {constant1, constant2, inner_tuple}); ExpectHasTopLevelBuffers(points_to_set.element({}), {inner_tuple}); - EXPECT_ISET(points_to_set.tuple_sources({}), inner_tuple); + EXPECT_THAT(points_to_set.tuple_sources({}), + UnorderedElementsAre(inner_tuple)); } TEST_F(TuplePointsToAnalysisTest, DuplicatedElement) { @@ -449,8 +461,10 @@ TEST_F(TuplePointsToAnalysisTest, NestedTupleSelect) { ExpectHasTopLevelBuffers(points_to_set.element({0, 1}), {constant2}); // Verify tuple sources. 
- EXPECT_ISET(points_to_set.tuple_sources({}), tuple1, tuple2); - EXPECT_ISET(points_to_set.tuple_sources({0}), inner_tuple1, inner_tuple2); + EXPECT_THAT(points_to_set.tuple_sources({}), + UnorderedElementsAre(tuple1, tuple2)); + EXPECT_THAT(points_to_set.tuple_sources({0}), + UnorderedElementsAre(inner_tuple1, inner_tuple2)); EXPECT_EQ(0, points_to_set.tuple_sources({0, 0}).size()); EXPECT_EQ(0, points_to_set.tuple_sources({0, 1}).size()); } @@ -478,8 +492,8 @@ TEST_F(TuplePointsToAnalysisTest, TupleWithBitcast) { EXPECT_EQ(3, points_to_analysis_->GetPointsToSet(tuple).size()); EXPECT_FALSE(points_to_analysis_->GetPointsToSet(tuple).IsAmbiguous()); - EXPECT_ISET(points_to_analysis_->GetPointsToSet(tuple).tuple_sources({}), - tuple); + EXPECT_THAT(points_to_analysis_->GetPointsToSet(tuple).tuple_sources({}), + UnorderedElementsAre(tuple)); ExpectHasTopLevelBuffers( points_to_analysis_->GetPointsToSet(tuple).CreateFlattenedSet(), @@ -540,5 +554,217 @@ TEST_F(TuplePointsToAnalysisTest, BufferAliases) { ExpectHasBufferAliases(tuple, /*index=*/{}, {{tuple, {}}}); } +class FusionPointsToAnalysisTest : public TuplePointsToAnalysisTest { + protected: + // Builds a computation, runs instruction fusion HloPass, runs points-to + // analysis, then checks for expected results (see unit test cases for + // example computation graphs). + void Run(const bool add_additional_gte0_user) { + Shape input_shape = ShapeUtil::MakeShape(F32, {8}); + Shape update_shape = ShapeUtil::MakeShape(F32, {3}); + Shape starts_shape = ShapeUtil::MakeShape(S32, {1}); + Shape tuple_shape = + ShapeUtil::MakeTupleShape({input_shape, update_shape, starts_shape}); + + auto builder = HloComputation::Builder(TestName()); + // Create tuple-shaped parameter. + auto tuple_param0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape, "param0")); + // Create 'tuple_element1' = GetTupleElement(tuple_param0, 1). + auto tuple_element1 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(update_shape, tuple_param0, 1)); + auto ones = builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR1({1.f, 1.f, 1.f, 1.f}))); + // Create 'update' = Add(GetTupleElement(tuple_param0, 1), ones) + auto update = builder.AddInstruction(HloInstruction::CreateBinary( + update_shape, HloOpcode::kAdd, tuple_element1, ones)); + // Create 'input' = GetTupleElement(tuple_param0, 0). + auto input = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(input_shape, tuple_param0, 0)); + + if (add_additional_gte0_user) { + // Create 'slice' as an additional user of 'input'. + auto slice = builder.AddInstruction( + HloInstruction::CreateSlice(update_shape, input, {0}, {3}, {1})); + // Modify 'update' to take 'slice' output. + update = builder.AddInstruction(HloInstruction::CreateBinary( + update_shape, HloOpcode::kAdd, update, slice)); + } + + // Create slice 'starts' = GetTupleElement(tuple_param0, 2). + auto starts = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(starts_shape, tuple_param0, 2)); + // Update 'input' with 'update' at dynamic 'starts' indices. + builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice( + input_shape, input, update, starts)); + + // Build computation and add it to module as entry computation. + BuildModule(builder.Build()); + // Run instruction fusion HloPass. + EXPECT_TRUE(InstructionFusion(InstructionFusion::IsExpensive) + .Run(module_.get()) + .ValueOrDie()); + // Get computation root instruction (should be a kFusion). 
+ auto* fusion = module_->entry_computation()->root_instruction(); + EXPECT_THAT(fusion, op::Fusion(tuple_param0)); + // Run points-to analysis (should include fused instructions from 'fusion'). + RunAnalysis(); + + // Check points-to set of fusion parameter associated with 'tuple_param0'. + auto* fusion_param = GetFusionParameterForOperand(fusion, tuple_param0); + ExpectHasBuffers( + points_to_analysis_->GetPointsToSet(fusion_param).element({}), + {GetBuffer(fusion_param, {})}); + ExpectHasBuffers( + points_to_analysis_->GetPointsToSet(fusion_param).element({0}), + {GetBuffer(fusion_param, {0})}); + ExpectHasBuffers( + points_to_analysis_->GetPointsToSet(fusion_param).element({1}), + {GetBuffer(fusion_param, {1})}); + ExpectHasBuffers( + points_to_analysis_->GetPointsToSet(fusion_param).element({2}), + {GetBuffer(fusion_param, {2})}); + + // Check that Gte at tuple_index = 0 points-to fusion_param({0}) + auto fused_gte0 = GetUniqueFusionParameterUserAt(fusion_param, 0); + ExpectHasBuffers( + points_to_analysis_->GetPointsToSet(fused_gte0).element({}), + {GetBuffer(fusion_param, {0})}); + // Check that Gte at tuple_index = 1 points-to fusion_param({1}) + auto fused_gte1 = GetUniqueFusionParameterUserAt(fusion_param, 1); + ExpectHasBuffers( + points_to_analysis_->GetPointsToSet(fused_gte1).element({}), + {GetBuffer(fusion_param, {1})}); + // Check that Gte at tuple_index = 2 points-to fusion_param({2}) + auto fused_gte2 = GetUniqueFusionParameterUserAt(fusion_param, 2); + ExpectHasBuffers( + points_to_analysis_->GetPointsToSet(fused_gte2).element({}), + {GetBuffer(fusion_param, {2})}); + + // Check buffer aliases of 'fusion_param' at shape index {0}. + ExpectHasBufferAliases(fusion_param, /*index=*/{0}, + {{fusion_param, {0}}, {fused_gte0, {}}}); + // Check buffer aliases of 'fusion_param' at shape index {1}. + ExpectHasBufferAliases(fusion_param, /*index=*/{1}, + {{fusion_param, {1}}, {fused_gte1, {}}}); + // Check buffer aliases of 'fusion_param' at shape index {2}. + ExpectHasBufferAliases(fusion_param, /*index=*/{2}, + {{fusion_param, {2}}, {fused_gte2, {}}}); + + // Check number of users of 'fusion_param' aliases at shape index {0}. + ExpectNumUsersOfAliases(fusion_param, {0}, + add_additional_gte0_user ? 2 : 1); + } + + // Returns fusion parameter (from 'fusion.fused_instructions') corresponding + // to fusion 'operand'. + HloInstruction* GetFusionParameterForOperand(HloInstruction* fusion, + HloInstruction* operand) { + auto it = std::find_if( + fusion->fused_instructions().begin(), + fusion->fused_instructions().end(), + [=](const std::unique_ptr<HloInstruction>& fused) { + return fused->opcode() == HloOpcode::kParameter && + fusion->operand(fused->parameter_number()) == operand; + }); + CHECK(it != fusion->fused_instructions().end()); + return (*it).get(); + } + + // Returns all users of 'fusion_param' at 'tuple_index'. + std::vector<HloInstruction*> GetFusionParameterUsersAt( + HloInstruction* fusion_param, int64 tuple_index) { + CHECK(ShapeUtil::IsTuple(fusion_param->shape())); + std::vector<HloInstruction*> users_at_tuple_index; + for (auto user : fusion_param->users()) { + CHECK_EQ(HloOpcode::kGetTupleElement, user->opcode()); + if (user->tuple_index() == tuple_index) { + users_at_tuple_index.push_back(user); + } + } + return users_at_tuple_index; + } + + // Returns the unique user of 'fusion_param' at 'tuple_index'.
+ HloInstruction* GetUniqueFusionParameterUserAt(HloInstruction* fusion_param, + int64 tuple_index) { + std::vector users = + GetFusionParameterUsersAt(fusion_param, tuple_index); + CHECK_EQ(1, users.size()); + return users[0]; + } + + // Checks that the count of all users of all aliases of 'instruction' at + // 'index' match 'expected_num_users'. + void ExpectNumUsersOfAliases(const HloInstruction* instruction, + const ShapeIndex& index, + const int64 expected_num_users) { + const auto* buffer = GetBuffer(instruction, index); + int64 num_users = 0; + for (const auto& alias : points_to_analysis_->GetBufferAliases(*buffer)) { + for (auto user : alias.instruction()->users()) { + if (user->opcode() == HloOpcode::kGetTupleElement && !index.empty()) { + // Gte instructions only access the top-level buffer of their operand. + continue; + } + ++num_users; + } + } + EXPECT_EQ(expected_num_users, num_users); + } +}; + +// Tests the points-to set of tuple-shaped fusion parameter 0 and all GTE users. +// Tests the alias set of tuple-shaped fusion parameter 0 at all shape indices. +// Tests that there is a single user of the aliases of tuple-shaped fusion +// parameter 0 at shape index {0}. +// +// Param0 Const +// \ / +// Fusion +// / \ +// FusionParam0 FusionParam1 +// / | \ | +// Gte(0) Gte(2) Gte(1) / +// \ | \ / +// \ | Add +// \ | / +// \0 |2 /1 +// DynamicUpdateSlice // fused root. +// +TEST_F(FusionPointsToAnalysisTest, FusionParam0OneUser) { + Run(/*add_additional_gte0_user=*/false); +} + +// Tests the points-to set of tuple-shaped fusion parameter 0 and all GTE users. +// Tests the alias set of tuple-shaped fusion parameter 0 at all shape indices. +// Tests that there are two users of the aliases of tuple-shaped fusion +// parameter 0 at shape index {0}. +// +// Param0 Const +// \ / +// Fusion +// / \ +// FusionParam0 FusionParam1 +// / | \ | +// Gte(2) Gte(0) Gte(1) / +// \ | \ / +// \ |\ Add +// \ | \ / +// | | Slice / +// | | \ / +// | | Add +// | | | +// |2 |0 |1 +// DynamicUpdateSlice // fused root. +// +TEST_F(FusionPointsToAnalysisTest, FusionParam0TwoUsers) { + Run(/*add_additional_gte0_user=*/true); +} + } // namespace } // namespace xla + +int main(int argc, char** argv) { + return xla::ParseDebugOptionsFlagsAndRunTests(argc, argv); +} diff --git a/tensorflow/compiler/xla/service/user_computation.cc b/tensorflow/compiler/xla/service/user_computation.cc index db0c1f0369a..4aba8875161 100644 --- a/tensorflow/compiler/xla/service/user_computation.cc +++ b/tensorflow/compiler/xla/service/user_computation.cc @@ -17,9 +17,12 @@ limitations under the License. 
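The fixture's `ExpectNumUsersOfAliases` helper above demonstrates the main payoff of running points-to analysis through fusion instructions: a buffer's aliases now include (instruction, index) pairs that live inside fused computations. A condensed sketch of that traversal follows; `GetBufferDefinedAt` is assumed from the analysis API (the fixture's `GetBuffer` helper wraps a similar lookup), and error handling is elided.

```c++
namespace xla {

// Sketch: count the users of every alias of the buffer that 'param' defines
// at 'index', including aliases inside fused computations.
int64 CountAliasUsers(const TuplePointsToAnalysis& analysis,
                      const HloInstruction* param, const ShapeIndex& index) {
  const LogicalBuffer* buffer =
      analysis.GetBufferDefinedAt(param, index).ValueOrDie();
  int64 num_users = 0;
  for (const BufferAlias& alias : analysis.GetBufferAliases(*buffer)) {
    // A BufferAlias is now just an (instruction, index) pair; it no longer
    // stores the aliased LogicalBuffer itself.
    num_users += alias.instruction()->users().size();
  }
  return num_users;
}

}  // namespace xla
```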
#include #include +#include +#include #include #include "tensorflow/compiler/xla/layout_util.h" +#include "tensorflow/compiler/xla/legacy_flags/user_computation_flags.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" @@ -50,6 +53,8 @@ HloOpcode UnaryOperationToHloOpcode(UnaryOperation unop) { return HloOpcode::kExp; case UNOP_FLOOR: return HloOpcode::kFloor; + case UNOP_IS_FINITE: + return HloOpcode::kIsFinite; case UNOP_LOG: return HloOpcode::kLog; case UNOP_LOGICAL_NOT: @@ -164,6 +169,9 @@ UserComputation::UserComputation(const string& name, : name_(name), next_handle_value_(1) { *session_computation_.mutable_computation_handle() = handle; session_computation_.set_name(name); + + VLOG(1) << "New UserComputation \"" << name + << "\", handle: " << handle.handle(); } ComputationDataHandle UserComputation::CreateComputationDataHandle() { @@ -198,15 +206,30 @@ StatusOr UserComputation::AddParameterInstruction( parameters_[parameter_number] = &request; + VLOG(1) << "AddParameterInstruction (" << GetVersionedHandleInternal() + << "), data handle " << handle.handle() << ": " + << parameter_request.ShortDebugString(); return handle; } Status UserComputation::AddSendInstruction(const SendRequest& send_request) { tensorflow::mutex_lock lock(mutex_); - *session_computation_.add_send_requests() = send_request; // Check if the operand of the instruction is valid. - TF_RETURN_IF_ERROR(LookupRequest(send_request.operand()).status()); + TF_RETURN_IF_ERROR(LookUpRequest(send_request.operand()).status()); + + // No handle is returned, but a handle must be assigned to this instruction + // for computation versioning. + ComputationDataHandle handle = CreateComputationDataHandle(); + OperationRequest& request = + (*session_computation_.mutable_requests())[handle.handle()]; + *request.mutable_output_handle() = handle; + *request.mutable_output_shape() = ShapeUtil::MakeNil(); + *request.mutable_request()->mutable_send_request() = send_request; + + VLOG(1) << "AddSendInstruction (" << GetVersionedHandleInternal() + << "), data handle " << handle.handle() << ": " + << send_request.ShortDebugString(); return Status::OK(); } @@ -223,6 +246,9 @@ StatusOr UserComputation::AddRecvInstruction( *request.mutable_output_shape() = shape; *request.mutable_request()->mutable_recv_request() = recv_request; + VLOG(1) << "AddRecvInstruction (" << GetVersionedHandleInternal() + << "), data handle " << handle.handle() << ": " + << recv_request.ShortDebugString(); return handle; } @@ -231,10 +257,10 @@ StatusOr UserComputation::AddPadInstruction( tensorflow::mutex_lock lock(mutex_); TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookupRequest(pad_request.operand())); + LookUpRequest(pad_request.operand())); TF_ASSIGN_OR_RETURN(const OperationRequest* padding_value, - LookupRequest(pad_request.padding_value())); + LookUpRequest(pad_request.padding_value())); TF_ASSIGN_OR_RETURN(Shape inferred_shape, ShapeInference::InferPadShape( operand->output_shape(), @@ -248,6 +274,9 @@ StatusOr UserComputation::AddPadInstruction( *request.mutable_output_shape() = inferred_shape; *request.mutable_request()->mutable_pad_request() = pad_request; + VLOG(1) << "AddPadInstruction (" << GetVersionedHandleInternal() + << "), data handle " << handle.handle() << ": " + << pad_request.ShortDebugString(); return handle; } @@ -267,6 +296,8 @@ StatusOr UserComputation::AddConstantInstruction( *request.mutable_output_shape() = 
validated_shape; *request.mutable_request()->mutable_constant_request() = constant_request; + VLOG(1) << "AddConstantInstruction (" << GetVersionedHandleInternal() + << "), data handle " << handle.handle(); return handle; } @@ -275,7 +306,7 @@ StatusOr UserComputation::AddGetTupleElementInstruction( tensorflow::mutex_lock lock(mutex_); TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookupRequest(get_tuple_element_request.operand())); + LookUpRequest(get_tuple_element_request.operand())); Shape element_shape = ShapeUtil::GetTupleElementShape( operand->output_shape(), get_tuple_element_request.index()); @@ -288,6 +319,9 @@ StatusOr UserComputation::AddGetTupleElementInstruction( *request.mutable_request()->mutable_get_tuple_element_request() = get_tuple_element_request; + VLOG(1) << "AddGetTupleElementInstruction (" << GetVersionedHandleInternal() + << "), data handle " << handle.handle() << ": " + << get_tuple_element_request.ShortDebugString(); return handle; } @@ -295,10 +329,18 @@ Status UserComputation::AddTraceInstruction(const TraceRequest& trace_request) { tensorflow::mutex_lock lock(mutex_); // Verify that the operand index is valid. - TF_RETURN_IF_ERROR(LookupRequest(trace_request.operand()).status()); + TF_RETURN_IF_ERROR(LookUpRequest(trace_request.operand()).status()); - *session_computation_.add_trace_requests() = trace_request; + ComputationDataHandle handle = CreateComputationDataHandle(); + OperationRequest& request = + (*session_computation_.mutable_requests())[handle.handle()]; + *request.mutable_output_handle() = handle; + *request.mutable_output_shape() = ShapeUtil::MakeNil(); + *request.mutable_request()->mutable_trace_request() = trace_request; + VLOG(1) << "AddTraceInstruction (" << GetVersionedHandleInternal() + << "), data handle " << handle.handle() << ": " + << trace_request.ShortDebugString(); return Status::OK(); } @@ -331,7 +373,7 @@ StatusOr UserComputation::AddRngInstruction( // Verify that the parameter indices are valid; for (const ComputationDataHandle& param : rng_request.parameter()) { - TF_RETURN_IF_ERROR(LookupRequest(param).status()); + TF_RETURN_IF_ERROR(LookUpRequest(param).status()); } const Shape& validated_shape = rng_request.shape(); TF_RETURN_IF_ERROR( @@ -345,6 +387,9 @@ StatusOr UserComputation::AddRngInstruction( *request.mutable_output_shape() = validated_shape; *request.mutable_request()->mutable_rng_request() = rng_request; + VLOG(1) << "AddRngInstruction (" << GetVersionedHandleInternal() + << "), data handle " << handle.handle() << ": " + << rng_request.ShortDebugString(); return handle; } @@ -355,7 +400,7 @@ StatusOr UserComputation::AddMapInstruction( std::vector operand_shapes; for (const ComputationDataHandle& handle : map_request.operands()) { - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, LookupRequest(handle)); + TF_ASSIGN_OR_RETURN(const OperationRequest* operand, LookUpRequest(handle)); operand_shapes.push_back(&operand->output_shape()); } @@ -377,6 +422,9 @@ StatusOr UserComputation::AddMapInstruction( request.add_embedded_computation_versions(to_apply_version); *request.mutable_request()->mutable_map_request() = map_request; + VLOG(1) << "AddMapInstruction (" << GetVersionedHandleInternal() + << "), data handle " << handle.handle() << ": " + << map_request.ShortDebugString(); return handle; } @@ -386,9 +434,9 @@ StatusOr UserComputation::AddReduceInstruction( tensorflow::mutex_lock lock(mutex_); TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookupRequest(reduce_request.operand())); + 
LookUpRequest(reduce_request.operand())); TF_ASSIGN_OR_RETURN(const OperationRequest* init_value, - LookupRequest(reduce_request.init_value())); + LookUpRequest(reduce_request.init_value())); VersionedComputationHandle::Version to_apply_version = to_apply_computation.version(); @@ -411,6 +459,9 @@ StatusOr UserComputation::AddReduceInstruction( request.add_embedded_computation_versions(to_apply_version); *request.mutable_request()->mutable_reduce_request() = reduce_request; + VLOG(1) << "AddReduceInstruction (" << GetVersionedHandleInternal() + << "), data handle " << handle.handle() << ": " + << reduce_request.ShortDebugString(); return handle; } @@ -420,9 +471,9 @@ StatusOr UserComputation::AddReduceWindowInstruction( tensorflow::mutex_lock lock(mutex_); TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookupRequest(reduce_window_request.operand())); + LookUpRequest(reduce_window_request.operand())); TF_ASSIGN_OR_RETURN(const OperationRequest* init_value, - LookupRequest(reduce_window_request.init_value())); + LookUpRequest(reduce_window_request.init_value())); VersionedComputationHandle::Version to_apply_version = to_apply_computation.version(); @@ -446,6 +497,9 @@ StatusOr UserComputation::AddReduceWindowInstruction( *request.mutable_request()->mutable_reduce_window_request() = reduce_window_request; + VLOG(1) << "AddReduceWindowInstruction (" << GetVersionedHandleInternal() + << "), data handle " << handle.handle() << ": " + << reduce_window_request.ShortDebugString(); return handle; } @@ -456,11 +510,11 @@ StatusOr UserComputation::AddSelectAndScatterInstruction( tensorflow::mutex_lock lock(mutex_); TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookupRequest(select_and_scatter_request.operand())); + LookUpRequest(select_and_scatter_request.operand())); TF_ASSIGN_OR_RETURN(const OperationRequest* source, - LookupRequest(select_and_scatter_request.source())); + LookUpRequest(select_and_scatter_request.source())); TF_ASSIGN_OR_RETURN(const OperationRequest* init_value, - LookupRequest(select_and_scatter_request.init_value())); + LookUpRequest(select_and_scatter_request.init_value())); VersionedComputationHandle::Version select_version = select_computation.version(); @@ -489,6 +543,9 @@ StatusOr UserComputation::AddSelectAndScatterInstruction( *request.mutable_request()->mutable_select_and_scatter_request() = select_and_scatter_request; + VLOG(1) << "AddSelectAndScatterInstruction (" << GetVersionedHandleInternal() + << "), data handle " << handle.handle() << ": " + << select_and_scatter_request.ShortDebugString(); return handle; } @@ -497,7 +554,7 @@ StatusOr UserComputation::AddReverseInstruction( tensorflow::mutex_lock lock(mutex_); TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookupRequest(reverse_request.operand())); + LookUpRequest(reverse_request.operand())); TF_ASSIGN_OR_RETURN( Shape inferred_shape, ShapeInference::InferReverseShape( @@ -509,6 +566,9 @@ StatusOr UserComputation::AddReverseInstruction( *request.mutable_output_handle() = handle; *request.mutable_output_shape() = inferred_shape; *request.mutable_request()->mutable_reverse_request() = reverse_request; + VLOG(1) << "AddReverseInstruction (" << GetVersionedHandleInternal() + << "), data handle " << handle.handle() << ": " + << reverse_request.ShortDebugString(); return handle; } @@ -519,7 +579,7 @@ StatusOr UserComputation::AddWhileInstruction( tensorflow::mutex_lock lock(mutex_); TF_ASSIGN_OR_RETURN(const OperationRequest* init, - LookupRequest(while_request.init())); + 
LookUpRequest(while_request.init())); VersionedComputationHandle::Version condition_version = condition_computation.version(); @@ -546,6 +606,9 @@ StatusOr UserComputation::AddWhileInstruction( request.add_embedded_computation_versions(body_version); *request.mutable_request()->mutable_while_request() = while_request; + VLOG(1) << "AddWhileInstruction (" << GetVersionedHandleInternal() + << "), data handle " << handle.handle() << ": " + << while_request.ShortDebugString(); return handle; } @@ -555,7 +618,7 @@ StatusOr UserComputation::AddBroadcastInstruction( // Fetches and validates the operand. TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookupRequest(broadcast_request.operand())); + LookUpRequest(broadcast_request.operand())); TF_ASSIGN_OR_RETURN(Shape inferred_shape, ShapeInference::InferBroadcastShape( operand->output_shape(), @@ -567,6 +630,10 @@ StatusOr UserComputation::AddBroadcastInstruction( *request.mutable_output_handle() = handle; *request.mutable_output_shape() = inferred_shape; *request.mutable_request()->mutable_broadcast_request() = broadcast_request; + + VLOG(1) << "AddBroadcastInstruction (" << GetVersionedHandleInternal() + << "), data handle " << handle.handle() << ": " + << broadcast_request.ShortDebugString(); return handle; } @@ -576,7 +643,7 @@ StatusOr UserComputation::AddReshapeInstruction( // Fetches and validates the operand. TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookupRequest(reshape_request.operand())); + LookUpRequest(reshape_request.operand())); TF_ASSIGN_OR_RETURN( Shape inferred_shape, @@ -592,6 +659,36 @@ StatusOr UserComputation::AddReshapeInstruction( *request.mutable_output_shape() = inferred_shape; *request.mutable_request()->mutable_reshape_request() = reshape_request; + VLOG(1) << "AddReshapeInstruction (" << GetVersionedHandleInternal() + << "), data handle " << handle.handle() << ": " + << reshape_request.ShortDebugString(); + return handle; +} + +StatusOr UserComputation::AddTransposeInstruction( + const TransposeRequest& transpose_request) { + tensorflow::mutex_lock lock(mutex_); + + // Fetches and validates the operand. 
+ TF_ASSIGN_OR_RETURN(const OperationRequest* operand, + LookUpRequest(transpose_request.operand())); + + TF_ASSIGN_OR_RETURN(Shape inferred_shape, + ShapeInference::InferTransposeShape( + operand->output_shape(), + AsInt64Slice(transpose_request.dimensions()))); + + ComputationDataHandle handle = CreateComputationDataHandle(); + + OperationRequest& request = + (*session_computation_.mutable_requests())[handle.handle()]; + *request.mutable_output_handle() = handle; + *request.mutable_output_shape() = inferred_shape; + *request.mutable_request()->mutable_transpose_request() = transpose_request; + + VLOG(1) << "AddTransposeInstruction (" << GetVersionedHandleInternal() + << "), data handle " << handle.handle() << ": " + << transpose_request.ShortDebugString(); return handle; } @@ -600,13 +697,14 @@ StatusOr UserComputation::AddSliceInstruction( tensorflow::mutex_lock lock(mutex_); TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookupRequest(slice_request.operand())); + LookUpRequest(slice_request.operand())); TF_ASSIGN_OR_RETURN( Shape new_shape, ShapeInference::InferSliceShape( operand->output_shape(), AsInt64Slice(slice_request.start_indices()), - AsInt64Slice(slice_request.limit_indices()))); + AsInt64Slice(slice_request.limit_indices()), + AsInt64Slice(slice_request.stride()))); ComputationDataHandle handle = CreateComputationDataHandle(); @@ -616,6 +714,9 @@ StatusOr UserComputation::AddSliceInstruction( *request.mutable_output_shape() = new_shape; *request.mutable_request()->mutable_slice_request() = slice_request; + VLOG(1) << "AddSliceInstruction (" << GetVersionedHandleInternal() + << "), data handle " << handle.handle() << ": " + << slice_request.ShortDebugString(); return handle; } @@ -624,10 +725,10 @@ StatusOr UserComputation::AddDynamicSliceInstruction( tensorflow::mutex_lock lock(mutex_); TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookupRequest(dynamic_slice_request.operand())); + LookUpRequest(dynamic_slice_request.operand())); TF_ASSIGN_OR_RETURN(const OperationRequest* start_indices, - LookupRequest(dynamic_slice_request.start_indices())); + LookUpRequest(dynamic_slice_request.start_indices())); TF_ASSIGN_OR_RETURN( Shape new_shape, @@ -644,6 +745,9 @@ StatusOr UserComputation::AddDynamicSliceInstruction( *request.mutable_request()->mutable_dynamic_slice_request() = dynamic_slice_request; + VLOG(1) << "AddDynamicSliceInstruction (" << GetVersionedHandleInternal() + << "), data handle " << handle.handle() << ": " + << dynamic_slice_request.ShortDebugString(); return handle; } @@ -653,14 +757,14 @@ UserComputation::AddDynamicUpdateSliceInstruction( tensorflow::mutex_lock lock(mutex_); TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookupRequest(dynamic_update_slice_request.operand())); + LookUpRequest(dynamic_update_slice_request.operand())); TF_ASSIGN_OR_RETURN(const OperationRequest* update, - LookupRequest(dynamic_update_slice_request.update())); + LookUpRequest(dynamic_update_slice_request.update())); TF_ASSIGN_OR_RETURN( const OperationRequest* start_indices, - LookupRequest(dynamic_update_slice_request.start_indices())); + LookUpRequest(dynamic_update_slice_request.start_indices())); TF_ASSIGN_OR_RETURN(Shape new_shape, ShapeInference::InferDynamicUpdateSliceShape( @@ -676,6 +780,10 @@ UserComputation::AddDynamicUpdateSliceInstruction( *request.mutable_request()->mutable_dynamic_update_slice_request() = dynamic_update_slice_request; + VLOG(1) << "AddDynamicUpdateSliceInstruction (" + << GetVersionedHandleInternal() << "), data handle " + << 
handle.handle() << ": " + << dynamic_update_slice_request.ShortDebugString(); return handle; } @@ -685,7 +793,7 @@ StatusOr UserComputation::AddConcatenateInstruction( std::vector operand_shapes; for (const ComputationDataHandle& handle : concatenate_request.operands()) { - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, LookupRequest(handle)); + TF_ASSIGN_OR_RETURN(const OperationRequest* operand, LookUpRequest(handle)); operand_shapes.push_back(&operand->output_shape()); } @@ -702,6 +810,9 @@ StatusOr UserComputation::AddConcatenateInstruction( *request.mutable_request()->mutable_concatenate_request() = concatenate_request; + VLOG(1) << "AddConcatenateInstruction (" << GetVersionedHandleInternal() + << "), data handle " << handle.handle() << ": " + << concatenate_request.ShortDebugString(); return handle; } @@ -710,7 +821,7 @@ StatusOr UserComputation::AddConvertInstruction( tensorflow::mutex_lock lock(mutex_); TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookupRequest(convert_request.operand())); + LookUpRequest(convert_request.operand())); TF_ASSIGN_OR_RETURN(Shape new_shape, ShapeInference::InferConvertShape( operand->output_shape(), @@ -724,6 +835,9 @@ StatusOr UserComputation::AddConvertInstruction( *request.mutable_output_shape() = new_shape; *request.mutable_request()->mutable_convert_request() = convert_request; + VLOG(1) << "AddConvertInstruction (" << GetVersionedHandleInternal() + << "), data handle " << handle.handle() << ": " + << convert_request.ShortDebugString(); return handle; } @@ -732,9 +846,9 @@ StatusOr UserComputation::AddConvolveInstruction( tensorflow::mutex_lock lock(mutex_); TF_ASSIGN_OR_RETURN(const OperationRequest* lhs, - LookupRequest(convolve_request.lhs())); + LookUpRequest(convolve_request.lhs())); TF_ASSIGN_OR_RETURN(const OperationRequest* rhs, - LookupRequest(convolve_request.rhs())); + LookUpRequest(convolve_request.rhs())); TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferConvolveShape( lhs->output_shape(), rhs->output_shape(), convolve_request.window(), @@ -748,6 +862,9 @@ StatusOr UserComputation::AddConvolveInstruction( *request.mutable_output_shape() = shape; *request.mutable_request()->mutable_convolve_request() = convolve_request; + VLOG(1) << "AddConvolveInstruction (" << GetVersionedHandleInternal() + << "), data handle " << handle.handle() << ": " + << convolve_request.ShortDebugString(); return handle; } @@ -756,7 +873,7 @@ StatusOr UserComputation::AddCrossReplicaSumInstruction( tensorflow::mutex_lock lock(mutex_); TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookupRequest(cross_replica_sum_request.operand())); + LookUpRequest(cross_replica_sum_request.operand())); TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferCrossReplicaSumShape( operand->output_shape())); @@ -769,6 +886,9 @@ StatusOr UserComputation::AddCrossReplicaSumInstruction( *request.mutable_request()->mutable_cross_replica_sum_request() = cross_replica_sum_request; + VLOG(1) << "AddCrossreplicaSumInstruction (" << GetVersionedHandleInternal() + << "), data handle " << handle.handle() << ": " + << cross_replica_sum_request.ShortDebugString(); return handle; } @@ -792,6 +912,9 @@ StatusOr UserComputation::AddInfeedInstruction( *request.mutable_output_shape() = shape; *request.mutable_request()->mutable_infeed_request() = infeed_request; + VLOG(1) << "AddInfeedInstruction (" << GetVersionedHandleInternal() + << "), data handle " << handle.handle() << ": " + << infeed_request.ShortDebugString(); return handle; } @@ -799,9 +922,29 @@ Status 
UserComputation::AddOutfeedInstruction( const OutfeedRequest& outfeed_request) { tensorflow::mutex_lock lock(mutex_); - *session_computation_.add_outfeed_requests() = outfeed_request; + const Shape& shape = outfeed_request.shape(); + if (ShapeUtil::IsNestedTuple(shape)) { + return InvalidArgument("Outfeed does not support nested tuple shapes"); + } + if (!LayoutUtil::HasLayout(shape)) { + return InvalidArgument("Given shape to Outfeed must have a layout"); + } + // Verify that operand is valid. - TF_RETURN_IF_ERROR(LookupRequest(outfeed_request.operand()).status()); + TF_RETURN_IF_ERROR(LookUpRequest(outfeed_request.operand()).status()); + + // No handle is returned, but a handle must be assigned to this instruction + // for computation versioning. + ComputationDataHandle handle = CreateComputationDataHandle(); + OperationRequest& request = + (*session_computation_.mutable_requests())[handle.handle()]; + *request.mutable_output_handle() = handle; + *request.mutable_output_shape() = shape; + *request.mutable_request()->mutable_outfeed_request() = outfeed_request; + + VLOG(1) << "AddOutfeedInstruction (" << GetVersionedHandleInternal() + << "), data handle " << handle.handle() << ": " + << outfeed_request.ShortDebugString(); return Status::OK(); } @@ -812,7 +955,7 @@ StatusOr UserComputation::AddCallInstruction( std::vector operand_shapes; for (const ComputationDataHandle& handle : call_request.operands()) { - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, LookupRequest(handle)); + TF_ASSIGN_OR_RETURN(const OperationRequest* operand, LookUpRequest(handle)); operand_shapes.push_back(&operand->output_shape()); } @@ -834,6 +977,9 @@ StatusOr UserComputation::AddCallInstruction( request.add_embedded_computation_versions(to_apply_version); *request.mutable_request()->mutable_call_request() = call_request; + VLOG(1) << "AddCallInstruction (" << GetVersionedHandleInternal() + << "), data handle " << handle.handle() << ": " + << call_request.ShortDebugString(); return handle; } @@ -842,7 +988,7 @@ StatusOr UserComputation::AddCustomCallInstruction( tensorflow::mutex_lock lock(mutex_); for (const ComputationDataHandle& handle : custom_call_request.operands()) { - TF_RETURN_IF_ERROR(LookupRequest(handle).status()); + TF_RETURN_IF_ERROR(LookUpRequest(handle).status()); } const ComputationDataHandle handle = CreateComputationDataHandle(); @@ -854,6 +1000,9 @@ StatusOr UserComputation::AddCustomCallInstruction( *request.mutable_request()->mutable_custom_call_request() = custom_call_request; + VLOG(1) << "AddCustomCallInstruction (" << GetVersionedHandleInternal() + << "), data handle " << handle.handle() << ": " + << custom_call_request.ShortDebugString(); return handle; } @@ -862,7 +1011,7 @@ StatusOr UserComputation::AddUnaryInstruction( tensorflow::mutex_lock lock(mutex_); TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookupRequest(unary_request.operand())); + LookUpRequest(unary_request.operand())); TF_ASSIGN_OR_RETURN( Shape shape, ShapeInference::InferUnaryOpShape(unary_request.unop(), operand->output_shape())); @@ -875,6 +1024,9 @@ StatusOr UserComputation::AddUnaryInstruction( *request.mutable_output_shape() = shape; *request.mutable_request()->mutable_unary_op_request() = unary_request; + VLOG(1) << "AddUnaryInstruction (" << GetVersionedHandleInternal() + << "), data handle " << handle.handle() << ": " + << unary_request.ShortDebugString(); return handle; } @@ -883,9 +1035,9 @@ StatusOr UserComputation::AddBinaryInstruction( tensorflow::mutex_lock lock(mutex_); 
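The AddOutfeedInstruction hunk above adds two structural checks before the request is recorded. A minimal sketch of that validation, assuming a toy Shape type in place of xla::Shape and ShapeUtil (the field names here are illustrative, not the real API):

```c++
#include <iostream>
#include <string>
#include <vector>

// Toy stand-in for xla::Shape (illustrative only).
struct Shape {
  bool is_tuple = false;
  bool has_layout = false;
  std::vector<Shape> tuple_shapes;
};

// Mirrors the ShapeUtil::IsNestedTuple condition: a tuple containing a tuple.
bool IsNestedTuple(const Shape& shape) {
  if (!shape.is_tuple) return false;
  for (const Shape& element : shape.tuple_shapes) {
    if (element.is_tuple) return true;
  }
  return false;
}

// Returns an error message, or "" if the shape is acceptable for outfeed.
std::string ValidateOutfeedShape(const Shape& shape) {
  if (IsNestedTuple(shape)) {
    return "Outfeed does not support nested tuple shapes";
  }
  if (!shape.has_layout) {
    return "Given shape to Outfeed must have a layout";
  }
  return "";
}

int main() {
  Shape scalar;  // not a tuple, but carries no layout either
  Shape tuple_of_tuples{true, true, {Shape{true, true, {}}}};
  std::cout << ValidateOutfeedShape(scalar) << "\n";           // layout error
  std::cout << ValidateOutfeedShape(tuple_of_tuples) << "\n";  // nesting error
}
```

The real method reports these conditions as InvalidArgument statuses and, because outfeed returns no value to the client, still allocates a data handle purely so the instruction participates in computation versioning.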
   TF_ASSIGN_OR_RETURN(const OperationRequest* lhs,
-                      LookupRequest(binary_request.lhs()));
+                      LookUpRequest(binary_request.lhs()));
   TF_ASSIGN_OR_RETURN(const OperationRequest* rhs,
-                      LookupRequest(binary_request.rhs()));
+                      LookUpRequest(binary_request.rhs()));
   TF_ASSIGN_OR_RETURN(
       Shape shape,
       ShapeInference::InferBinaryOpShape(
@@ -900,6 +1052,9 @@ StatusOr<ComputationDataHandle> UserComputation::AddBinaryInstruction(
   *request.mutable_output_shape() = shape;
   *request.mutable_request()->mutable_binary_op_request() = binary_request;
 
+  VLOG(1) << "AddBinaryInstruction (" << GetVersionedHandleInternal()
+          << "), data handle " << handle.handle() << ": "
+          << binary_request.ShortDebugString();
   return handle;
 }
 
@@ -908,11 +1063,11 @@ StatusOr<ComputationDataHandle> UserComputation::AddTernaryInstruction(
   tensorflow::mutex_lock lock(mutex_);
 
   TF_ASSIGN_OR_RETURN(const OperationRequest* lhs,
-                      LookupRequest(ternary_request.lhs()));
+                      LookUpRequest(ternary_request.lhs()));
   TF_ASSIGN_OR_RETURN(const OperationRequest* rhs,
-                      LookupRequest(ternary_request.rhs()));
+                      LookUpRequest(ternary_request.rhs()));
   TF_ASSIGN_OR_RETURN(const OperationRequest* ehs,
-                      LookupRequest(ternary_request.ehs()));
+                      LookUpRequest(ternary_request.ehs()));
   TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferTernaryOpShape(
                                        ternary_request.triop(), lhs->output_shape(),
@@ -926,6 +1081,9 @@ StatusOr<ComputationDataHandle> UserComputation::AddTernaryInstruction(
   *request.mutable_output_shape() = shape;
   *request.mutable_request()->mutable_ternary_op_request() = ternary_request;
 
+  VLOG(1) << "AddTernaryInstruction (" << GetVersionedHandleInternal()
+          << "), data handle " << handle.handle() << ": "
+          << ternary_request.ShortDebugString();
   return handle;
 }
 
@@ -935,7 +1093,7 @@ StatusOr<ComputationDataHandle> UserComputation::AddVariadicInstruction(
 
   std::vector<const Shape*> operand_shapes;
   for (const ComputationDataHandle& handle : variadic_request.operands()) {
-    TF_ASSIGN_OR_RETURN(const OperationRequest* operand, LookupRequest(handle));
+    TF_ASSIGN_OR_RETURN(const OperationRequest* operand, LookUpRequest(handle));
     operand_shapes.push_back(&operand->output_shape());
   }
 
@@ -951,16 +1109,35 @@ StatusOr<ComputationDataHandle> UserComputation::AddVariadicInstruction(
   *request.mutable_output_shape() = shape;
   *request.mutable_request()->mutable_variadic_op_request() = variadic_request;
 
+  VLOG(1) << "AddVariadicInstruction (" << GetVersionedHandleInternal()
+          << "), data handle " << handle.handle() << ": "
+          << variadic_request.ShortDebugString();
   return handle;
 }
 
 StatusOr<Shape> UserComputation::GetShape(const ComputationDataHandle& handle) {
   tensorflow::mutex_lock lock(mutex_);
 
-  TF_ASSIGN_OR_RETURN(const OperationRequest* operand, LookupRequest(handle));
+  TF_ASSIGN_OR_RETURN(const OperationRequest* operand, LookUpRequest(handle));
   return operand->output_shape();
 }
 
+Status UserComputation::SetOpMetadata(const ComputationDataHandle& handle,
+                                      const OpMetadata& metadata) {
+  tensorflow::mutex_lock lock(mutex_);
+
+  int64 handle_value = handle.handle();
+  if (session_computation_.requests().count(handle_value) == 0) {
+    return InvalidArgument("Invalid handle in SetOpMetadata (%lld)",
+                           handle_value);
+  }
+  *session_computation_.mutable_requests()
+       ->at(handle_value)
+       .mutable_request()
+       ->mutable_metadata() = metadata;
+  return Status::OK();
+}
+
 Status UserComputation::SetReturnValue(const ComputationDataHandle& handle) {
   tensorflow::mutex_lock lock(mutex_);
 
@@ -970,12 +1147,18 @@ Status UserComputation::SetReturnValue(const ComputationDataHandle& handle) {
 
   handle_to_return_ = handle;
 
+  VLOG(1) << "SetReturnValue of computation \"" << name() << "\" fixed to "
+          << GetVersionedHandleInternal();
+
   return Status::OK();
 }
 
 VersionedComputationHandle UserComputation::GetVersionedHandle() const {
   tensorflow::mutex_lock lock(mutex_);
+  return GetVersionedHandleInternal();
+}
+
+VersionedComputationHandle UserComputation::GetVersionedHandleInternal()
+    const {
   VersionedComputationHandle versioned_handle;
   versioned_handle.handle = session_computation_.computation_handle();
 
@@ -1008,12 +1191,62 @@ VersionedComputationHandle::Version UserComputation::version() const {
   return GetVersionedHandle().version;
 }
 
+namespace {
+
+// Returns true if the operation type corresponding to the given opcase can be
+// the root of the computation.
+bool CanBeRoot(const OpRequest::OpCase& op_case) {
+  switch (op_case) {
+    case OpRequest::kTraceRequest:
+    case OpRequest::kSendRequest:
+    case OpRequest::kOutfeedRequest:
+      return false;
+    default:
+      return true;
+  }
+}
+
+// Returns a pointer to the operation with the given data handle value in the
+// given SessionComputation.
+StatusOr<const OperationRequest*> LookUpRequest(
+    int64 handle_value, const SessionComputation& session_computation) {
+  if (session_computation.requests().count(handle_value) == 0) {
+    return InvalidArgument("no ComputationDataHandle value %lld", handle_value);
+  }
+  return &session_computation.requests().at(handle_value);
+}
+
+// Returns the OperationRequest corresponding to the root (result) of the
+// session computation.
+StatusOr<const OperationRequest*> GetRoot(
+    VersionedComputationHandle::Version version,
+    const SessionComputation& session_computation) {
+  TF_RET_CHECK(version > 0);
+  // Not all instructions can be roots. Walk backwards from the operation
+  // indicated by this version until a valid root is found.
+  const OperationRequest* root_request = nullptr;
+  while (version > 0) {
+    TF_ASSIGN_OR_RETURN(root_request,
                        LookUpRequest(version, session_computation));
+    if (CanBeRoot(root_request->request().op_case())) {
+      break;
+    }
+    version--;
+  }
+  if (version == 0) {
+    return InternalError("Computation contains no root operation");
+  }
+  return root_request;
+}
+
+}  // namespace
+
 StatusOr<std::shared_ptr<const ProgramShape>>
 UserComputation::ComputeProgramShape(
     VersionedComputationHandle::Version version) const {
   tensorflow::mutex_lock lock(mutex_);
 
-  CHECK(version > 0 && version < next_handle_value_);
+  TF_RET_CHECK(version > 0 && version < next_handle_value_);
 
   if (program_shape_ == nullptr || program_shape_version_ != version) {
     // ProgramShape has not been computed yet, or is for different
@@ -1042,7 +1275,9 @@ UserComputation::ComputeProgramShape(
     }
 
     // The root determines the output shape.
-    *program_shape->mutable_result() = GetRoot(version).output_shape();
+    TF_ASSIGN_OR_RETURN(const OperationRequest* root_request,
+                        GetRoot(version, session_computation_));
+    *program_shape->mutable_result() = root_request->output_shape();
     if (ShapeUtil::IsOpaque(program_shape->result())) {
       return Unimplemented("Computation results cannot be opaque");
     }
@@ -1279,6 +1514,7 @@ void ConstantVisitor(const SessionComputation& session_computation,
                      is_constant);
       // TODO(b/32495713): We aren't checking the condition and body
      // computations themselves.
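+      // A while loop is therefore conservatively marked non-constant even if
+      // its init operand is constant, since the condition and body
+      // computations are not analyzed (see the TODO above).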
+ *is_constant = false; break; } @@ -1294,6 +1530,14 @@ void ConstantVisitor(const SessionComputation& session_computation, break; } + case OpRequest::kTransposeRequest: { + const TransposeRequest& transpose_request = + request.request().transpose_request(); + ConstantVisitor(session_computation, transpose_request.operand(), visited, + is_constant); + break; + } + case OpRequest::kVariadicOpRequest: { const VariadicOpRequest& variadic_op_request = request.request().variadic_op_request(); @@ -1338,7 +1582,7 @@ StatusOr UserComputation::IsConstant( tensorflow::mutex_lock lock(mutex_); // Verify that the handle is valid. - auto operation_status = LookupRequest(handle); + auto operation_status = LookUpRequest(handle); if (!operation_status.ok()) { return operation_status.status(); } @@ -1350,17 +1594,18 @@ StatusOr UserComputation::IsConstant( return is_constant; } -const OperationRequest& UserComputation::GetRoot( - VersionedComputationHandle::Version version) const { - CHECK(version > 0 && version < next_handle_value_); - return session_computation_.requests().at(version); -} - std::vector UserComputation::GetEmbeddedComputations( VersionedComputationHandle::Version version) const { tensorflow::mutex_lock lock(mutex_); + VLOG(1) + << "GetEmbeddedComputations(" << name() << " " + << VersionedComputationHandle{session_computation_.computation_handle(), + version} + << ")"; + XLA_VLOG_LINES(3, session_computation_.DebugString()); + std::vector computations; for (const auto& handle_request : session_computation_.requests()) { int64 handle_value = handle_request.first; @@ -1442,6 +1687,12 @@ UserComputation::GetEmbeddedComputations( } } } + VLOG(2) << "Embedded computations: " + << tensorflow::str_util::Join( + computations, ", ", + [](string* out, const VersionedComputationHandle& h) { + out->append(h.ToString()); + }); return computations; } @@ -1543,7 +1794,7 @@ SessionComputation UserComputation::CloneSessionComputation( return result; } -StatusOr UserComputation::LookupRequest( +StatusOr UserComputation::LookUpRequest( const ComputationDataHandle& handle) const { int64 handle_value = handle.handle(); if (session_computation_.requests().count(handle_value) == 0) { @@ -1594,15 +1845,15 @@ namespace { // DFS order lowering each OperationRequest to an HLO instruction. class ComputationLowerer { public: - static std::unique_ptr Lower( + static StatusOr> Lower( const string& computation_name, const SessionComputation& session_computation, VersionedComputationHandle::Version version, UserComputation::HloComputationResolver hlo_resolver, - bool include_unused_parameters) { + bool include_unreachable_instructions) { ComputationLowerer lowerer(computation_name, session_computation, version, std::move(hlo_resolver)); - return lowerer.Lower(include_unused_parameters); + return lowerer.Lower(include_unreachable_instructions); } private: @@ -1617,13 +1868,20 @@ class ComputationLowerer { // Build an HLO computation from the SessionComputation at the given // version. - std::unique_ptr Lower(bool include_unused_parameters); + StatusOr> Lower( + bool include_unreachable_instructions); private: + // Traverses the computation 'root' using a DFS, calling 'visit' in postorder. + void TraversePostorder( + const ComputationDataHandle& root, + std::unordered_map* visited, + const std::function& visit); + // DFS visitor of the UserComputation operations which lowers the operations // to HLO instructions. 
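+  // Assumes the operands of 'handle' are already present in 'instructions'
+  // (the postorder traversal guarantees this), so each call lowers exactly
+  // one operation and records the result under the handle's value.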
- HloInstruction* Visit(const ComputationDataHandle& handle, - std::map* visited); + void Visit(const ComputationDataHandle& handle, + std::unordered_map* instructions); // Resolves a ComputationHandle and Version to a previously lowered // HloComputation using the hlo_resolver_ function. @@ -1631,70 +1889,319 @@ class ComputationLowerer { const ComputationHandle& handle, VersionedComputationHandle::Version version); + // This function takes an input value which is being implicitly broadcast into + // an output shape and figures out the right kBroadcast instruction(s) + // necessary to replicate the implicit broadcast semantics explicitly. + HloInstruction* ImplicitBroadcastToExplicitBroadcast( + HloInstruction* operand, const Shape& output_shape); + HloComputation::Builder hlo_builder_; const SessionComputation& session_computation_; const VersionedComputationHandle::Version version_; const UserComputation::HloComputationResolver hlo_resolver_; }; -std::unique_ptr ComputationLowerer::Lower( - bool include_unused_parameters) { +// Calls 'apply' on each operand of 'request'. +static void ForEachOperand( + const OperationRequest& request, + const std::function& apply) { + switch (request.request().op_case()) { + case OpRequest::kRngRequest: { + const RngRequest& rng_request = request.request().rng_request(); + for (const ComputationDataHandle& param : rng_request.parameter()) { + apply(param); + } + break; + } + + case OpRequest::kConstantRequest: + break; + case OpRequest::kGetTupleElementRequest: { + const GetTupleElementRequest& get_tuple_element_request = + request.request().get_tuple_element_request(); + apply(get_tuple_element_request.operand()); + break; + } + + case OpRequest::kSliceRequest: { + const SliceRequest& slice_request = request.request().slice_request(); + apply(slice_request.operand()); + break; + } + + case OpRequest::kDynamicSliceRequest: { + const DynamicSliceRequest& dynamic_slice_request = + request.request().dynamic_slice_request(); + apply(dynamic_slice_request.operand()); + apply(dynamic_slice_request.start_indices()); + break; + } + + case OpRequest::kDynamicUpdateSliceRequest: { + const DynamicUpdateSliceRequest& dynamic_update_slice_request = + request.request().dynamic_update_slice_request(); + apply(dynamic_update_slice_request.operand()); + apply(dynamic_update_slice_request.update()); + apply(dynamic_update_slice_request.start_indices()); + break; + } + + case OpRequest::kConcatenateRequest: { + const ConcatenateRequest& concatenate_request = + request.request().concatenate_request(); + for (const ComputationDataHandle& handle : + concatenate_request.operands()) { + apply(handle); + } + break; + } + + case OpRequest::kConvolveRequest: { + const ConvolveRequest& convolve_request = + request.request().convolve_request(); + apply(convolve_request.lhs()); + apply(convolve_request.rhs()); + break; + } + + case OpRequest::kCrossReplicaSumRequest: { + const CrossReplicaSumRequest& cross_replica_sum_request = + request.request().cross_replica_sum_request(); + apply(cross_replica_sum_request.operand()); + break; + } + + case OpRequest::kInfeedRequest: + break; + + case OpRequest::kOutfeedRequest: { + const OutfeedRequest& outfeed_request = + request.request().outfeed_request(); + apply(outfeed_request.operand()); + break; + } + + case OpRequest::kMapRequest: { + const MapRequest& map_request = request.request().map_request(); + for (const ComputationDataHandle& handle : map_request.operands()) { + apply(handle); + } + break; + } + + case OpRequest::kReduceRequest: { 
+ const ReduceRequest& reduce_request = request.request().reduce_request(); + apply(reduce_request.operand()); + apply(reduce_request.init_value()); + break; + } + + case OpRequest::kReduceWindowRequest: { + const ReduceWindowRequest& reduce_window_request = + request.request().reduce_window_request(); + apply(reduce_window_request.operand()); + apply(reduce_window_request.init_value()); + break; + } + + case OpRequest::kSelectAndScatterRequest: { + const SelectAndScatterRequest& select_and_scatter_request = + request.request().select_and_scatter_request(); + apply(select_and_scatter_request.operand()); + apply(select_and_scatter_request.source()); + apply(select_and_scatter_request.init_value()); + + break; + } + + case OpRequest::kBroadcastRequest: { + const BroadcastRequest& broadcast_request = + request.request().broadcast_request(); + apply(broadcast_request.operand()); + break; + } + + case OpRequest::kReshapeRequest: { + const ReshapeRequest& reshape_request = + request.request().reshape_request(); + apply(reshape_request.operand()); + break; + } + + case OpRequest::kTransposeRequest: { + const TransposeRequest& transpose_request = + request.request().transpose_request(); + apply(transpose_request.operand()); + break; + } + + case OpRequest::kReverseRequest: { + const ReverseRequest& reverse_request = + request.request().reverse_request(); + apply(reverse_request.operand()); + break; + } + + case OpRequest::kPadRequest: { + const PadRequest& pad_request = request.request().pad_request(); + apply(pad_request.operand()); + apply(pad_request.padding_value()); + break; + } + + case OpRequest::kRecvRequest: + case OpRequest::kParameterRequest: + break; + + case OpRequest::kConvertRequest: { + const ConvertRequest& convert_request = + request.request().convert_request(); + apply(convert_request.operand()); + break; + } + + case OpRequest::kWhileRequest: { + const WhileRequest& while_request = request.request().while_request(); + apply(while_request.init()); + break; + } + + case OpRequest::kTernaryOpRequest: { + const TernaryOpRequest& ternary_op_request = + request.request().ternary_op_request(); + apply(ternary_op_request.lhs()); + apply(ternary_op_request.rhs()); + apply(ternary_op_request.ehs()); + break; + } + + case OpRequest::kVariadicOpRequest: { + const VariadicOpRequest& variadic_op_request = + request.request().variadic_op_request(); + for (const ComputationDataHandle& handle : + variadic_op_request.operands()) { + apply(handle); + } + break; + } + + case OpRequest::kCallRequest: { + const CallRequest& call_request = request.request().call_request(); + for (const ComputationDataHandle& handle : call_request.operands()) { + apply(handle); + } + break; + } + + case OpRequest::kCustomCallRequest: { + const CustomCallRequest& cc_request = + request.request().custom_call_request(); + for (const ComputationDataHandle& operand : cc_request.operands()) { + apply(operand); + } + break; + } + + case OpRequest::kUnaryOpRequest: { + const UnaryOpRequest& unary_op_request = + request.request().unary_op_request(); + apply(unary_op_request.operand()); + break; + } + + case OpRequest::kBinaryOpRequest: { + const BinaryOpRequest& binary_op_request = + request.request().binary_op_request(); + apply(binary_op_request.rhs()); + apply(binary_op_request.lhs()); + break; + } + + case OpRequest::kTraceRequest: { + const TraceRequest& trace_request = request.request().trace_request(); + apply(trace_request.operand()); + break; + } + + case OpRequest::kSendRequest: { + const SendRequest& send_request = 
request.request().send_request(); + apply(send_request.operand()); + break; + } + + case OpRequest::OP_NOT_SET: + LOG(FATAL) << "OperationRequest doesn't contain a request"; + + default: + LOG(FATAL) << "Unexpected request type: " << request.request().op_case(); + } +} + +void ComputationLowerer::TraversePostorder( + const ComputationDataHandle& root, + std::unordered_map* visited, + const std::function& visit) { + // Stack containing {handle, enter} pairs. The 'enter' value describes whether + // we are entering or leaving 'handle'. + std::stack> work; + work.push({root, true}); + while (!work.empty()) { + ComputationDataHandle handle; + bool enter; + std::tie(handle, enter) = work.top(); + work.pop(); + + if (enter) { + // We are entering 'handle'. The first time we enter 'handle', we add it + // to 'visited' with a nullptr value. If 'handle' is already in 'visited', + // we do not visit it again. This algorithm only uses the presence of + // a handle in 'visited', but we use a map so we can use the same data + // structure to store the HloInstruction outputs. + if (visited->emplace(handle.handle(), nullptr).second) { + const OperationRequest& request = + session_computation_.requests().at(handle.handle()); + // Push the corresponding 'leave' action onto the stack, followed by + // the operands. + work.push({handle, false}); + ForEachOperand(request, [&work](const ComputationDataHandle& child) { + work.push({child, true}); + }); + } + } else { + // We are leaving 'handle'. We have visited the operands of 'handle', and + // now can visit the 'handle' itself. + visit(handle); + } + } +} + +StatusOr> ComputationLowerer::Lower( + bool include_unreachable_instructions) { // Map from ComputationDataHandle to HLO instruction. Serves as a record of // which operations have been visited as well as a cache for looking up // ComputationDataHandles as HloInstructions. - std::map visited; + std::unordered_map instructions; - // A version is simply a ComputationDataHandle of the root of the computation - // at the time the version was generated. Create a ComputationDataHandle with - // this value and pass it to the visitor as the root of the computation to - // lower. - ComputationDataHandle root_handle; - root_handle.set_handle(version_); + TF_ASSIGN_OR_RETURN(const OperationRequest* root_request, + GetRoot(version_, session_computation_)); - HloInstruction* hlo_root = Visit(root_handle, &visited); + auto visit = [&](const ComputationDataHandle& handle) { + Visit(handle, &instructions); + }; + TraversePostorder(root_request->output_handle(), &instructions, visit); + HloInstruction* hlo_root = + instructions.at(root_request->output_handle().handle()); - // A computation may have unused parameters. - if (include_unused_parameters) { + if (include_unreachable_instructions) { + // Iterate through all computation data handles, and visit any unvisited + // operations. for (int64 request_num = 1; request_num <= version_; ++request_num) { - const OperationRequest& request = - session_computation_.requests().at(request_num); - if (request.request().op_case() == OpRequest::kParameterRequest && - visited.count(request.output_handle().handle()) == 0) { - Visit(request.output_handle(), &visited); - } + TF_ASSIGN_OR_RETURN(const OperationRequest* request, + LookUpRequest(request_num, session_computation_)); + TraversePostorder(request->output_handle(), &instructions, visit); } } - // Add trace instructions. 
- for (const auto& trace_request : session_computation_.trace_requests()) { - if (trace_request.operand().handle() <= version_) { - HloInstruction* operand = visited[trace_request.operand().handle()]; - // Trace instructions cannot be the root of a computation. - HloInstruction* trace_instruction = hlo_builder_.AddInstruction( - HloInstruction::CreateTrace(trace_request.tag(), operand)); - operand->set_tracing(trace_instruction); - } - } - - // Send instructions do not have users, so they are not reachable from the - // root instruction. Therefore, explicitly visit all Send requests (and their - // operand chains) and add to the builder. - for (const auto& send_request : session_computation_.send_requests()) { - Visit(send_request.operand(), &visited); - HloInstruction* operand = visited[send_request.operand().handle()]; - hlo_builder_.AddInstruction(HloInstruction::CreateSend( - operand, send_request.channel_handle().handle())); - } - - // Outfeed instructions do not have users. Explicitly visit all Outfeed - // requests (and their operand chains). - for (const auto& outfeed_request : session_computation_.outfeed_requests()) { - Visit(outfeed_request.operand(), &visited); - HloInstruction* operand = visited[outfeed_request.operand().handle()]; - hlo_builder_.AddInstruction(HloInstruction::CreateOutfeed( - operand, outfeed_request.outfeed_config())); - } - return hlo_builder_.Build(hlo_root); } @@ -1705,24 +2212,62 @@ HloComputation* ComputationLowerer::ResolveComputation( return hlo_resolver_(checked_handle); } -HloInstruction* ComputationLowerer::Visit( - const ComputationDataHandle& handle, - std::map* visited) { - if (visited->count(handle.handle()) != 0) { - return (*visited)[handle.handle()]; +HloInstruction* ComputationLowerer::ImplicitBroadcastToExplicitBroadcast( + HloInstruction* operand, const Shape& output_shape) { + CHECK(ShapeUtil::IsScalar(operand->shape()) || + ShapeUtil::Rank(operand->shape()) == ShapeUtil::Rank(output_shape)); + Shape broadcast_shape = ShapeUtil::MakeShape( + operand->shape().element_type(), AsInt64Slice(output_shape.dimensions())); + // Do explicit broadcast for scalar. + if (ShapeUtil::IsScalar(operand->shape())) { + return hlo_builder_.AddInstruction(HloInstruction::CreateBroadcast( + broadcast_shape, operand, AsInt64Slice(broadcast_shape.dimensions()))); } + // Do explicit broadcast for degenerate broadcast. + std::vector broadcast_dimensions; + std::vector reshaped_dimensions; + for (int i = 0; i < ShapeUtil::Rank(operand->shape()); i++) { + if (operand->shape().dimensions(i) > 1) { + broadcast_dimensions.push_back(i); + reshaped_dimensions.push_back(operand->shape().dimensions(i)); + } + } + // Eliminate the size one dimensions. + HloInstruction* reshaped_operand = + hlo_builder_.AddInstruction(HloInstruction::CreateReshape( + ShapeUtil::MakeShape(operand->shape().element_type(), + reshaped_dimensions), + operand)); + // Broadcast 'reshape' up to the larger size. 
+ return hlo_builder_.AddInstruction(HloInstruction::CreateBroadcast( + broadcast_shape, reshaped_operand, broadcast_dimensions)); +} +void ComputationLowerer::Visit( + const ComputationDataHandle& handle, + std::unordered_map* instructions) { + CHECK_LE(handle.handle(), version_); + CHECK(instructions->at(handle.handle()) == nullptr); const OperationRequest& request = session_computation_.requests().at(handle.handle()); + auto add_instruction = [&](std::unique_ptr instruction) { + HloInstruction* hlo_instruction = + hlo_builder_.AddInstruction(std::move(instruction)); + hlo_instruction->set_metadata(request.request().metadata()); + return hlo_instruction; + }; + auto lookup_instruction = [&](const ComputationDataHandle& handle) { + return instructions->at(handle.handle()); + }; HloInstruction* hlo_instruction; switch (request.request().op_case()) { case OpRequest::kRngRequest: { const RngRequest& rng_request = request.request().rng_request(); std::vector parameters; for (const ComputationDataHandle& param : rng_request.parameter()) { - parameters.push_back(Visit(param, visited)); + parameters.push_back(lookup_instruction(param)); } - hlo_instruction = hlo_builder_.AddInstruction(HloInstruction::CreateRng( + hlo_instruction = add_instruction(HloInstruction::CreateRng( request.output_shape(), rng_request.distribution(), parameters)); break; } @@ -1730,9 +2275,8 @@ HloInstruction* ComputationLowerer::Visit( case OpRequest::kConstantRequest: { const ConstantRequest& constant_request = request.request().constant_request(); - hlo_instruction = - hlo_builder_.AddInstruction(HloInstruction::CreateConstant( - LiteralUtil::CloneToUnique(constant_request.literal()))); + hlo_instruction = add_instruction(HloInstruction::CreateConstant( + LiteralUtil::CloneToUnique(Literal(constant_request.literal())))); break; } @@ -1740,35 +2284,34 @@ HloInstruction* ComputationLowerer::Visit( const GetTupleElementRequest& get_tuple_element_request = request.request().get_tuple_element_request(); HloInstruction* operand = - Visit(get_tuple_element_request.operand(), visited); - hlo_instruction = - hlo_builder_.AddInstruction(HloInstruction::CreateGetTupleElement( - request.output_shape(), operand, - get_tuple_element_request.index())); + lookup_instruction(get_tuple_element_request.operand()); + hlo_instruction = add_instruction(HloInstruction::CreateGetTupleElement( + request.output_shape(), operand, get_tuple_element_request.index())); break; } case OpRequest::kSliceRequest: { const SliceRequest& slice_request = request.request().slice_request(); - HloInstruction* operand = Visit(slice_request.operand(), visited); - hlo_instruction = hlo_builder_.AddInstruction(HloInstruction::CreateSlice( + HloInstruction* operand = lookup_instruction(slice_request.operand()); + hlo_instruction = add_instruction(HloInstruction::CreateSlice( request.output_shape(), operand, AsInt64Slice(slice_request.start_indices()), - AsInt64Slice(slice_request.limit_indices()))); + AsInt64Slice(slice_request.limit_indices()), + AsInt64Slice(slice_request.stride()))); break; } case OpRequest::kDynamicSliceRequest: { const DynamicSliceRequest& dynamic_slice_request = request.request().dynamic_slice_request(); - HloInstruction* operand = Visit(dynamic_slice_request.operand(), visited); + HloInstruction* operand = + lookup_instruction(dynamic_slice_request.operand()); HloInstruction* start_indices = - Visit(dynamic_slice_request.start_indices(), visited); + lookup_instruction(dynamic_slice_request.start_indices()); - hlo_instruction = - 
hlo_builder_.AddInstruction(HloInstruction::CreateDynamicSlice( - request.output_shape(), operand, start_indices, - AsInt64Slice(dynamic_slice_request.slice_sizes()))); + hlo_instruction = add_instruction(HloInstruction::CreateDynamicSlice( + request.output_shape(), operand, start_indices, + AsInt64Slice(dynamic_slice_request.slice_sizes()))); break; } @@ -1776,13 +2319,13 @@ HloInstruction* ComputationLowerer::Visit( const DynamicUpdateSliceRequest& dynamic_update_slice_request = request.request().dynamic_update_slice_request(); HloInstruction* operand = - Visit(dynamic_update_slice_request.operand(), visited); + lookup_instruction(dynamic_update_slice_request.operand()); HloInstruction* update = - Visit(dynamic_update_slice_request.update(), visited); + lookup_instruction(dynamic_update_slice_request.update()); HloInstruction* start_indices = - Visit(dynamic_update_slice_request.start_indices(), visited); + lookup_instruction(dynamic_update_slice_request.start_indices()); hlo_instruction = - hlo_builder_.AddInstruction(HloInstruction::CreateDynamicUpdateSlice( + add_instruction(HloInstruction::CreateDynamicUpdateSlice( request.output_shape(), operand, update, start_indices)); break; } @@ -1793,24 +2336,22 @@ HloInstruction* ComputationLowerer::Visit( std::vector operands; for (const ComputationDataHandle& handle : concatenate_request.operands()) { - HloInstruction* operand = Visit(handle, visited); + HloInstruction* operand = lookup_instruction(handle); operands.push_back(operand); } - hlo_instruction = hlo_builder_.AddInstruction( - HloInstruction::CreateConcatenate(request.output_shape(), operands, - concatenate_request.dimension())); + hlo_instruction = add_instruction(HloInstruction::CreateConcatenate( + request.output_shape(), operands, concatenate_request.dimension())); break; } case OpRequest::kConvolveRequest: { const ConvolveRequest& convolve_request = request.request().convolve_request(); - HloInstruction* lhs = Visit(convolve_request.lhs(), visited); - HloInstruction* rhs = Visit(convolve_request.rhs(), visited); - hlo_instruction = - hlo_builder_.AddInstruction(HloInstruction::CreateConvolve( - request.output_shape(), lhs, rhs, convolve_request.window(), - convolve_request.dimension_numbers())); + HloInstruction* lhs = lookup_instruction(convolve_request.lhs()); + HloInstruction* rhs = lookup_instruction(convolve_request.rhs()); + hlo_instruction = add_instruction(HloInstruction::CreateConvolve( + request.output_shape(), lhs, rhs, convolve_request.window(), + convolve_request.dimension_numbers())); break; } @@ -1818,28 +2359,25 @@ HloInstruction* ComputationLowerer::Visit( const CrossReplicaSumRequest& cross_replica_sum_request = request.request().cross_replica_sum_request(); HloInstruction* operand = - Visit(cross_replica_sum_request.operand(), visited); - hlo_instruction = - hlo_builder_.AddInstruction(HloInstruction::CreateCrossReplicaSum( - request.output_shape(), operand)); + lookup_instruction(cross_replica_sum_request.operand()); + hlo_instruction = add_instruction(HloInstruction::CreateCrossReplicaSum( + request.output_shape(), operand)); break; } case OpRequest::kInfeedRequest: { const InfeedRequest& infeed_request = request.request().infeed_request(); - hlo_instruction = - hlo_builder_.AddInstruction(HloInstruction::CreateInfeed( - request.output_shape(), infeed_request.config())); + hlo_instruction = add_instruction(HloInstruction::CreateInfeed( + request.output_shape(), infeed_request.config())); break; } case OpRequest::kOutfeedRequest: { const OutfeedRequest& 
outfeed_request = request.request().outfeed_request(); - HloInstruction* operand = Visit(outfeed_request.operand(), visited); - hlo_instruction = - hlo_builder_.AddInstruction(HloInstruction::CreateOutfeed( - operand, outfeed_request.outfeed_config())); + HloInstruction* operand = lookup_instruction(outfeed_request.operand()); + hlo_instruction = add_instruction(HloInstruction::CreateOutfeed( + outfeed_request.shape(), operand, outfeed_request.outfeed_config())); break; } @@ -1847,7 +2385,7 @@ HloInstruction* ComputationLowerer::Visit( const MapRequest& map_request = request.request().map_request(); std::vector operands; for (const ComputationDataHandle& handle : map_request.operands()) { - HloInstruction* operand = Visit(handle, visited); + HloInstruction* operand = lookup_instruction(handle); operands.push_back(operand); } CHECK_EQ(1, request.embedded_computation_versions_size()); @@ -1855,42 +2393,42 @@ HloInstruction* ComputationLowerer::Visit( request.embedded_computation_versions(0); HloComputation* map_computation = ResolveComputation(map_request.to_apply(), map_version); - hlo_instruction = hlo_builder_.AddInstruction(HloInstruction::CreateMap( + hlo_instruction = add_instruction(HloInstruction::CreateMap( request.output_shape(), operands, map_computation)); break; } case OpRequest::kReduceRequest: { const ReduceRequest& reduce_request = request.request().reduce_request(); - HloInstruction* operand = Visit(reduce_request.operand(), visited); - HloInstruction* init_value = Visit(reduce_request.init_value(), visited); + HloInstruction* operand = lookup_instruction(reduce_request.operand()); + HloInstruction* init_value = + lookup_instruction(reduce_request.init_value()); CHECK_EQ(1, request.embedded_computation_versions_size()); VersionedComputationHandle::Version reduce_version = request.embedded_computation_versions(0); HloComputation* reduce_computation = ResolveComputation(reduce_request.to_apply(), reduce_version); - hlo_instruction = - hlo_builder_.AddInstruction(HloInstruction::CreateReduce( - request.output_shape(), operand, init_value, - AsInt64Slice(reduce_request.dimensions()), reduce_computation)); + hlo_instruction = add_instruction(HloInstruction::CreateReduce( + request.output_shape(), operand, init_value, + AsInt64Slice(reduce_request.dimensions()), reduce_computation)); break; } case OpRequest::kReduceWindowRequest: { const ReduceWindowRequest& reduce_window_request = request.request().reduce_window_request(); - HloInstruction* operand = Visit(reduce_window_request.operand(), visited); + HloInstruction* operand = + lookup_instruction(reduce_window_request.operand()); HloInstruction* init_value = - Visit(reduce_window_request.init_value(), visited); + lookup_instruction(reduce_window_request.init_value()); CHECK_EQ(1, request.embedded_computation_versions_size()); VersionedComputationHandle::Version reduce_window_version = request.embedded_computation_versions(0); HloComputation* reduce_window_computation = ResolveComputation( reduce_window_request.to_apply(), reduce_window_version); - hlo_instruction = - hlo_builder_.AddInstruction(HloInstruction::CreateReduceWindow( - request.output_shape(), operand, init_value, - reduce_window_request.window(), reduce_window_computation)); + hlo_instruction = add_instruction(HloInstruction::CreateReduceWindow( + request.output_shape(), operand, init_value, + reduce_window_request.window(), reduce_window_computation)); break; } @@ -1898,11 +2436,11 @@ HloInstruction* ComputationLowerer::Visit( const SelectAndScatterRequest& 
select_and_scatter_request = request.request().select_and_scatter_request(); HloInstruction* operand = - Visit(select_and_scatter_request.operand(), visited); + lookup_instruction(select_and_scatter_request.operand()); HloInstruction* source = - Visit(select_and_scatter_request.source(), visited); + lookup_instruction(select_and_scatter_request.source()); HloInstruction* init_value = - Visit(select_and_scatter_request.init_value(), visited); + lookup_instruction(select_and_scatter_request.init_value()); CHECK_EQ(2, request.embedded_computation_versions_size()); VersionedComputationHandle::Version select_version = request.embedded_computation_versions(0); @@ -1912,18 +2450,17 @@ HloInstruction* ComputationLowerer::Visit( select_and_scatter_request.select(), select_version); HloComputation* scatter_computation = ResolveComputation( select_and_scatter_request.scatter(), scatter_version); - hlo_instruction = - hlo_builder_.AddInstruction(HloInstruction::CreateSelectAndScatter( - request.output_shape(), operand, select_computation, - select_and_scatter_request.window(), source, init_value, - scatter_computation)); + hlo_instruction = add_instruction(HloInstruction::CreateSelectAndScatter( + request.output_shape(), operand, select_computation, + select_and_scatter_request.window(), source, init_value, + scatter_computation)); break; } case OpRequest::kBroadcastRequest: { const BroadcastRequest& broadcast_request = request.request().broadcast_request(); - HloInstruction* operand = Visit(broadcast_request.operand(), visited); + HloInstruction* operand = lookup_instruction(broadcast_request.operand()); std::vector broadcast_dimensions; // The client-level broadcast instruction just appends dimensions on the // left (adds lowest numbered dimensions). The HLO broadcast op is more @@ -1932,50 +2469,64 @@ HloInstruction* ComputationLowerer::Visit( // to append dimensions on the left the broadcast_dimensions should just // be the n highest dimension numbers of the output shape where n is // the number of input dimensions. 
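+      // For example (hypothetical ranks): a rank-2 operand broadcast into a
+      // rank-4 output shape yields broadcast_dimensions={2, 3}.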
+ broadcast_dimensions.reserve(ShapeUtil::Rank(operand->shape())); for (int i = 0; i < ShapeUtil::Rank(operand->shape()); ++i) { broadcast_dimensions.push_back(i + ShapeUtil::Rank(request.output_shape()) - ShapeUtil::Rank(operand->shape())); } - hlo_instruction = - hlo_builder_.AddInstruction(HloInstruction::CreateBroadcast( - request.output_shape(), operand, broadcast_dimensions)); + hlo_instruction = add_instruction(HloInstruction::CreateBroadcast( + request.output_shape(), operand, broadcast_dimensions)); break; } case OpRequest::kReshapeRequest: { const ReshapeRequest& reshape_request = request.request().reshape_request(); - HloInstruction* operand = Visit(reshape_request.operand(), visited); - hlo_instruction = - hlo_builder_.AddInstruction(HloInstruction::CreateReshape( - request.output_shape(), - hlo_builder_.AddInstruction(HloInstruction::CreateTranspose( - ShapeUtil::PermuteDimensions( - InversePermutation( - AsInt64Slice(reshape_request.dimensions())), - operand->shape()), - operand, AsInt64Slice(reshape_request.dimensions()))))); + HloInstruction* operand = lookup_instruction(reshape_request.operand()); + HloInstruction* transposed; + if (IsIdentityPermutation(AsInt64Slice(reshape_request.dimensions()))) { + transposed = operand; + } else { + transposed = add_instruction(HloInstruction::CreateTranspose( + ShapeUtil::PermuteDimensions( + InversePermutation(AsInt64Slice(reshape_request.dimensions())), + operand->shape()), + operand, AsInt64Slice(reshape_request.dimensions()))); + } + hlo_instruction = add_instruction( + HloInstruction::CreateReshape(request.output_shape(), transposed)); + break; + } + + case OpRequest::kTransposeRequest: { + const TransposeRequest& transpose_request = + request.request().transpose_request(); + HloInstruction* operand = lookup_instruction(transpose_request.operand()); + hlo_instruction = add_instruction(HloInstruction::CreateTranspose( + ShapeUtil::PermuteDimensions( + InversePermutation(AsInt64Slice(transpose_request.dimensions())), + operand->shape()), + operand, AsInt64Slice(transpose_request.dimensions()))); break; } case OpRequest::kReverseRequest: { const ReverseRequest& reverse_request = request.request().reverse_request(); - HloInstruction* operand = Visit(reverse_request.operand(), visited); - hlo_instruction = - hlo_builder_.AddInstruction(HloInstruction::CreateReverse( - request.output_shape(), operand, - AsInt64Slice(reverse_request.dimensions()))); + HloInstruction* operand = lookup_instruction(reverse_request.operand()); + hlo_instruction = add_instruction(HloInstruction::CreateReverse( + request.output_shape(), operand, + AsInt64Slice(reverse_request.dimensions()))); break; } case OpRequest::kPadRequest: { const PadRequest& pad_request = request.request().pad_request(); - HloInstruction* operand = Visit(pad_request.operand(), visited); + HloInstruction* operand = lookup_instruction(pad_request.operand()); HloInstruction* padding_value = - Visit(pad_request.padding_value(), visited); - hlo_instruction = hlo_builder_.AddInstruction(HloInstruction::CreatePad( + lookup_instruction(pad_request.padding_value()); + hlo_instruction = add_instruction(HloInstruction::CreatePad( request.output_shape(), operand, padding_value, pad_request.padding_config())); break; @@ -1983,7 +2534,7 @@ HloInstruction* ComputationLowerer::Visit( case OpRequest::kRecvRequest: { const RecvRequest& recv_request = request.request().recv_request(); - hlo_instruction = hlo_builder_.AddInstruction(HloInstruction::CreateRecv( + hlo_instruction = 
add_instruction(HloInstruction::CreateRecv( request.output_shape(), recv_request.channel_handle().handle())); break; } @@ -1991,18 +2542,17 @@ HloInstruction* ComputationLowerer::Visit( case OpRequest::kParameterRequest: { const ParameterRequest& parameter_request = request.request().parameter_request(); - hlo_instruction = - hlo_builder_.AddInstruction(HloInstruction::CreateParameter( - parameter_request.parameter(), request.output_shape(), - parameter_request.name())); + hlo_instruction = add_instruction(HloInstruction::CreateParameter( + parameter_request.parameter(), request.output_shape(), + parameter_request.name())); break; } case OpRequest::kConvertRequest: { const ConvertRequest& convert_request = request.request().convert_request(); - HloInstruction* operand = Visit(convert_request.operand(), visited); - hlo_instruction = hlo_builder_.AddInstruction( + HloInstruction* operand = lookup_instruction(convert_request.operand()); + hlo_instruction = add_instruction( HloInstruction::CreateConvert(request.output_shape(), operand)); break; } @@ -2018,8 +2568,8 @@ HloInstruction* ComputationLowerer::Visit( request.embedded_computation_versions(1); HloComputation* body = ResolveComputation(while_request.body(), body_version); - HloInstruction* init = Visit(while_request.init(), visited); - hlo_instruction = hlo_builder_.AddInstruction(HloInstruction::CreateWhile( + HloInstruction* init = lookup_instruction(while_request.init()); + hlo_instruction = add_instruction(HloInstruction::CreateWhile( request.output_shape(), condition, body, init)); break; } @@ -2027,13 +2577,12 @@ HloInstruction* ComputationLowerer::Visit( case OpRequest::kTernaryOpRequest: { const TernaryOpRequest& ternary_op_request = request.request().ternary_op_request(); - HloInstruction* lhs = Visit(ternary_op_request.lhs(), visited); - HloInstruction* rhs = Visit(ternary_op_request.rhs(), visited); - HloInstruction* ehs = Visit(ternary_op_request.ehs(), visited); + HloInstruction* lhs = lookup_instruction(ternary_op_request.lhs()); + HloInstruction* rhs = lookup_instruction(ternary_op_request.rhs()); + HloInstruction* ehs = lookup_instruction(ternary_op_request.ehs()); auto hlo_opcode = TernaryOperationToHloOpcode(ternary_op_request.triop()); - hlo_instruction = - hlo_builder_.AddInstruction(HloInstruction::CreateTernary( - request.output_shape(), hlo_opcode, lhs, rhs, ehs)); + hlo_instruction = add_instruction(HloInstruction::CreateTernary( + request.output_shape(), hlo_opcode, lhs, rhs, ehs)); break; } @@ -2043,14 +2592,13 @@ HloInstruction* ComputationLowerer::Visit( std::vector operands; for (const ComputationDataHandle& handle : variadic_op_request.operands()) { - HloInstruction* operand = Visit(handle, visited); + HloInstruction* operand = lookup_instruction(handle); operands.push_back(operand); } auto hlo_opcode = VariadicOperationToHloOpcode(variadic_op_request.varop()); - hlo_instruction = - hlo_builder_.AddInstruction(HloInstruction::CreateVariadic( - request.output_shape(), hlo_opcode, operands)); + hlo_instruction = add_instruction(HloInstruction::CreateVariadic( + request.output_shape(), hlo_opcode, operands)); break; } @@ -2058,14 +2606,14 @@ HloInstruction* ComputationLowerer::Visit( const CallRequest& call_request = request.request().call_request(); std::vector operands; for (const ComputationDataHandle& handle : call_request.operands()) { - operands.push_back(Visit(handle, visited)); + operands.push_back(lookup_instruction(handle)); } CHECK_EQ(1, request.embedded_computation_versions_size()); 
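+      // The single version recorded when the call was enqueued picks out the
+      // HloComputation that was lowered for exactly that version of the
+      // callee.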
VersionedComputationHandle::Version call_version = request.embedded_computation_versions(0); HloComputation* call_computation = ResolveComputation(call_request.to_apply(), call_version); - hlo_instruction = hlo_builder_.AddInstruction(HloInstruction::CreateCall( + hlo_instruction = add_instruction(HloInstruction::CreateCall( request.output_shape(), operands, call_computation)); break; } @@ -2075,20 +2623,19 @@ HloInstruction* ComputationLowerer::Visit( request.request().custom_call_request(); std::vector operands; for (const ComputationDataHandle& operand : cc_request.operands()) { - operands.push_back(Visit(operand, visited)); + operands.push_back(lookup_instruction(operand)); } - hlo_instruction = - hlo_builder_.AddInstruction(HloInstruction::CreateCustomCall( - cc_request.shape(), operands, cc_request.call_target_name())); + hlo_instruction = add_instruction(HloInstruction::CreateCustomCall( + cc_request.shape(), operands, cc_request.call_target_name())); break; } case OpRequest::kUnaryOpRequest: { const UnaryOpRequest& unary_op_request = request.request().unary_op_request(); - HloInstruction* operand = Visit(unary_op_request.operand(), visited); + HloInstruction* operand = lookup_instruction(unary_op_request.operand()); auto hlo_opcode = UnaryOperationToHloOpcode(unary_op_request.unop()); - hlo_instruction = hlo_builder_.AddInstruction(HloInstruction::CreateUnary( + hlo_instruction = add_instruction(HloInstruction::CreateUnary( request.output_shape(), hlo_opcode, operand)); break; } @@ -2096,8 +2643,8 @@ HloInstruction* ComputationLowerer::Visit( case OpRequest::kBinaryOpRequest: { const BinaryOpRequest& binary_op_request = request.request().binary_op_request(); - HloInstruction* lhs = Visit(binary_op_request.lhs(), visited); - HloInstruction* rhs = Visit(binary_op_request.rhs(), visited); + HloInstruction* lhs = lookup_instruction(binary_op_request.lhs()); + HloInstruction* rhs = lookup_instruction(binary_op_request.rhs()); auto hlo_opcode = BinaryOperationToHloOpcode(binary_op_request.binop()); if (binary_op_request.broadcast_dimensions_size() > 0) { // Emit a broadcast instruction to perform the "broadcast in dimension" @@ -2116,16 +2663,45 @@ HloInstruction* ComputationLowerer::Visit( // identical to the HLO broadcast semantics so the broadcast_dimensions // field can just be passed to the instruction builder. HloInstruction* broadcasted_operand = - hlo_builder_.AddInstruction(HloInstruction::CreateBroadcast( + add_instruction(HloInstruction::CreateBroadcast( broadcast_shape, operand_to_broadcast, AsInt64Slice(binary_op_request.broadcast_dimensions()))); lhs = (lhs == operand_to_broadcast) ? broadcasted_operand : lhs; rhs = (rhs == operand_to_broadcast) ? broadcasted_operand : rhs; } - hlo_instruction = - hlo_builder_.AddInstruction(HloInstruction::CreateBinary( - request.output_shape(), hlo_opcode, lhs, rhs)); + if (legacy_flags::GetUserComputationFlags() + ->xla_eliminate_hlo_implicit_broadcast) { + if (!ShapeUtil::SameDimensions(request.output_shape(), lhs->shape())) { + // lhs side is being implicitly broadcast. Change to explicit. 
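+          // Replaces 'lhs' with an equivalent explicit broadcast to the
+          // output dimensions; 'rhs' gets the same treatment just below.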
+          lhs =
+              ImplicitBroadcastToExplicitBroadcast(lhs, request.output_shape());
+        }
+
+        if (!ShapeUtil::SameDimensions(request.output_shape(), rhs->shape())) {
+          rhs =
+              ImplicitBroadcastToExplicitBroadcast(rhs, request.output_shape());
+        }
+      }
+      hlo_instruction = add_instruction(HloInstruction::CreateBinary(
+          request.output_shape(), hlo_opcode, lhs, rhs));
+      break;
+    }
+
+    case OpRequest::kTraceRequest: {
+      const TraceRequest& trace_request = request.request().trace_request();
+      HloInstruction* operand = lookup_instruction(trace_request.operand());
+      hlo_instruction = add_instruction(
+          HloInstruction::CreateTrace(trace_request.tag(), operand));
+      operand->set_tracing(hlo_instruction);
+      break;
+    }
+
+    case OpRequest::kSendRequest: {
+      const SendRequest& send_request = request.request().send_request();
+      HloInstruction* operand = lookup_instruction(send_request.operand());
+      hlo_instruction = add_instruction(HloInstruction::CreateSend(
+          operand, send_request.channel_handle().handle()));
       break;
     }
@@ -2135,26 +2711,29 @@ HloInstruction* ComputationLowerer::Visit(
     default:
       LOG(FATAL) << "Unexpected request type: " << request.request().op_case();
   }
-  (*visited)[handle.handle()] = hlo_instruction;
-  return hlo_instruction;
+  (*instructions)[handle.handle()] = hlo_instruction;
 }

 }  // namespace

 StatusOr<std::unique_ptr<HloComputation>> UserComputation::BuildHloComputation(
     VersionedComputationHandle::Version version,
-    HloComputationResolver hlo_resolver, bool include_unused_parameters) const {
+    HloComputationResolver hlo_resolver,
+    bool include_unreachable_instructions) const {
   tensorflow::mutex_lock lock(mutex_);

   VLOG(2) << "Building HloComputation from UserComputation " << name_
-          << " at version " << version << ". Operation requests:\n"
-          << session_computation_.ShortDebugString();
+          << " at version " << version;
+  XLA_VLOG_LINES(3, session_computation_.DebugString());

-  std::unique_ptr<HloComputation> hlo_computation = ComputationLowerer::Lower(
-      tensorflow::strings::StrCat(name(), ".v", version), session_computation_,
-      version, std::move(hlo_resolver), include_unused_parameters);
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<HloComputation> hlo_computation,
+      ComputationLowerer::Lower(
+          tensorflow::strings::StrCat(name(), ".v", version),
+          session_computation_, version, std::move(hlo_resolver),
+          include_unreachable_instructions));

-  VLOG(2) << "HloComputation:\n" << hlo_computation->ToString();
+  XLA_VLOG_LINES(2, hlo_computation->ToString());
   return std::move(hlo_computation);
 }
diff --git a/tensorflow/compiler/xla/service/user_computation.h b/tensorflow/compiler/xla/service/user_computation.h
index 2be448466f5..fb5425ae61a 100644
--- a/tensorflow/compiler/xla/service/user_computation.h
+++ b/tensorflow/compiler/xla/service/user_computation.h
@@ -144,6 +144,10 @@ class UserComputation {
   StatusOr<ComputationDataHandle> AddReshapeInstruction(
       const ReshapeRequest& reshape_request);

+  // Enqueues a transpose instruction onto this user computation.
+  StatusOr<ComputationDataHandle> AddTransposeInstruction(
+      const TransposeRequest& transpose_request);
+
   // Enqueues a slice instruction onto this user computation.
   StatusOr<ComputationDataHandle> AddSliceInstruction(
       const SliceRequest& slice_request);
@@ -236,20 +240,24 @@ class UserComputation {
   // Returns the output shape of the operation indicated by the given handle.
   StatusOr<Shape> GetShape(const ComputationDataHandle& handle);

+  // Sets metadata on the Hlo instruction referenced by the given handle.
+  Status SetOpMetadata(const ComputationDataHandle& handle,
+                       const OpMetadata& metadata);
+
   // Builds a HLO computation from the UserComputation.
The parameter "resolver" // is a function which returns a pointer to the HloComputation corresponding // to the given ComputationHandle at the given version. The resolver is used // for operations, such as map, which call other computations and need a // pointer to the called HloComputation to construct the respective HLO - // instructions. If include_unused_computation is true, then all parameter - // instructions are lowered into HloInstructions even if the parameter is - // unused (the root of the computation is unreachable from the parameter). + // instructions. If include_unreachable_instructions is true, then + // instructions which are not reachable from the root are lowered into + // HloInstructions. using HloComputationResolver = std::function; StatusOr> BuildHloComputation( VersionedComputationHandle::Version version, HloComputationResolver hlo_resolver, - bool include_unused_parameters = true) const; + bool include_unreachable_instructions = true) const; // Return a vector containing the embedded computations used by this // UserComputation. Only embedded computations which are called directly by @@ -285,13 +293,8 @@ class UserComputation { const std::map& old_to_new) EXCLUSIVE_LOCKS_REQUIRED(mutex_); - // Returns the OperationRequestion corresponding to the root (result) of the - // computation. - const OperationRequest& GetRoot(VersionedComputationHandle::Version version) - const EXCLUSIVE_LOCKS_REQUIRED(mutex_); - - // Returns the OperationRequest corresponding to the given handle value. - StatusOr LookupRequest( + // Returns the OperationRequest corresponding to the given handle. + StatusOr LookUpRequest( const ComputationDataHandle& handle) const EXCLUSIVE_LOCKS_REQUIRED(mutex_); @@ -305,6 +308,9 @@ class UserComputation { VersionedComputationHandle::Version version) const EXCLUSIVE_LOCKS_REQUIRED(mutex_); + VersionedComputationHandle GetVersionedHandleInternal() const + EXCLUSIVE_LOCKS_REQUIRED(mutex_); + // Name of the computation. string name_; diff --git a/tensorflow/compiler/xla/service/user_computation_test.cc b/tensorflow/compiler/xla/service/user_computation_test.cc new file mode 100644 index 00000000000..ea691201263 --- /dev/null +++ b/tensorflow/compiler/xla/service/user_computation_test.cc @@ -0,0 +1,282 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/user_computation.h"
+
+#include "tensorflow/compiler/xla/legacy_flags/user_computation_flags.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace op = xla::testing::opcode_matchers;
+
+namespace xla {
+namespace {
+
+using UserComputationTest = ::testing::Test;
+
+TEST_F(UserComputationTest, SimpleComputation) {
+  const Shape kScalarShape = ShapeUtil::MakeShape(F32, {});
+  const Shape kVectorShape = ShapeUtil::MakeShape(F32, {2});
+
+  // Build a simple three-operation computation:
+  //
+  //   %constant = Constant({123, 42})
+  //   %param = Param(0)
+  //   %outfeed = Outfeed(%constant)
+  //
+  // Build the computation at two different versions and check invariants.
+  ComputationHandle handle;
+  handle.set_handle(123);
+  UserComputation computation("TheComputation", handle);
+
+  ConstantRequest constant_request;
+  *constant_request.mutable_literal() =
+      LiteralUtil::CreateR1<float>({123.0f, 42.0f})->ToProto();
+  TF_ASSIGN_OR_ASSERT_OK(ComputationDataHandle constant_handle,
+                         computation.AddConstantInstruction(constant_request));
+
+  ParameterRequest param_request;
+  *param_request.mutable_shape() = kScalarShape;
+  param_request.set_parameter(0);
+  param_request.set_name("param0");
+  TF_ASSIGN_OR_ASSERT_OK(ComputationDataHandle param_handle,
+                         computation.AddParameterInstruction(param_request));
+  OpMetadata metadata;
+  metadata.set_op_name("meta");
+  TF_ASSERT_OK(computation.SetOpMetadata(param_handle, metadata));
+
+  OutfeedRequest outfeed_request;
+  *outfeed_request.mutable_operand() = constant_handle;
+  outfeed_request.set_outfeed_config("abc");
+  TF_ASSERT_OK(computation.AddOutfeedInstruction(outfeed_request));
+
+  auto hlo_resolver = [](const VersionedComputationHandle& handle) {
+    return nullptr;
+  };
+  {
+    // Test the computation at the latest version. In this case, the most
+    // recently added operation is an outfeed. However, the outfeed is not the
+    // root because outfeeds cannot be the root of a computation.
+    VersionedComputationHandle latest_version =
+        computation.GetVersionedHandle();
+
+    // Program shape should have a single scalar parameter and scalar
+    // result. The outfeed instruction should not affect the program shape.
+    TF_ASSIGN_OR_ASSERT_OK(
+        std::shared_ptr<const ProgramShape> program_shape,
+        computation.ComputeProgramShape(latest_version.version));
+    ASSERT_EQ(1, program_shape->parameters_size());
+    EXPECT_TRUE(
+        ShapeUtil::Compatible(kScalarShape, program_shape->parameters(0)));
+    EXPECT_TRUE(ShapeUtil::Compatible(kScalarShape, program_shape->result()));
+
+    // Build the HLO computation.
+    TF_ASSIGN_OR_ASSERT_OK(
+        std::unique_ptr<HloComputation> hlo_computation,
+        computation.BuildHloComputation(latest_version.version, hlo_resolver));
+    // There should be one HloInstruction per UserComputation operation.
+    EXPECT_EQ(3, hlo_computation->instruction_count());
+    // The root of the computation should be the parameter instruction (not
+    // the outfeed).
+    EXPECT_THAT(hlo_computation->root_instruction(), op::Parameter());
+  }
+
+  {
+    // Test the computation at the version right after the parameter
+    // instruction is added.
+    VersionedComputationHandle version_at_param =
+        computation.GetVersionedHandleAtOperation(param_handle);
+
+    // Program shape should have a single scalar parameter, and scalar result.
+    TF_ASSIGN_OR_ASSERT_OK(
+        std::shared_ptr<const ProgramShape> program_shape,
+        computation.ComputeProgramShape(version_at_param.version));
+    ASSERT_EQ(1, program_shape->parameters_size());
+    EXPECT_TRUE(
+        ShapeUtil::Compatible(kScalarShape, program_shape->parameters(0)));
+    EXPECT_TRUE(ShapeUtil::Compatible(kScalarShape, program_shape->result()));
+
+    // There should be two instructions, one for the constant and one for the
+    // parameter. The outfeed instruction should not be included.
+    TF_ASSIGN_OR_ASSERT_OK(std::unique_ptr<HloComputation> hlo_computation,
+                           computation.BuildHloComputation(
+                               version_at_param.version, hlo_resolver));
+    EXPECT_EQ(2, hlo_computation->instruction_count());
+    EXPECT_THAT(hlo_computation->root_instruction(), op::Parameter());
+  }
+  {
+    // Test the computation at the latest version, but lowered with
+    // include_unreachable_instructions set to false.
+    VersionedComputationHandle latest_version =
+        computation.GetVersionedHandle();
+
+    // Build the HLO computation.
+    TF_ASSIGN_OR_ASSERT_OK(std::unique_ptr<HloComputation> hlo_computation,
+                           computation.BuildHloComputation(
+                               latest_version.version, hlo_resolver,
+                               /*include_unreachable_instructions=*/false));
+    // There is only one reachable instruction, the parameter.
+    EXPECT_EQ(1, hlo_computation->instruction_count());
+    // The root of the computation should be the parameter instruction (not
+    // the outfeed).
+    EXPECT_THAT(hlo_computation->root_instruction(), op::Parameter());
+    EXPECT_EQ(hlo_computation->root_instruction()->metadata().op_name(),
+              "meta");
+  }
+}
+
+TEST_F(UserComputationTest, EliminateScalarBroadcast) {
+  if (!legacy_flags::GetUserComputationFlags()
+           ->xla_eliminate_hlo_implicit_broadcast) {
+    return;
+  }
+
+  // Build a binary computation with a scalar broadcast.
+  //
+  //   %a = Constant({123, 42})
+  //   %b = Constant(1)
+  //   %add = Add(%a, %b)
+  ComputationHandle handle;
+  handle.set_handle(123);
+  UserComputation computation("TheComputation", handle);
+
+  ConstantRequest a_request;
+  *a_request.mutable_literal() =
+      LiteralUtil::CreateR1<float>({123.0f, 42.0f})->ToProto();
+  TF_ASSIGN_OR_ASSERT_OK(ComputationDataHandle a_handle,
+                         computation.AddConstantInstruction(a_request));
+
+  ConstantRequest b_request;
+  *b_request.mutable_literal() = LiteralUtil::CreateR0<float>(1.0f)->ToProto();
+  TF_ASSIGN_OR_ASSERT_OK(ComputationDataHandle b_handle,
+                         computation.AddConstantInstruction(b_request));
+
+  BinaryOpRequest add;
+  add.set_binop(BINOP_ADD);
+  *add.mutable_lhs() = a_handle;
+  *add.mutable_rhs() = b_handle;
+  TF_ASSERT_OK(computation.AddBinaryInstruction(add).status());
+
+  auto hlo_resolver = [](const VersionedComputationHandle& handle) {
+    return nullptr;
+  };
+  VersionedComputationHandle latest_version = computation.GetVersionedHandle();
+
+  // Build the HLO computation.
+  TF_ASSIGN_OR_ASSERT_OK(
+      std::unique_ptr<HloComputation> hlo_computation,
+      computation.BuildHloComputation(latest_version.version, hlo_resolver));
+  // The binary operation has an implicit scalar broadcast and should be
+  // converted to an explicit broadcast instruction plus a binary instruction.
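+  // (Editor's note) The four instructions are the two constants, the
+  // explicit broadcast of the scalar %b, and the add.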
+  EXPECT_EQ(4, hlo_computation->instruction_count());
+  EXPECT_THAT(hlo_computation->root_instruction(), op::Add());
+  const auto& operands = hlo_computation->root_instruction()->operands();
+  ASSERT_EQ(2, operands.size());
+  EXPECT_TRUE(operands[0]->opcode() == HloOpcode::kBroadcast ||
+              operands[1]->opcode() == HloOpcode::kBroadcast);
+}
+
+TEST_F(UserComputationTest, EliminateDegenerateBroadcastAfterIndimBroadcast) {
+  if (!legacy_flags::GetUserComputationFlags()
+           ->xla_eliminate_hlo_implicit_broadcast) {
+    return;
+  }
+
+  // Build a binary computation with an in-dim broadcast and a degenerate
+  // broadcast.
+  //
+  //   %a = Param({2, 3});
+  //   %b = Param({2, 1, 4});
+  //   %add = Add(%a, %b, {0, 1});
+  ComputationHandle handle;
+  handle.set_handle(123);
+  UserComputation computation("TheComputation", handle);
+
+  ParameterRequest a_request;
+  *a_request.mutable_shape() = ShapeUtil::MakeShape(F32, {2, 3});
+  a_request.set_name("a");
+  a_request.set_parameter(0);
+  TF_ASSIGN_OR_ASSERT_OK(ComputationDataHandle a_handle,
+                         computation.AddParameterInstruction(a_request));
+
+  ParameterRequest b_request;
+  *b_request.mutable_shape() = ShapeUtil::MakeShape(F32, {2, 1, 4});
+  b_request.set_name("b");
+  b_request.set_parameter(1);
+  TF_ASSIGN_OR_ASSERT_OK(ComputationDataHandle b_handle,
+                         computation.AddParameterInstruction(b_request));
+
+  BinaryOpRequest add;
+  add.set_binop(BINOP_ADD);
+  *add.mutable_lhs() = a_handle;
+  *add.mutable_rhs() = b_handle;
+  add.add_broadcast_dimensions(0);
+  add.add_broadcast_dimensions(1);
+  TF_ASSERT_OK(computation.AddBinaryInstruction(add).status());
+
+  auto hlo_resolver = [](const VersionedComputationHandle& handle) {
+    return nullptr;
+  };
+  VersionedComputationHandle latest_version = computation.GetVersionedHandle();
+
+  // Build the HLO computation.
+  TF_ASSIGN_OR_ASSERT_OK(
+      std::unique_ptr<HloComputation> hlo_computation,
+      computation.BuildHloComputation(latest_version.version, hlo_resolver));
+
+  // The binary operation has an in-dim broadcast and a degenerate broadcast;
+  // lowering should first perform the in-dim broadcast and then convert the
+  // degenerate broadcast into a reshape and a broadcast.
+  //
+  //       b         a
+  //       |         |
+  //   broadcast  reshape
+  //       |         |
+  //       |     broadcast
+  //        \       /
+  //           add
+  EXPECT_EQ(6, hlo_computation->instruction_count());
+  EXPECT_THAT(hlo_computation->root_instruction(), op::Add());
+  const auto& operands = hlo_computation->root_instruction()->operands();
+  ASSERT_EQ(2, operands.size());
+  EXPECT_TRUE(operands[0]->opcode() == HloOpcode::kBroadcast &&
+              operands[1]->opcode() == HloOpcode::kBroadcast);
+}
+
+}  // namespace
+}  // namespace xla
+
+int main(int argc, char** argv) {
+  std::vector<tensorflow::Flag> flag_list;
+  xla::legacy_flags::AppendUserComputationFlags(&flag_list);
+  xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
+  const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
+  if (!parse_result) {
+    LOG(ERROR) << "\n" << usage;
+    return 2;
+  }
+  testing::InitGoogleTest(&argc, argv);
+  if (argc > 1) {
+    LOG(ERROR) << "Unknown argument " << argv[1] << "\n" << usage;
+    return 2;
+  }
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/compiler/xla/service/versioned_computation_handle.cc b/tensorflow/compiler/xla/service/versioned_computation_handle.cc
new file mode 100644
index 00000000000..a693c4695f0
--- /dev/null
+++ b/tensorflow/compiler/xla/service/versioned_computation_handle.cc
@@ -0,0 +1,32 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/versioned_computation_handle.h"
+
+#include "tensorflow/core/lib/strings/strcat.h"
+
+namespace xla {
+
+string VersionedComputationHandle::ToString() const {
+  return tensorflow::strings::StrCat(handle.handle(), ":v", version);
+}
+
+std::ostream& operator<<(std::ostream& out,
+                         const VersionedComputationHandle& versioned_handle) {
+  out << versioned_handle.ToString();
+  return out;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/versioned_computation_handle.h b/tensorflow/compiler/xla/service/versioned_computation_handle.h
index 03bee3d4a5f..5732a56caff 100644
--- a/tensorflow/compiler/xla/service/versioned_computation_handle.h
+++ b/tensorflow/compiler/xla/service/versioned_computation_handle.h
@@ -16,8 +16,10 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_VERSIONED_COMPUTATION_HANDLE_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_VERSIONED_COMPUTATION_HANDLE_H_

+#include <ostream>
+
+#include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/platform/types.h"

 namespace xla {

@@ -32,6 +34,8 @@ struct VersionedComputationHandle {
   ComputationHandle handle;
   Version version;
+
+  string ToString() const;

   bool operator==(const VersionedComputationHandle& other) const {
     return (handle.handle() == other.handle.handle()) &&
            (version == other.version);
@@ -43,6 +47,9 @@ struct VersionedComputationHandle {
   }
 };

+std::ostream& operator<<(std::ostream& out,
+                         const VersionedComputationHandle& versioned_handle);
+
 }  // namespace xla

 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_VERSIONED_COMPUTATION_HANDLE_H_
diff --git a/tensorflow/compiler/xla/service_interface.h b/tensorflow/compiler/xla/service_interface.h
index fc107480f73..809941d8fe1 100644
--- a/tensorflow/compiler/xla/service_interface.h
+++ b/tensorflow/compiler/xla/service_interface.h
@@ -21,7 +21,10 @@ limitations under the License.

 namespace xla {

-// Defines the interface for an XLA service.
+// Defines the interface for an XLA service on the client side. It abstracts
+// over the actual implementation of the service: the service can be local
+// (running in the same process) or remote, in which case an RPC stub is used
+// as the implementation.
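+//
+// Illustrative usage (editor's sketch; the variable names are hypothetical):
+// a client holds a ServiceInterface* without knowing which backend it got,
+//
+//   std::unique_ptr<ServiceInterface> service = /* local or RPC-backed */;
+//   TF_RETURN_IF_ERROR(service->TransferToServer(&arg, &result));
+//
+// and the calling code is identical in both cases.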
 class ServiceInterface {
  public:
  ServiceInterface() {}
@@ -31,23 +34,19 @@ class ServiceInterface {
   virtual tensorflow::Status TransferToClient(
       const TransferToClientRequest* arg, TransferToClientResponse* result) = 0;

-  virtual tensorflow::Status TransferToClientInProcess(
-      const TransferToClientInProcessRequest* arg,
-      TransferToClientInProcessResponse* result) = 0;
-
   virtual tensorflow::Status TransferToServer(
       const TransferToServerRequest* arg, TransferToServerResponse* result) = 0;

   virtual tensorflow::Status TransferToInfeed(
       const TransferToInfeedRequest* arg, TransferToInfeedResponse* result) = 0;

+  virtual tensorflow::Status TransferFromOutfeed(
+      const TransferFromOutfeedRequest* arg,
+      TransferFromOutfeedResponse* result) = 0;
+
   virtual tensorflow::Status ResetDevice(const ResetDeviceRequest* arg,
                                          ResetDeviceResponse* result) = 0;

-  virtual tensorflow::Status TransferToServerInProcess(
-      const TransferToServerInProcessRequest* arg,
-      TransferToServerInProcessResponse* result) = 0;
-
   virtual tensorflow::Status LoadComputationSnapshot(
       const LoadComputationSnapshotRequest* request,
       LoadComputationSnapshotResponse* result) = 0;
diff --git a/tensorflow/compiler/xla/shape_tree.h b/tensorflow/compiler/xla/shape_tree.h
index 6963a68d10d..cc456df4fce 100644
--- a/tensorflow/compiler/xla/shape_tree.h
+++ b/tensorflow/compiler/xla/shape_tree.h
@@ -33,22 +33,67 @@ limitations under the License.

 namespace xla {

+namespace internal {
+
+// Internal representation of each node in a ShapeTree.
+template <typename T>
+struct ShapeTreeNode {
+  // Data corresponding to this node.
+  T data;
+
+  // Children of this node.
+  std::vector<std::unique_ptr<ShapeTreeNode>> children;
+
+  ShapeTreeNode() = default;
+  explicit ShapeTreeNode(const T& data) : data(data) {}
+
+  ShapeTreeNode(const ShapeTreeNode& other)
+      : data(other.data), children(other.children.size()) {
+    for (size_t i = 0; i < children.size(); ++i) {
+      children[i] = MakeUnique<ShapeTreeNode>(*other.children[i]);
+    }
+  }
+
+  ShapeTreeNode& operator=(const ShapeTreeNode& other) {
+    if (this != &other) {
+      data = other.data;
+      children.resize(other.children.size());
+      for (size_t i = 0; i < children.size(); ++i) {
+        children[i] = MakeUnique<ShapeTreeNode>(*other.children[i]);
+      }
+    }
+    return *this;
+  }
+};
+
+}  // namespace internal
+
 // A ShapeTree<T> is a recursive data structure which mirrors the structure of a
-// XLA shape and holds a value of type T for each array in the shape. For
-// array shapes, a ShapeTree trivially holds a single value of type T. For tuple
-// shapes which can be an arbitrary tree with arrays at the leaves, a ShapeTree
-// is an identically structured tree with data elements of type T at the leaves.
+// XLA shape and holds a value of type T for each subshape (i.e. tuple or array)
+// in the shape. For array shapes, a ShapeTree trivially holds a single value of
+// type T.
+//
+// For tuple shapes which can be an arbitrary tree with arrays at the leaves, a
+// ShapeTree is an identically structured tree with data elements of type T at
+// every node. I.e. the root is a tuple by definition, all interior nodes are
+// also tuples, and all leaves are arrays.
 //
 // Like the Shape data structure, this is a tree and tuple elements cannot be
-// duplicated. That is, every distinct element position in the Shape has a
-// unique T object.
+// duplicated. That is, every distinct ShapeIndex in the Shape has a unique T
+// object.
 template <typename T>
 class ShapeTree {
  public:
+  // Default constructor creates a tree with a nil shape (i.e. an empty tuple).
+  ShapeTree() : ShapeTree(ShapeUtil::MakeNil()) {}
+
+  // Create ShapeTree with the given shape, and default-constructed T values
+  // for all nodes.
   explicit ShapeTree(const Shape& shape);
+
+  // Create ShapeTree with the given shape, and init_value for all nodes.
   ShapeTree(const Shape& shape, const T& init_value);
-  ShapeTree(const ShapeTree& other);
-  ShapeTree& operator=(const ShapeTree& other);
+
+  ShapeTree(const ShapeTree& other) = default;
+  ShapeTree& operator=(const ShapeTree& other) = default;

   // Returns the data element associated with the array in the shape at the
   // given index (see ShapeUtil::GetSubshape for how indexes are defined).
@@ -56,12 +101,12 @@ class ShapeTree {
   T* mutable_element(const ShapeIndex& index);

   // Return the shape represented with this ShapeTree.
-  const Shape& shape() const { return *shape_; }
+  const Shape& shape() const { return shape_; }

   // Returns true if the node at the given index is a leaf node (an array
   // shape).
   bool IsLeaf(const ShapeIndex& index) const {
-    return Lookup(index).elements_.empty();
+    return Lookup(index)->children.empty();
   }

   // Recursively traverses the shape and calls the given function at each
   //
   // index : the index of the element in the shape. See ShapeUtil::GetSubshape
   //         for definition of index.
-  // is_leaf : Whether this element is a leaf element in the shape. That is,
-  //           whether this index corresponds to an array and not a (nested)
-  //           tuple element.
   // data : The data value at this element.
-  //
-  // If any call to the given function returns a non-OK status, then traversal
-  // is aborted and the status value is returned.
-  using VisitorFunction = std::function<tensorflow::Status(
-      const ShapeIndex& /*index*/, bool /*is_leaf*/, const T& /*data*/)>;
-  tensorflow::Status ForEachElement(VisitorFunction func) const;
+  using VisitorFunction =
+      std::function<void(const ShapeIndex& /*index*/, const T& /*data*/)>;
+  void ForEachElement(const VisitorFunction& func) const;

-  using MutableVisitorFunction = std::function<tensorflow::Status(
-      const ShapeIndex& /*index*/, bool /*is_leaf*/, T* /*data*/)>;
-  tensorflow::Status ForEachMutableElement(MutableVisitorFunction func);
+  using MutableVisitorFunction =
+      std::function<void(const ShapeIndex& /*index*/, T* /*data*/)>;
+  void ForEachMutableElement(const MutableVisitorFunction& func);
+
+  // Variants of ForEach(Mutable)Element which propagate a Status value from
+  // the visitor.
+  using StatusVisitorFunction =
+      std::function<Status(const ShapeIndex& /*index*/, const T& /*data*/)>;
+  Status ForEachElementWithStatus(const StatusVisitorFunction& func) const;
+
+  using MutableStatusVisitorFunction =
+      std::function<Status(const ShapeIndex& /*index*/, T* /*data*/)>;
+  Status ForEachMutableElementWithStatus(
+      const MutableStatusVisitorFunction& func);
+
+  // Copy the subtree of values from 'other' rooted at ShapeIndex
+  // 'source_base_index' into the subtree of values in this ShapeTree rooted
+  // at 'target_base_index'.
+  //
+  // Precondition: The subshape of other.shape() at index source_base_index
+  // must be compatible with the subshape of shape() at index
+  // target_base_index.
+  void CopySubtreeFrom(const ShapeTree& other,
+                       const ShapeIndex& source_base_index,
+                       const ShapeIndex& target_base_index);
+
+  bool operator==(const ShapeTree& other) const;
+  bool operator!=(const ShapeTree& other) const { return !(*this == other); }

  private:
-  // Private default constructor for non-root nodes of the tree.
-  ShapeTree() = default;
+  using Node = internal::ShapeTreeNode<T>;
+
+  // Initialize node->children based on 'shape'. All children are assigned
+  // the given 'init_value'.
+  void InitChildren(const Shape& shape, const T& init_value, Node* node);
+
+  // Initialize node->children based on 'shape'. All children have
+  // default-constructed data values.
+ void InitChildren(const Shape& shape, Node* node); // Helpers for traversing the shape via ForEachElement. The helpers // recursively traverse the subtree rooted at "index" (defined as in // ShapeUtil::GetSubshape). - static tensorflow::Status ForEachHelperMutable(ShapeIndex* index, - ShapeTree* shape_tree, - MutableVisitorFunction func); - static tensorflow::Status ForEachHelper(ShapeIndex* index, - const ShapeTree& shape_tree, - VisitorFunction func); - - // Copy all the data elements (of type T) from "other" into "this". "this" - // must have the same tree structure as "other" prior to calling this method. - void CopyDataElements(const ShapeTree& other); - - // Recursive helper for constructing a subtree beneath "this" node. - void BuildTree(const Shape& shape); + static Status ForEachHelper(const StatusVisitorFunction& func, + const Node& node, ShapeIndex* index); + static Status ForEachMutableHelper(const MutableStatusVisitorFunction& func, + Node* node, ShapeIndex* index); // Return the tree node at the given index. - ShapeTree& Lookup(const ShapeIndex& index); - const ShapeTree& Lookup(const ShapeIndex& index) const; + Node* Lookup(const ShapeIndex& index); + const Node* Lookup(const ShapeIndex& index) const; - // The data corresponding to the array at this node. - T data_; + // The root node, which contains all other nodes. + Node root_; - // The XLA shape mirrored in this ShapeTree. Only the root of the - // ShapeTree has this member set. - std::unique_ptr shape_; - - // The children of this node in the tree. - std::vector> elements_; + // The XLA shape mirrored in this ShapeTree. + Shape shape_; }; template -void ShapeTree::BuildTree(const Shape& shape) { +void ShapeTree::InitChildren(const Shape& shape, const T& init_value, + Node* node) { if (ShapeUtil::IsTuple(shape)) { for (int i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) { - elements_.emplace_back(new ShapeTree()); - elements_.back()->BuildTree(shape.tuple_shapes(i)); + node->children.emplace_back(new Node(init_value)); + InitChildren(shape.tuple_shapes(i), init_value, + node->children.back().get()); } } } template -ShapeTree::ShapeTree(const Shape& shape) : shape_(MakeUnique(shape)) { - // The shape_ field is just used to hold the structure of the shape. It should - // not be relied upon to store layout information. - LayoutUtil::ClearLayout(shape_.get()); - BuildTree(*shape_); +void ShapeTree::InitChildren(const Shape& shape, Node* node) { + if (ShapeUtil::IsTuple(shape)) { + for (int i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) { + node->children.emplace_back(new Node()); + InitChildren(shape.tuple_shapes(i), node->children.back().get()); + } + } +} + +template +ShapeTree::ShapeTree(const Shape& shape) : root_(), shape_(shape) { + // The shape_ field is just used to hold the structure of the shape. + // It should not be relied upon to store layout information. 
+ LayoutUtil::ClearLayout(&shape_); + InitChildren(shape_, &root_); } template ShapeTree::ShapeTree(const Shape& shape, const T& init_value) - : shape_(MakeUnique(shape)) { - LayoutUtil::ClearLayout(shape_.get()); - BuildTree(*shape_); - TF_CHECK_OK(ForEachMutableElement( - [&init_value](const ShapeIndex& /*index*/, bool /*is_leaf*/, bool* data) { - *data = init_value; - return tensorflow::Status::OK(); - })); -} - -template -ShapeTree::ShapeTree(const ShapeTree& other) - : shape_(MakeUnique(other.shape())) { - LayoutUtil::ClearLayout(shape_.get()); - BuildTree(*shape_); - CopyDataElements(other); -} - -template -ShapeTree& ShapeTree::operator=(const ShapeTree& other) { - if (this == &other) { - return *this; - } - elements_.clear(); - shape_ = MakeUnique(other.shape()); - LayoutUtil::ClearLayout(shape_.get()); - - BuildTree(*shape_); - CopyDataElements(other); - return *this; -} - -template -void ShapeTree::CopyDataElements(const ShapeTree& other) { - CHECK(ShapeUtil::Compatible(shape(), other.shape())); - TF_CHECK_OK(ForEachMutableElement( - [&other](const ShapeIndex& index, bool /*is_leaf*/, T* data) { - *data = other.element(index); - return tensorflow::Status::OK(); - })); + : root_(init_value), shape_(shape) { + // The shape_ field is just used to hold the structure of the shape. + // It should not be relied upon to store layout information. + LayoutUtil::ClearLayout(&shape_); + InitChildren(shape_, init_value, &root_); } template const T& ShapeTree::element(const ShapeIndex& index) const { - return Lookup(index).data_; + return Lookup(index)->data; } template T* ShapeTree::mutable_element(const ShapeIndex& index) { - return &Lookup(index).data_; + return &Lookup(index)->data; } template -ShapeTree& ShapeTree::Lookup(const ShapeIndex& index) { - ShapeTree* node = this; - for (auto& i : index) { +internal::ShapeTreeNode* ShapeTree::Lookup(const ShapeIndex& index) { + Node* node = &root_; + for (const int64 i : index) { CHECK_GE(i, 0); - CHECK_LT(i, node->elements_.size()); - node = node->elements_[i].get(); + CHECK_LT(i, node->children.size()); + node = node->children[i].get(); } - return *node; + return node; } template -const ShapeTree& ShapeTree::Lookup(const ShapeIndex& index) const { - return const_cast*>(this)->Lookup(index); +const internal::ShapeTreeNode* ShapeTree::Lookup( + const ShapeIndex& index) const { + return const_cast(this)->Lookup(index); } /* static */ template -tensorflow::Status ShapeTree::ForEachHelperMutable( - ShapeIndex* index, ShapeTree* shape_tree, - ShapeTree::MutableVisitorFunction func) { - TF_RETURN_IF_ERROR( - func(*index, shape_tree->elements_.empty(), &shape_tree->data_)); - for (int i = 0; i < shape_tree->elements_.size(); ++i) { +Status ShapeTree::ForEachHelper(const StatusVisitorFunction& func, + const Node& node, ShapeIndex* index) { + TF_RETURN_IF_ERROR(func(*index, node.data)); + for (int64 i = 0; i < node.children.size(); ++i) { + index->push_back(i); + TF_RETURN_IF_ERROR(ForEachHelper(func, *node.children[i], index)); + index->pop_back(); + } + return Status::OK(); +} + +/* static */ +template +Status ShapeTree::ForEachMutableHelper( + const MutableStatusVisitorFunction& func, Node* node, ShapeIndex* index) { + TF_RETURN_IF_ERROR(func(*index, &node->data)); + for (int64 i = 0; i < node->children.size(); ++i) { index->push_back(i); TF_RETURN_IF_ERROR( - ForEachHelperMutable(index, shape_tree->elements_[i].get(), func)); + ForEachMutableHelper(func, node->children[i].get(), index)); index->pop_back(); } - - return tensorflow::Status::OK(); -} - 
-/* static */ -template -tensorflow::Status ShapeTree::ForEachHelper( - ShapeIndex* index, const ShapeTree& shape_tree, - ShapeTree::VisitorFunction func) { - TF_RETURN_IF_ERROR( - func(*index, shape_tree.elements_.empty(), shape_tree.data_)); - for (int i = 0; i < shape_tree.elements_.size(); ++i) { - index->push_back(i); - TF_RETURN_IF_ERROR(ForEachHelper(index, *shape_tree.elements_[i], func)); - index->pop_back(); - } - - return tensorflow::Status::OK(); + return Status::OK(); } template -tensorflow::Status ShapeTree::ForEachElement( - ShapeTree::VisitorFunction func) const { +Status ShapeTree::ForEachElementWithStatus( + const StatusVisitorFunction& func) const { ShapeIndex index; - return ForEachHelper(&index, *this, func); + return ForEachHelper(func, root_, &index); } template -tensorflow::Status ShapeTree::ForEachMutableElement( - ShapeTree::MutableVisitorFunction func) { +Status ShapeTree::ForEachMutableElementWithStatus( + const MutableStatusVisitorFunction& func) { ShapeIndex index; - return ForEachHelperMutable(&index, this, func); + return ForEachMutableHelper(func, &root_, &index); +} + +template +void ShapeTree::ForEachElement(const VisitorFunction& func) const { + ShapeIndex index; + return ForEachHelper( + [&func](const ShapeIndex& index, const T& data) { + func(index, data); + return Status::OK(); + }, + root_, &index) + .IgnoreError(); +} + +template +void ShapeTree::ForEachMutableElement(const MutableVisitorFunction& func) { + ShapeIndex index; + return ForEachMutableHelper( + [&func](const ShapeIndex& index, T* data) { + func(index, data); + return Status::OK(); + }, + &root_, &index) + .IgnoreError(); +} + +template +void ShapeTree::CopySubtreeFrom(const ShapeTree& other, + const ShapeIndex& source_base_index, + const ShapeIndex& target_base_index) { + CHECK(ShapeUtil::Compatible( + ShapeUtil::GetSubshape(shape(), target_base_index), + ShapeUtil::GetSubshape(other.shape(), source_base_index))); + ForEachMutableElement([this, &other, &source_base_index, &target_base_index]( + const ShapeIndex& index, T* data) { + // Copy the data element only if index is in the + // subtree rooted at target_base_index. + for (int i = 0; i < target_base_index.size(); ++i) { + if (i >= index.size() || index[i] != target_base_index[i]) { + return; + } + } + // Construct source element index to copy from. + ShapeIndex source_index = source_base_index; + for (int i = target_base_index.size(); i < index.size(); ++i) { + source_index.push_back(index[i]); + } + *data = other.element(source_index); + }); +} + +template +bool ShapeTree::operator==(const ShapeTree& other) const { + bool equal = true; + ForEachElement( + [this, &other, &equal](const ShapeIndex& index, const T& data) { + if (data != other.element(index)) { + equal = false; + } + }); + return equal; } } // namespace xla diff --git a/tensorflow/compiler/xla/shape_tree_test.cc b/tensorflow/compiler/xla/shape_tree_test.cc index d37f536b755..afc3a2b2a34 100644 --- a/tensorflow/compiler/xla/shape_tree_test.cc +++ b/tensorflow/compiler/xla/shape_tree_test.cc @@ -16,8 +16,8 @@ limitations under the License. 
#include "tensorflow/compiler/xla/shape_tree.h" #include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/platform/test.h" namespace xla { namespace { @@ -35,6 +35,9 @@ class ShapeTreeTest : public ::testing::Test { array_shape_})}); } + void TestShapeConstructor(const Shape& shape, int expected_num_nodes); + void TestInitValueConstructor(const Shape& shape, int expected_num_nodes); + // An array shape (non-tuple). Shape array_shape_; @@ -45,6 +48,73 @@ class ShapeTreeTest : public ::testing::Test { Shape nested_tuple_shape_; }; +TEST_F(ShapeTreeTest, DefaultConstructor) { + ShapeTree int_tree; + EXPECT_TRUE(ShapeUtil::IsNil(int_tree.shape())); + + ShapeTree bool_tree; + EXPECT_TRUE(ShapeUtil::IsNil(bool_tree.shape())); +} + +void ShapeTreeTest::TestShapeConstructor(const Shape& shape, + int expected_num_nodes) { + ShapeTree int_tree(shape); + int num_nodes = 0; + int_tree.ForEachElement([&num_nodes](const ShapeIndex& /*index*/, int data) { + EXPECT_EQ(0, data); + ++num_nodes; + }); + EXPECT_EQ(expected_num_nodes, num_nodes); + + ShapeTree bool_tree(shape); + num_nodes = 0; + bool_tree.ForEachElement( + [&num_nodes](const ShapeIndex& /*index*/, bool data) { + EXPECT_EQ(false, data); + ++num_nodes; + }); + EXPECT_EQ(expected_num_nodes, num_nodes); +} + +TEST_F(ShapeTreeTest, ShapeConstructor) { + TestShapeConstructor(array_shape_, 1); + TestShapeConstructor(tuple_shape_, 4); + TestShapeConstructor(nested_tuple_shape_, 10); +} + +void ShapeTreeTest::TestInitValueConstructor(const Shape& shape, + int expected_num_nodes) { + ShapeTree tree(shape, 42); + int num_nodes = 0; + tree.ForEachElement([&num_nodes](const ShapeIndex& /*index*/, int data) { + EXPECT_EQ(42, data); + ++num_nodes; + }); + EXPECT_EQ(expected_num_nodes, num_nodes); + + num_nodes = 0; + tree.ForEachMutableElement( + [&num_nodes](const ShapeIndex& /*index*/, int* data) { + EXPECT_EQ(42, *data); + *data = num_nodes; + ++num_nodes; + }); + EXPECT_EQ(expected_num_nodes, num_nodes); + + num_nodes = 0; + tree.ForEachElement([&num_nodes](const ShapeIndex& /*index*/, int data) { + EXPECT_EQ(num_nodes, data); + ++num_nodes; + }); + EXPECT_EQ(expected_num_nodes, num_nodes); +} + +TEST_F(ShapeTreeTest, InitValueConstructor) { + TestInitValueConstructor(array_shape_, 1); + TestInitValueConstructor(tuple_shape_, 4); + TestInitValueConstructor(nested_tuple_shape_, 10); +} + TEST_F(ShapeTreeTest, ArrayShape) { ShapeTree shape_tree{array_shape_}; *shape_tree.mutable_element({}) = 42; @@ -57,6 +127,15 @@ TEST_F(ShapeTreeTest, ArrayShape) { // Test the copy constructor. ShapeTree copy{shape_tree}; EXPECT_EQ(123, copy.element({})); + + // Mutate the copy, and ensure the original doesn't change. + *copy.mutable_element({}) = 99; + EXPECT_EQ(99, copy.element({})); + EXPECT_EQ(123, shape_tree.element({})); + + // Test the assignment operator. + copy = shape_tree; + EXPECT_EQ(123, copy.element({})); } TEST_F(ShapeTreeTest, TupleShape) { @@ -74,11 +153,8 @@ TEST_F(ShapeTreeTest, TupleShape) { // Sum all elements in the shape. int sum = 0; - TF_CHECK_OK(shape_tree.ForEachElement( - [&sum](const ShapeIndex& /*index*/, bool /*is_leaf*/, int data) { - sum += data; - return tensorflow::Status::OK(); - })); + shape_tree.ForEachElement( + [&sum](const ShapeIndex& /*index*/, int data) { sum += data; }); EXPECT_EQ(66, sum); // Test the copy constructor. 
@@ -89,15 +165,23 @@ TEST_F(ShapeTreeTest, TupleShape) { EXPECT_EQ(-100, copy.element({2})); // Write zero to all data elements. - TF_CHECK_OK(shape_tree.ForEachMutableElement( - [&sum](const ShapeIndex& /*index*/, bool /*is_leaf*/, int* data) { - *data = 0; - return tensorflow::Status::OK(); - })); + shape_tree.ForEachMutableElement( + [&sum](const ShapeIndex& /*index*/, int* data) { *data = 0; }); EXPECT_EQ(0, shape_tree.element({})); EXPECT_EQ(0, shape_tree.element({0})); EXPECT_EQ(0, shape_tree.element({1})); EXPECT_EQ(0, shape_tree.element({2})); + EXPECT_EQ(1, copy.element({})); + EXPECT_EQ(42, copy.element({0})); + EXPECT_EQ(123, copy.element({1})); + EXPECT_EQ(-100, copy.element({2})); + + // Test the assignment operator. + copy = shape_tree; + EXPECT_EQ(0, copy.element({})); + EXPECT_EQ(0, copy.element({0})); + EXPECT_EQ(0, copy.element({1})); + EXPECT_EQ(0, copy.element({2})); } TEST_F(ShapeTreeTest, NestedTupleShape) { @@ -116,6 +200,23 @@ TEST_F(ShapeTreeTest, NestedTupleShape) { EXPECT_EQ(42, copy.element({0})); EXPECT_EQ(123, copy.element({1, 1})); EXPECT_EQ(-100, copy.element({2, 0, 1})); + + // Mutate the copy, and ensure the original doesn't change. + *copy.mutable_element({0}) = 1; + *copy.mutable_element({1, 1}) = 2; + *copy.mutable_element({2, 0, 1}) = 3; + EXPECT_EQ(1, copy.element({0})); + EXPECT_EQ(2, copy.element({1, 1})); + EXPECT_EQ(3, copy.element({2, 0, 1})); + EXPECT_EQ(42, shape_tree.element({0})); + EXPECT_EQ(123, shape_tree.element({1, 1})); + EXPECT_EQ(-100, shape_tree.element({2, 0, 1})); + + // Test the assignment operator. + copy = shape_tree; + EXPECT_EQ(42, copy.element({0})); + EXPECT_EQ(123, copy.element({1, 1})); + EXPECT_EQ(-100, copy.element({2, 0, 1})); } TEST_F(ShapeTreeTest, InvalidIndexingTuple) { @@ -130,5 +231,139 @@ TEST_F(ShapeTreeTest, InvalidIndexingNestedTuple) { EXPECT_DEATH(shape_tree.element({0, 0}), ""); } +TEST_F(ShapeTreeTest, ShapeTreeOfNonCopyableType) { + ShapeTree> shape_tree{tuple_shape_}; + EXPECT_EQ(shape_tree.element({2}).get(), nullptr); + *shape_tree.mutable_element({2}) = MakeUnique(42); + EXPECT_EQ(*shape_tree.element({2}), 42); +} + +TEST_F(ShapeTreeTest, CopySubtreeFromArrayShape) { + // Test CopySubtreeFrom method for a single value copied between array-shaped + // ShapeTrees. + ShapeTree source(array_shape_); + *source.mutable_element(/*index=*/{}) = 42; + ShapeTree destination(array_shape_, 123); + + EXPECT_EQ(destination.element(/*index=*/{}), 123); + destination.CopySubtreeFrom(source, /*source_base_index=*/{}, + /*target_base_index=*/{}); + EXPECT_EQ(destination.element(/*index=*/{}), 42); +} + +TEST_F(ShapeTreeTest, FullCopySubtreeFromTupleShape) { + // Test CopySubtreeFrom method for a copy of all elements from one + // tuple-shaped ShapeTree to another. + ShapeTree source(tuple_shape_); + *source.mutable_element(/*index=*/{}) = 10; + *source.mutable_element(/*index=*/{0}) = 11; + *source.mutable_element(/*index=*/{1}) = 12; + *source.mutable_element(/*index=*/{2}) = 13; + + ShapeTree destination(tuple_shape_, 0); + + destination.CopySubtreeFrom(source, /*source_base_index=*/{}, + /*target_base_index=*/{}); + EXPECT_EQ(destination.element(/*index=*/{}), 10); + EXPECT_EQ(destination.element(/*index=*/{0}), 11); + EXPECT_EQ(destination.element(/*index=*/{1}), 12); + EXPECT_EQ(destination.element(/*index=*/{2}), 13); +} + +TEST_F(ShapeTreeTest, SingleElementCopySubtreeFromTupleShape) { + // Test CopySubtreeFrom method for a copy of a single element from one + // tuple-shaped ShapeTree to another. 
+ ShapeTree source(tuple_shape_); + *source.mutable_element(/*index=*/{}) = 10; + *source.mutable_element(/*index=*/{0}) = 11; + *source.mutable_element(/*index=*/{1}) = 12; + *source.mutable_element(/*index=*/{2}) = 13; + + ShapeTree destination(tuple_shape_, 0); + + destination.CopySubtreeFrom(source, /*source_base_index=*/{0}, + /*target_base_index=*/{1}); + EXPECT_EQ(destination.element(/*index=*/{}), 0); + EXPECT_EQ(destination.element(/*index=*/{0}), 0); + EXPECT_EQ(destination.element(/*index=*/{1}), 11); + EXPECT_EQ(destination.element(/*index=*/{2}), 0); +} + +TEST_F(ShapeTreeTest, CopySubtreeIntoNestedShape) { + // Test CopySubtreeFrom method for a copy of a tuple-shaped ShapeTree into a + // nested-tuple-shaped ShapeTree. + ShapeTree source( + ShapeUtil::MakeTupleShape({array_shape_, array_shape_})); + *source.mutable_element(/*index=*/{}) = 10; + *source.mutable_element(/*index=*/{0}) = 11; + *source.mutable_element(/*index=*/{1}) = 12; + + ShapeTree destination(nested_tuple_shape_, 0); + + destination.CopySubtreeFrom(source, /*source_base_index=*/{}, + /*target_base_index=*/{2, 0}); + + EXPECT_EQ(destination.element(/*index=*/{}), 0); + EXPECT_EQ(destination.element(/*index=*/{0}), 0); + EXPECT_EQ(destination.element(/*index=*/{1}), 0); + EXPECT_EQ(destination.element(/*index=*/{1, 0}), 0); + EXPECT_EQ(destination.element(/*index=*/{1, 1}), 0); + EXPECT_EQ(destination.element(/*index=*/{2}), 0); + EXPECT_EQ(destination.element(/*index=*/{2, 0}), 10); + EXPECT_EQ(destination.element(/*index=*/{2, 0, 0}), 11); + EXPECT_EQ(destination.element(/*index=*/{2, 0, 1}), 12); + EXPECT_EQ(destination.element(/*index=*/{2, 1}), 0); +} + +TEST_F(ShapeTreeTest, CopySubtreeFromNestedShape) { + // Test CopySubtreeFrom method for a copy from a nested-tuple-shape. + ShapeTree source(nested_tuple_shape_, 42); + *source.mutable_element(/*index=*/{1}) = 10; + *source.mutable_element(/*index=*/{1, 0}) = 11; + *source.mutable_element(/*index=*/{1, 1}) = 12; + + ShapeTree destination( + ShapeUtil::MakeTupleShape({array_shape_, array_shape_}), 0); + + destination.CopySubtreeFrom(source, /*source_base_index=*/{1}, + /*target_base_index=*/{}); + + EXPECT_EQ(destination.element(/*index=*/{}), 10); + EXPECT_EQ(destination.element(/*index=*/{0}), 11); + EXPECT_EQ(destination.element(/*index=*/{1}), 12); +} + +TEST_F(ShapeTreeTest, OperatorEquals) { + { + ShapeTree a(array_shape_, 123); + ShapeTree b(array_shape_, 42); + ShapeTree c(array_shape_, 42); + EXPECT_FALSE(a == b); + EXPECT_TRUE(a != b); + EXPECT_TRUE(b == c); + } + { + ShapeTree a(tuple_shape_); + *a.mutable_element(/*index=*/{}) = 10; + *a.mutable_element(/*index=*/{0}) = 11; + *a.mutable_element(/*index=*/{1}) = 12; + + ShapeTree b(tuple_shape_); + *b.mutable_element(/*index=*/{}) = 10; + *b.mutable_element(/*index=*/{0}) = 42; + *b.mutable_element(/*index=*/{1}) = 11; + + ShapeTree c(tuple_shape_); + *c.mutable_element(/*index=*/{}) = 10; + *c.mutable_element(/*index=*/{0}) = 42; + *c.mutable_element(/*index=*/{1}) = 11; + + EXPECT_FALSE(a == b); + EXPECT_TRUE(a != b); + EXPECT_TRUE(b == c); + EXPECT_FALSE(b != c); + } +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc index 4acdd71d173..ee49a9ae5f5 100644 --- a/tensorflow/compiler/xla/shape_util.cc +++ b/tensorflow/compiler/xla/shape_util.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include #include #include +#include #include #include "tensorflow/compiler/xla/index_util.h" @@ -28,6 +29,7 @@ limitations under the License. #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/lib/gtl/optional.h" #include "tensorflow/core/lib/strings/numbers.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" @@ -37,6 +39,16 @@ limitations under the License. namespace xla { +string ShapeIndex::ToString() const { + return tensorflow::strings::StrCat( + "{", tensorflow::str_util::Join(indices_, ","), "}"); +} + +std::ostream& operator<<(std::ostream& out, const ShapeIndex& shape_index) { + out << shape_index.ToString(); + return out; +} + namespace { // Recursive helper for comparing the equality of two shapes. Returns true if @@ -44,18 +56,11 @@ namespace { // match. bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts) { if (ShapeUtil::IsTuple(lhs)) { - if (!ShapeUtil::IsTuple(rhs)) { - VLOG(3) << "CompareShapes: lhs is a tuple, rhs not a tuple"; - return false; - } - - if (!ContainersEqual(lhs.tuple_shapes(), rhs.tuple_shapes(), - [=](const Shape& l, const Shape& r) { - return CompareShapes(l, r, compare_layouts); - })) { - VLOG(3) << "CompareShapes: tuples on lhs and rhs not equal"; - return false; - } + return ShapeUtil::IsTuple(rhs) && + ContainersEqual(lhs.tuple_shapes(), rhs.tuple_shapes(), + [=](const Shape& l, const Shape& r) { + return CompareShapes(l, r, compare_layouts); + }); } // Explicitly compare the fields rather than using MessageDifferencer because // we want empty layouts to be treated identically to missing layouts. @@ -117,7 +122,7 @@ bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts) { for (const auto& shape : parameters) { *program_shape.add_parameters() = shape; } - *program_shape.mutable_result() = result; + *program_shape.mutable_result() = std::move(result); return program_shape; } @@ -197,7 +202,7 @@ bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts) { } /* static */ void ShapeUtil::AppendMajorDimension(int bound, Shape* shape) { - shape->mutable_layout()->add_minor_to_major(ShapeUtil::Rank(*shape)); + shape->mutable_layout()->add_minor_to_major(Rank(*shape)); shape->add_dimensions(bound); TF_DCHECK_OK(ValidateShape(*shape)); } @@ -290,7 +295,7 @@ bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts) { std::vector new_elements(tuple.tuple_shapes().begin() + start, tuple.tuple_shapes().begin() + limit); - return ShapeUtil::MakeTupleShape(new_elements); + return MakeTupleShape(new_elements); } /* static */ bool ShapeUtil::IsOpaque(const Shape& shape) { @@ -304,7 +309,7 @@ bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts) { if (shape.element_type() != element_type) { return false; } - if (shape.dimensions_size() != ShapeUtil::Rank(shape)) { + if (shape.dimensions_size() != Rank(shape)) { return false; } int64 i = 0; @@ -318,7 +323,7 @@ bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts) { } /* static */ int64 ShapeUtil::ElementsIn(const Shape& shape) { - CHECK_EQ(shape.dimensions_size(), ShapeUtil::Rank(shape)); + CHECK_EQ(shape.dimensions_size(), Rank(shape)); return std::accumulate( shape.dimensions().begin(), shape.dimensions().end(), 1LL, std::multiplies()); @@ -329,7 +334,7 @@ bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts) 
{ } /* static */ bool ShapeUtil::IsScalarF32(const Shape& shape) { - return shape.element_type() == F32 && ShapeUtil::Rank(shape) == 0; + return shape.element_type() == F32 && Rank(shape) == 0; } /* static */ string ShapeUtil::HumanString(const Shape& shape) { @@ -427,13 +432,12 @@ bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts) { } Shape result; if (layout_string.empty()) { - result = ShapeUtil::MakeShape(primitive_type, dimensions); + result = MakeShape(primitive_type, dimensions); } else { TF_ASSIGN_OR_RETURN(std::vector min2maj, comma_list_to_int64s(layout_string)); TF_RET_CHECK(dimensions.size() == min2maj.size()); - result = - ShapeUtil::MakeShapeWithLayout(primitive_type, dimensions, min2maj); + result = MakeShapeWithLayout(primitive_type, dimensions, min2maj); } TF_DCHECK_OK(ValidateShape(result)); return result; @@ -463,7 +467,7 @@ bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts) { /* static */ int64 ShapeUtil::GetDimensionNumber(const Shape& shape, int64 dimension_number) { if (dimension_number < 0) { - dimension_number += ShapeUtil::Rank(shape); + dimension_number += Rank(shape); } CHECK_GE(dimension_number, 0); return dimension_number; @@ -515,7 +519,7 @@ bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts) { } int64 allocated_element_count; if (shape.layout().padded_dimensions_size() > 0) { - CHECK_EQ(ShapeUtil::Rank(shape), shape.layout().padded_dimensions_size()); + CHECK_EQ(Rank(shape), shape.layout().padded_dimensions_size()); allocated_element_count = 1; for (int64 dimension_size : shape.layout().padded_dimensions()) { allocated_element_count *= dimension_size; @@ -531,9 +535,9 @@ bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts) { const Shape& shape) { if (shape.element_type() == TUPLE) { // Tuple shape. - if (ShapeUtil::Rank(shape) != 0) { + if (Rank(shape) != 0) { return InvalidArgument("tuples must be rank-0; got rank %lld", - ShapeUtil::Rank(shape)); + Rank(shape)); } if (shape.dimensions_size() != 0) { return InvalidArgument("tuples must not have dimensions specified"); @@ -553,13 +557,13 @@ bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts) { return InvalidArgument("shape has invalid element type: %s", shape.ShortDebugString().c_str()); } - if (ShapeUtil::Rank(shape) != shape.dimensions_size()) { + if (Rank(shape) != shape.dimensions_size()) { return InvalidArgument( "shape's rank is mismatched with dimension count; rank=%lld " "dimensions_size=%d", - ShapeUtil::Rank(shape), shape.dimensions_size()); + Rank(shape), shape.dimensions_size()); } - for (int64 i = 0; i < ShapeUtil::Rank(shape); ++i) { + for (int64 i = 0; i < Rank(shape); ++i) { int64 dimension = shape.dimensions(i); if (dimension < 0) { return InvalidArgument( @@ -614,6 +618,11 @@ bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts) { return return_shape; } +/* static */ +bool ShapeUtil::IsLeafIndex(const Shape& shape, const ShapeIndex& index) { + return !IsTuple(GetSubshape(shape, index)); +} + /* static */ Shape ShapeUtil::StripDegenerateDimensions(const Shape& shape) { std::vector dimension_sizes; std::vector degenerate_dimensions; @@ -672,7 +681,7 @@ namespace { // Helper for ForEachSubshape which visits the subshapes of the given shape in // DFS pre-order starting with the index. 
 Status ForEachSubshapeHelper(const Shape& shape,
-                             const ShapeUtil::VisitorFunction func,
+                             const ShapeUtil::StatusVisitorFunction& func,
                              ShapeIndex* index) {
   TF_RETURN_IF_ERROR(func(shape, *index));
   if (ShapeUtil::IsTuple(shape)) {
@@ -689,7 +698,7 @@ Status ForEachSubshapeHelper(const Shape& shape,
 // Helper for ForEachMutableSubshape which visits the subshapes of the given
 // shape in DFS pre-order starting with the index.
 Status ForEachMutableSubshapeHelper(
-    Shape* shape, const ShapeUtil::MutatingVisitorFunction func,
+    Shape* shape, const ShapeUtil::MutatingStatusVisitorFunction& func,
     ShapeIndex* index) {
   TF_RETURN_IF_ERROR(func(shape, *index));
   if (ShapeUtil::IsTuple(*shape)) {
@@ -705,14 +714,40 @@ Status ForEachMutableSubshapeHelper(
 }  // namespace

-/* static */ Status ShapeUtil::ForEachSubshape(const Shape& shape,
-                                               VisitorFunction func) {
+/* static */ void ShapeUtil::ForEachSubshape(const Shape& shape,
+                                             const VisitorFunction& func) {
+  ShapeIndex index;
+  ForEachSubshapeHelper(
+      shape,
+      [&func](const Shape& subshape, const ShapeIndex& index) {
+        func(subshape, index);
+        return Status::OK();
+      },
+      &index)
+      .IgnoreError();
+}
+
+/* static */ void ShapeUtil::ForEachMutableSubshape(
+    Shape* shape, const MutatingVisitorFunction& func) {
+  ShapeIndex index;
+  ForEachMutableSubshapeHelper(
+      shape,
+      [&func](Shape* subshape, const ShapeIndex& index) {
+        func(subshape, index);
+        return Status::OK();
+      },
+      &index)
+      .IgnoreError();
+}
+
+/* static */ Status ShapeUtil::ForEachSubshapeWithStatus(
+    const Shape& shape, const StatusVisitorFunction& func) {
   ShapeIndex index;
   return ForEachSubshapeHelper(shape, func, &index);
 }

-/* static */ Status ShapeUtil::ForEachMutableSubshape(
-    Shape* shape, MutatingVisitorFunction func) {
+/* static */ Status ShapeUtil::ForEachMutableSubshapeWithStatus(
+    Shape* shape, const MutatingStatusVisitorFunction& func) {
   ShapeIndex index;
   return ForEachMutableSubshapeHelper(shape, func, &index);
 }
@@ -725,9 +760,17 @@ Status ForEachMutableSubshapeHelper(
     new_shape.add_dimensions(dim);
   }
   if (shape.has_layout()) {
-    new_shape.mutable_layout()->clear_minor_to_major();
+    Layout* new_layout = new_shape.mutable_layout();
+    new_layout->clear_minor_to_major();
     for (auto index : Permute(permutation, shape.layout().minor_to_major())) {
-      new_shape.mutable_layout()->add_minor_to_major(index);
+      new_layout->add_minor_to_major(index);
+    }
+    if (shape.layout().padded_dimensions_size() > 0) {
+      new_layout->clear_padded_dimensions();
+      for (auto dim :
+           Permute(permutation, shape.layout().padded_dimensions())) {
+        new_layout->add_padded_dimensions(dim);
+      }
     }
   }
   return new_shape;
@@ -744,27 +787,28 @@ ShapeUtil::InsertedOrDeleted1SizedDimensions(const Shape& shape_pre,
   // and unmodified_dim_pair have size >1. Otherwise, returns true and appends
   // the degenerate input/output dimensions in the gap to
   // deleted_indices/inserted_indices respectively.
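   // For example (editor's illustration): reshaping f32[2,1,5] to f32[2,5,1]
   // deletes the size-1 input dimension 1 and inserts the size-1 output
   // dimension 2.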
- auto check_modified_dims = [&shape_pre, &shape_post, &deleted_indices, - &inserted_indices]( - std::pair prior_unmodified_dim_pair, - std::pair unmodified_dim_pair) { - for (int64 modified_input_dim = prior_unmodified_dim_pair.first + 1; - modified_input_dim < unmodified_dim_pair.first; ++modified_input_dim) { - if (shape_pre.dimensions(modified_input_dim) > 1) { - return false; - } - deleted_indices.push_back(modified_input_dim); - } - for (int64 modified_output_dim = prior_unmodified_dim_pair.second + 1; - modified_output_dim < unmodified_dim_pair.second; - ++modified_output_dim) { - if (shape_post.dimensions(modified_output_dim) > 1) { - return false; - } - inserted_indices.push_back(modified_output_dim); - } - return true; - }; + auto check_modified_dims = + [&shape_pre, &shape_post, &deleted_indices, &inserted_indices]( + std::pair prior_unmodified_dim_pair, + std::pair unmodified_dim_pair) { + for (int64 modified_input_dim = prior_unmodified_dim_pair.first + 1; + modified_input_dim < unmodified_dim_pair.first; + ++modified_input_dim) { + if (shape_pre.dimensions(modified_input_dim) > 1) { + return false; + } + deleted_indices.push_back(modified_input_dim); + } + for (int64 modified_output_dim = prior_unmodified_dim_pair.second + 1; + modified_output_dim < unmodified_dim_pair.second; + ++modified_output_dim) { + if (shape_post.dimensions(modified_output_dim) > 1) { + return false; + } + inserted_indices.push_back(modified_output_dim); + } + return true; + }; std::vector> unmodified_dims = DimensionsUnmodifiedByReshape(shape_pre, shape_post); @@ -780,8 +824,7 @@ ShapeUtil::InsertedOrDeleted1SizedDimensions(const Shape& shape_pre, auto unmodified_dim_pair = i < unmodified_dims.size() ? unmodified_dims[i] - : std::make_pair(ShapeUtil::Rank(shape_pre), - ShapeUtil::Rank(shape_post)); + : std::make_pair(Rank(shape_pre), Rank(shape_post)); if (!check_modified_dims(prior_unmodified_dim_pair, unmodified_dim_pair)) { return nil; } @@ -856,9 +899,8 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape, return false; } - CHECK_EQ(ShapeUtil::ElementsIn(input_shape), - ShapeUtil::ElementsIn(output_shape)); - if (ShapeUtil::ElementsIn(input_shape) == 0) { + CHECK_EQ(ElementsIn(input_shape), ElementsIn(output_shape)); + if (ElementsIn(input_shape) == 0) { return true; } @@ -972,21 +1014,17 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape, // as input_shape/output_shape and the dimension-0-major layout. These two // shapes are used for conversion between logical linear indices and // multi-dimensional indices. 
@@ -972,21 +1014,17 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape, // as input_shape/output_shape and the dimension-0-major layout. These two // shapes are used for conversion between logical linear indices and // multi-dimensional indices. - Shape input_shape_dim0_major = - ShapeUtil::MakeShapeWithMonotonicDim0MajorLayout( - input_shape.element_type(), AsInt64Slice(input_shape.dimensions())); - Shape output_shape_dim0_major = - ShapeUtil::MakeShapeWithMonotonicDim0MajorLayout( - output_shape.element_type(), - AsInt64Slice(output_shape.dimensions())); + Shape input_shape_dim0_major = MakeShapeWithMonotonicDim0MajorLayout( + input_shape.element_type(), AsInt64Slice(input_shape.dimensions())); + Shape output_shape_dim0_major = MakeShapeWithMonotonicDim0MajorLayout( + output_shape.element_type(), AsInt64Slice(output_shape.dimensions())); - for (int64 input_dim = 0; input_dim < ShapeUtil::Rank(input_shape); - ++input_dim) { + for (int64 input_dim = 0; input_dim < Rank(input_shape); ++input_dim) { if (input_shape.dimensions(input_dim) <= 1) { continue; } - std::vector<int64> input_unit_index(ShapeUtil::Rank(input_shape), 0); + std::vector<int64> input_unit_index(Rank(input_shape), 0); input_unit_index[input_dim] = 1; int64 logical_linear_index = IndexUtil::MultidimensionalIndexToLinearIndex(input_shape_dim0_major, @@ -1010,6 +1048,140 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape, check_input_unit_indices(output_shape, input_shape); } +/* static */ tensorflow::gtl::optional<Shape> ShapeUtil::AlignLayouts( + const Shape& input_shape, const Shape& output_shape) { + int64 input_rank = Rank(input_shape); + int64 output_rank = Rank(output_shape); + + // First, calculate an alignment of the dimensions. A consecutive sequence of + // input dimensions and output dimensions belong to the same alignment part if + // the products of their dimension bounds are the same. In the easiest case, + // an alignment part consists of one input dimension and one output dimension + // which both have the same dimension bound. An alignment part specifies which + // dimensions need to be kept together in a physical layout if we want a + // reshape to be a bitcast. The order of the alignment parts is defined by the + // physical layout of the input shape, so when we construct the layout for the + // output shape we just process the alignment parts in this order, and then + // lay out the dimensions belonging to each part in descending (major to minor) + // order. + + // Stores the input and output dimension numbers where each alignment part + // starts. + std::vector<std::pair<int64, int64>> alignment; + alignment.push_back({0, 0}); + + // Stores a mapping from the input dimension to the alignment part it belongs + // to. + std::vector<int64> dimension_to_alignment_index(input_rank); + int64 input_dimension_product = 1, output_dimension_product = 1; + for (int64 i = 0, j = 0; i < input_rank || j < output_rank;) { + // Check if we have reached the end of an alignment part. + if (input_dimension_product == output_dimension_product && + input_dimension_product > 1) { + alignment.push_back({i, j}); + input_dimension_product = output_dimension_product = 1; + } + if (input_dimension_product < output_dimension_product || + j == output_rank) { + if (i == input_rank) { + return tensorflow::gtl::nullopt; + } + dimension_to_alignment_index[i] = alignment.size() - 1; + input_dimension_product *= input_shape.dimensions(i); + ++i; + } else { + output_dimension_product *= output_shape.dimensions(j); + ++j; + } + } + if (input_dimension_product != output_dimension_product) { + return tensorflow::gtl::nullopt; + } + // We also need to store an end element so that we know where the last + // alignment part ends.
+ alignment.push_back({input_rank, output_rank}); + + // Now check if the physical layout can potentially be aligned to the output + // shape by changing the physical layout of the output shape. We need to check + // that all dimension numbers that belong to the same alignment part appear + // consecutively, and are in descending order. However, we can ignore any + // trivial dimension bounds of 1, because they can be placed anywhere. + auto input_dimension_numbers = input_shape.layout().minor_to_major(); + std::vector<int64> output_layout; + output_layout.reserve(output_rank); + for (int64 i = 0; i < input_rank;) { + int64 current_dimension_number = input_dimension_numbers[i]; + + // Skip trivial dimensions with a bound of 1. + if (input_shape.dimensions(current_dimension_number) == 1) { + ++i; + continue; + } + + // Calculate the number of non-trivial dimension bounds in the input shape + // belonging to the current alignment part. + const int64 current_alignment_index = + dimension_to_alignment_index[current_dimension_number]; + // Because of the special end element that we added, we can be sure that + // 'current_alignment_index' is < alignment.size() - 1. + CHECK_LT(current_alignment_index, alignment.size() - 1); + int64 num_non_trivial_dimensions_in_alignment_part = 0; + for (int64 j = alignment[current_alignment_index].first; + j < alignment[current_alignment_index + 1].first; ++j) { + if (input_shape.dimensions(j) != 1) { + ++num_non_trivial_dimensions_in_alignment_part; + } + } + + // Check that the following 'num_non_trivial_dimensions_in_alignment_part' + // dimension numbers (ignoring dimension numbers with dimension bound 1) are + // in descending order and belong to the current alignment part. + for (int64 j = 0; j < num_non_trivial_dimensions_in_alignment_part; + ++i, ++j) { + if (i == input_rank) { + return tensorflow::gtl::nullopt; + } + // Skip trivial dimensions with a bound of 1. + if (input_shape.dimensions(input_dimension_numbers[i]) == 1) { + --j; + continue; + } + // If the current dimension number belongs to a different alignment part, + // or the dimension numbers are not in descending order, we can return + // early. + if (dimension_to_alignment_index[input_dimension_numbers[i]] != + current_alignment_index || + input_dimension_numbers[i] > current_dimension_number) { + return tensorflow::gtl::nullopt; + } + current_dimension_number = input_dimension_numbers[i]; + } + + // The output dimension numbers that belong to the current alignment part + // need to appear in the same descending order as in the input. Again, we + // can skip dimensions with a bound of 1. + for (int64 j = alignment[current_alignment_index + 1].second - 1; + j >= alignment[current_alignment_index].second; --j) { + if (output_shape.dimensions(j) != 1) { + output_layout.push_back(j); + } + } + } + // Now add all the dimensions with dimension bound 1 at the end of + // 'output_layout'.
+ for (int64 i = 0; i < output_rank; ++i) { + if (output_shape.dimensions(i) == 1) { + output_layout.push_back(i); + } + } + CHECK_EQ(output_layout.size(), output_rank); + Shape output_shape_with_layout = MakeShapeWithLayout( + output_shape.element_type(), AsInt64Slice(output_shape.dimensions()), + output_layout); + CHECK(ReshapeIsBitcast(input_shape, output_shape_with_layout)); + return output_shape_with_layout; +} + /* static */ Shape ShapeUtil::DeleteDimension(int64 dim_to_delete, Shape shape) { shape.mutable_dimensions()->erase(shape.dimensions().begin() + dim_to_delete); @@ -1044,4 +1216,34 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape, return shape; } +/* static */ void ShapeUtil::ForEachIndex( + const Shape& shape, tensorflow::gtl::ArraySlice<int64> base, + tensorflow::gtl::ArraySlice<int64> count, + tensorflow::gtl::ArraySlice<int64> incr, + const IndexVisitorFunction& visitor_function) { + if (ShapeUtil::HasZeroElements(shape)) { + return; + } + DCHECK_EQ(Rank(shape), base.size()); + DCHECK_EQ(incr.size(), base.size()); + DCHECK_EQ(count.size(), base.size()); + const Layout& layout = shape.layout(); + int64 rank = layout.minor_to_major_size(); + // Allows handling R0 arrays, such that the visitor function will be called + // once with the proper empty indexes. + int64 n = -1; + std::vector<int64> indexes(base.begin(), base.end()); + while (n < rank && visitor_function(indexes)) { + // Increments dimensions in minor to major order. + for (n = 0; n < rank; ++n) { + int64 dim = layout.minor_to_major(n); + indexes[dim] += incr[dim]; + if (indexes[dim] < base[dim] + count[dim]) { + break; + } + indexes[dim] = base[dim]; + } + } +} + } // namespace xla
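Before the header changes below, a small sketch of how the two new entry points compose. The row-major 3x8 input and the flat 24-element output are hypothetical choices, not taken from the patch:

```c++
// AlignLayouts picks a layout for the reshape target that makes the
// reshape a bitcast, keeping the input layout fixed.
Shape input = ShapeUtil::MakeShapeWithLayout(F32, {3, 8}, {1, 0});
auto aligned = ShapeUtil::AlignLayouts(input, ShapeUtil::MakeShape(F32, {24}));
if (aligned) {
  CHECK(ShapeUtil::ReshapeIsBitcast(input, aligned.value()));

  // ForEachIndex then walks all 24 element indices in minor-to-major order;
  // returning true from the visitor continues the iteration.
  int64 visited = 0;
  ShapeUtil::ForEachIndex(aligned.value(), /*base=*/{0}, /*count=*/{24},
                          /*incr=*/{1},
                          [&visited](const std::vector<int64>& index) {
                            ++visited;
                            return true;
                          });
  CHECK_EQ(visited, 24);
}
```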
diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h index 290d993b50a..853be6b4cb8 100644 --- a/tensorflow/compiler/xla/shape_util.h +++ b/tensorflow/compiler/xla/shape_util.h @@ -26,6 +26,7 @@ limitations under the License. #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/gtl/optional.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" @@ -72,11 +73,18 @@ class ShapeIndex { return indices_ == other.indices_; } bool operator!=(const ShapeIndex& other) const { return !(*this == other); } + bool operator<(const ShapeIndex& other) const { + return indices_ < other.indices_; + } + + string ToString() const; private: std::vector<int64> indices_; }; +std::ostream& operator<<(std::ostream& out, const ShapeIndex& shape_index); + // Namespaced collection of (static) shape utilities. // // These are all effectively convenience functions for testing/tweaking proto @@ -220,7 +228,7 @@ class ShapeUtil { // Validates that the provided shape satisfies invariants. static Status ValidateShape(const Shape& shape); - // Validates the the provided shape satisfies invariants, except those that + // Validates that the provided shape satisfies invariants, except those that // pertain to layout. // // Layout is optional for client-provided shapes, so that the compiler may @@ -287,18 +295,31 @@ class ShapeUtil { static const Shape& GetSubshape(const Shape& shape, const ShapeIndex& index); static Shape* GetMutableSubshape(Shape* shape, const ShapeIndex& index); - // Calls the given visitor function for each subshape of the given shape. - // Returns early if an error status is returned. Subshapes are visited in DFS - // pre-order starting with the entire shape (index {}). - using VisitorFunction = std::function<Status(const Shape& /*subshape*/, const ShapeIndex& /*index*/)>; - static Status ForEachSubshape(const Shape& shape, VisitorFunction func); + // Returns whether the given index in the given shape is a leaf element of the + // shape. + static bool IsLeafIndex(const Shape& shape, const ShapeIndex& index); - // Mutating variant of ForEachSubshape. + // Calls the given visitor function for each subshape of the given shape. + // Subshapes are visited in DFS pre-order starting with the entire shape + // (index {}). + using VisitorFunction = std::function<void(const Shape& /*subshape*/, const ShapeIndex& /*index*/)>; + static void ForEachSubshape(const Shape& shape, const VisitorFunction& func); using MutatingVisitorFunction = + std::function<void(Shape* /*subshape*/, const ShapeIndex& /*index*/)>; + static void ForEachMutableSubshape(Shape* shape, + const MutatingVisitorFunction& func); + + // Variants of ForEach(Mutable)Subshape which propagate Status from the + // visitor function. + using StatusVisitorFunction = std::function<Status(const Shape& /*subshape*/, const ShapeIndex& /*index*/)>; + static Status ForEachSubshapeWithStatus(const Shape& shape, + const StatusVisitorFunction& func); + using MutatingStatusVisitorFunction = std::function<Status(Shape* /*subshape*/, const ShapeIndex& /*index*/)>; - static Status ForEachMutableSubshape(Shape* shape, - MutatingVisitorFunction func); + static Status ForEachMutableSubshapeWithStatus( + Shape* shape, const MutatingStatusVisitorFunction& func); // Removes all degenerate dimensions (size one) from the given shape. The // stripped minor_to_major preserves the relative ordering of non-degenerate @@ -370,6 +391,15 @@ class ShapeUtil { static bool ReshapeIsBitcast(const Shape& input_shape, const Shape& output_shape); + // Finds a physical layout for 'output_shape' such that + // ShapeUtil::ReshapeIsBitcast(input_shape, output_shape_with_layout) returns + // true (where 'output_shape_with_layout' is 'output_shape' with the found + // layout). The layout of 'input_shape' is kept fixed. Returns + // 'output_shape_with_layout' if such a layout can be found, and + // tensorflow::gtl::nullopt otherwise. + static tensorflow::gtl::optional<Shape> AlignLayouts( + const Shape& input_shape, const Shape& output_shape); + // Returns a shape with the given dimension deleted. // For example: // • `DeleteDimension(1, T[m, n, k]) = T[m, k]` @@ -383,6 +413,19 @@ class ShapeUtil { static Shape FilterDimensions(const std::function<bool(int64)>& p, Shape shape); + // Iterates through all the shape indexes, in minor to major order, starting + // from the base indexes, incrementing by the incr steps, up to count + // (index[i] < base[i] + count[i]), and calls the visitor_function with the + // current index. + // The visitor function should return true if it wants to continue, or false + // otherwise. + using IndexVisitorFunction = std::function<bool(const std::vector<int64>&)>; + static void ForEachIndex(const Shape& shape, + tensorflow::gtl::ArraySlice<int64> base, + tensorflow::gtl::ArraySlice<int64> count, + tensorflow::gtl::ArraySlice<int64> incr, + const IndexVisitorFunction& visitor_function); + private: // Validates all of the non-layout properties of the shape -- this is a helper // used by both the layout-optional and layout-required public methods. diff --git a/tensorflow/compiler/xla/shape_util_test.cc b/tensorflow/compiler/xla/shape_util_test.cc index 9e6b243611b..69ef6175ccd 100644 --- a/tensorflow/compiler/xla/shape_util_test.cc +++ b/tensorflow/compiler/xla/shape_util_test.cc @@ -16,14 +16,17 @@ limitations under the License.
#include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/layout_util.h" +#include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" -#include "tensorflow/core/platform/test.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" namespace xla { namespace { +using ::testing::ElementsAre; + TEST(ShapeUtilTest, GetDimensionHelperCanNegativeIndex) { Shape matrix = ShapeUtil::MakeShape(F32, {2, 3}); EXPECT_EQ(3, ShapeUtil::GetDimension(matrix, -1)); @@ -319,6 +322,30 @@ TEST(ShapeUtilTest, GetSubshape) { ShapeUtil::GetSubshape(nested_tuple_shape, {2, 0}))); } +TEST(ShapeUtilTest, IsLeafIndex) { + // Test array shape. + Shape array_shape = ShapeUtil::MakeShape(F32, {42, 42, 123}); + EXPECT_TRUE(ShapeUtil::IsLeafIndex(array_shape, {})); + + // Test tuple shape. + Shape tuple_shape = ShapeUtil::MakeTupleShape({array_shape, array_shape}); + EXPECT_FALSE(ShapeUtil::IsLeafIndex(tuple_shape, {})); + EXPECT_TRUE(ShapeUtil::IsLeafIndex(tuple_shape, {0})); + EXPECT_TRUE(ShapeUtil::IsLeafIndex(tuple_shape, {1})); + + // Test nested tuple shape. + Shape nested_tuple_shape = ShapeUtil::MakeTupleShape( + {array_shape, ShapeUtil::MakeTupleShape({array_shape, array_shape}), + ShapeUtil::MakeTupleShape( + {ShapeUtil::MakeTupleShape({array_shape, array_shape}), + array_shape})}); + EXPECT_FALSE(ShapeUtil::IsLeafIndex(nested_tuple_shape, {})); + EXPECT_TRUE(ShapeUtil::IsLeafIndex(nested_tuple_shape, {0})); + EXPECT_FALSE(ShapeUtil::IsLeafIndex(nested_tuple_shape, {1})); + EXPECT_TRUE(ShapeUtil::IsLeafIndex(nested_tuple_shape, {1, 0})); + EXPECT_TRUE(ShapeUtil::IsLeafIndex(nested_tuple_shape, {1, 1})); +} + TEST(ShapeUtilTest, HumanString) { Shape opaque = ShapeUtil::MakeOpaqueShape(); Shape scalar = ShapeUtil::MakeShape(F32, {}); @@ -377,13 +404,12 @@ TEST(ShapeUtilTest, HumanString) { TEST(ShapeUtilTest, ForEachSubshapeArray) { const Shape shape = ShapeUtil::MakeShape(F32, {2, 3}); int calls = 0; - EXPECT_IS_OK(ShapeUtil::ForEachSubshape( + ShapeUtil::ForEachSubshape( shape, [&calls, &shape](const Shape& subshape, const ShapeIndex& index) { EXPECT_EQ(&shape, &subshape); EXPECT_TRUE(index.empty()); ++calls; - return tensorflow::Status::OK(); - })); + }); EXPECT_EQ(1, calls); } @@ -393,7 +419,7 @@ TEST(ShapeUtilTest, ForEachSubshapeNestedTuple) { ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {101}), ShapeUtil::MakeShape(PRED, {33})})}); int calls = 0; - EXPECT_IS_OK(ShapeUtil::ForEachSubshape( + ShapeUtil::ForEachSubshape( shape, [&calls, &shape](const Shape& subshape, const ShapeIndex& index) { EXPECT_TRUE( ShapeUtil::Equal(subshape, ShapeUtil::GetSubshape(shape, index))); @@ -405,8 +431,7 @@ TEST(ShapeUtilTest, ForEachSubshapeNestedTuple) { EXPECT_EQ(33, ShapeUtil::ElementsIn(subshape)); } ++calls; - return tensorflow::Status::OK(); - })); + }); EXPECT_EQ(5, calls); } @@ -416,7 +441,7 @@ TEST(ShapeUtilTest, ForEachMutableSubshapeNestedTuple) { ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {101}), ShapeUtil::MakeShape(PRED, {33})})}); int calls = 0; - EXPECT_IS_OK(ShapeUtil::ForEachMutableSubshape( + ShapeUtil::ForEachMutableSubshape( &shape, [&calls, &shape](const Shape* subshape, const ShapeIndex& index) { // Pointer values should be equal EXPECT_EQ(subshape, ShapeUtil::GetMutableSubshape(&shape, index)); @@ -428,8 +453,7 @@ TEST(ShapeUtilTest, ForEachMutableSubshapeNestedTuple) { EXPECT_EQ(33, ShapeUtil::ElementsIn(*subshape)); } ++calls; - return 
tensorflow::Status::OK(); - })); + }); EXPECT_EQ(5, calls); } @@ -443,24 +467,52 @@ TEST(ShapeUtilTest, InsertedOrDeleted1SizedDimensions) { ShapeUtil::InsertedOrDeleted1SizedDimensions(shape0, shape2))); } +TEST(ShapeUtilTest, ForEachIndex) { + struct ShapeDimensionAndNumberInvocations { + std::vector<int64> dimensions; + int invocations; + } test_data[] = { + {{}, 1}, {{0}, 0}, {{16}, 16}, {{3, 0}, 0}, + {{0, 2}, 0}, {{4, 16}, 64}, {{6, 11, 17}, 1122}, {{6, 11, 5, 17}, 5610}, + }; + + for (const auto& data : test_data) { + Shape shape = ShapeUtil::MakeShape(F32, data.dimensions); + // Increments at every invocation. + int invocations = 0; + auto increment_func = [&invocations](const std::vector<int64>& indexes) { + invocations++; + return true; + }; + + std::vector<int64> zero_base(data.dimensions.size(), 0); + std::vector<int64> step(data.dimensions.size(), 1); + + ShapeUtil::ForEachIndex(shape, zero_base, data.dimensions, step, + increment_func); + + EXPECT_EQ(invocations, data.invocations); + } +} + TEST(ShapeUtilTest, DimensionsUnmodifiedByReshape_1x1x1x1_to_1x1x1) { // All output dimensions should be unmodified. One of the input dimensions is // modified because the input rank is larger by one. - EXPECT_EQ(3, - ShapeUtil::DimensionsUnmodifiedByReshape( - ShapeUtil::MakeShape(S32, {1, 1, 1, 1}), - ShapeUtil::MakeShape(S32, {1, 1, 1})) - .size()); + EXPECT_THAT(ShapeUtil::DimensionsUnmodifiedByReshape( + ShapeUtil::MakeShape(S32, {1, 1, 1, 1}), + ShapeUtil::MakeShape(S32, {1, 1, 1})), + ElementsAre(std::make_pair(0, 0), std::make_pair(1, 1), + std::make_pair(2, 2))); } TEST(ShapeUtilTest, DimensionsUnmodifiedByReshape_1x1x1_to_1x1x1x1) { // All input dimensions should be unmodified. One of the output dimensions is // modified because the output rank is larger by one. - EXPECT_EQ(3, - ShapeUtil::DimensionsUnmodifiedByReshape( - ShapeUtil::MakeShape(S32, {1, 1, 1}), - ShapeUtil::MakeShape(S32, {1, 1, 1, 1})) - .size()); + EXPECT_THAT(ShapeUtil::DimensionsUnmodifiedByReshape( + ShapeUtil::MakeShape(S32, {1, 1, 1}), + ShapeUtil::MakeShape(S32, {1, 1, 1, 1})), + ElementsAre(std::make_pair(0, 0), std::make_pair(1, 1), + std::make_pair(2, 2))); } TEST(ShapeUtilTest, DimensionsUnmodifiedByReshape_4x1x3x5x6x7_to_2x6x1x5x1x42) { @@ -468,11 +520,10 @@ TEST(ShapeUtilTest, DimensionsUnmodifiedByReshape_4x1x3x5x6x7_to_2x6x1x5x1x42) { // 4, 1, 3, 5, 6, 7 // | // 2, 6, 1, 5, 1, 42 - EXPECT_TRUE( - ContainersEqual(ShapeUtil::DimensionsUnmodifiedByReshape( - ShapeUtil::MakeShape(S32, {4, 1, 3, 5, 6, 7}), - ShapeUtil::MakeShape(S32, {2, 6, 1, 5, 1, 42})), - std::vector<std::pair<int64, int64>>({{3, 3}}))); + EXPECT_THAT(ShapeUtil::DimensionsUnmodifiedByReshape( + ShapeUtil::MakeShape(S32, {4, 1, 3, 5, 6, 7}), + ShapeUtil::MakeShape(S32, {2, 6, 1, 5, 1, 42})), + ElementsAre(std::make_pair(3, 3))); } TEST(ShapeUtilTest, ReshapeIsBitcast_3x4_6x2) { @@ -521,5 +572,58 @@ TEST(AlgebraicSimplifierTest, ReshapeIsBitcast_3x2x2_6x2_Dim0IsMostMinor) { ShapeUtil::MakeShapeWithLayout(F32, {6, 2}, {0, 1}))); } +TEST(AlignmentTest, AlignLayoutsWithoutTrivialDimensions) { + Shape input = ShapeUtil::MakeShapeWithLayout(xla::F32, {3, 8, 5, 7, 11}, + {3, 2, 1, 0, 4}); + auto aligned_shape = ShapeUtil::AlignLayouts( + input, ShapeUtil::MakeShape(xla::F32, {4, 3, 2, 7, 5, 11})); + EXPECT_TRUE(aligned_shape); + EXPECT_THAT(aligned_shape.value().layout().minor_to_major(), + ElementsAre(4, 3, 2, 1, 0, 5)); + EXPECT_TRUE(ShapeUtil::ReshapeIsBitcast(input, aligned_shape.value())); + + aligned_shape = ShapeUtil::AlignLayouts( + input, ShapeUtil::MakeShape(xla::F32, {3, 2, 4, 35,
11})); + EXPECT_TRUE(aligned_shape); + EXPECT_THAT(aligned_shape.value().layout().minor_to_major(), + ElementsAre(3, 2, 1, 0, 4)); + EXPECT_TRUE(ShapeUtil::ReshapeIsBitcast(input, aligned_shape.value())); +} + +TEST(AlignmentTest, AlignLayoutsWithTrivialDimensions) { + Shape input = + ShapeUtil::MakeShapeWithLayout(xla::F32, {1, 3, 8, 1, 5, 7, 1, 11, 1, 1}, + {5, 0, 4, 2, 1, 3, 6, 7, 9, 8}); + auto aligned_shape = ShapeUtil::AlignLayouts( + input, ShapeUtil::MakeShape(xla::F32, {1, 4, 1, 3, 2, 7, 5, 11, 1})); + EXPECT_TRUE(aligned_shape); + EXPECT_THAT(aligned_shape.value().layout().minor_to_major(), + ElementsAre(6, 5, 4, 3, 1, 7, 0, 2, 8)); + EXPECT_TRUE(ShapeUtil::ReshapeIsBitcast(input, aligned_shape.value())); +} + +// A test case where the consecutive elements of the input shape belonging to +// the same layout part are not in descending order. +TEST(AlignmentTest, AlignLayoutsWithoutTrivialDimensionsWrongInputLayout) { + // Same physical layout as in AlignLayoutsWithoutTrivialDimensions, except + // that the first two dimension numbers are exchanged. + Shape input = ShapeUtil::MakeShapeWithLayout(xla::F32, {3, 8, 5, 7, 11}, + {2, 3, 1, 0, 4}); + auto aligned_shape = ShapeUtil::AlignLayouts( + input, ShapeUtil::MakeShape(xla::F32, {4, 3, 2, 7, 5, 11})); + EXPECT_FALSE(aligned_shape); +} + +// A test case where the physical layout of the input shape does not place all +// dimensions that belong to the same alignment part consecutively. +TEST(AlignmentTest, + AlignLayoutsWithoutTrivialDimensionsNonConsecutiveAlignmentPart) { + Shape input = ShapeUtil::MakeShapeWithLayout(xla::F32, {3, 8, 5, 7, 11}, + {3, 2, 1, 0, 4}); + auto aligned_shape = ShapeUtil::AlignLayouts( + input, ShapeUtil::MakeShape(xla::F32, {4, 3, 2, 5, 77})); + EXPECT_FALSE(aligned_shape); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/status.h b/tensorflow/compiler/xla/status.h index f3b561fada3..4eb3bf37664 100644 --- a/tensorflow/compiler/xla/status.h +++ b/tensorflow/compiler/xla/status.h @@ -21,25 +21,7 @@ limitations under the License. namespace xla { -#if defined(__clang__) -// Only clang supports warn_unused_result as a type annotation. -class TF_MUST_USE_RESULT Status; -#endif - -// Simple wrapper around tensorflow::Status that has the MUST_USE_RESULT -// annotation above. When tensorflow::Status adopts this annotation, this can -// simply become a "using tensorflow::Status". -class Status : public tensorflow::Status { - public: - static Status OK() { return tensorflow::Status::OK(); } - - // Note: implicit constructor. - Status(tensorflow::Status other) : tensorflow::Status(other) {} - - Status() : tensorflow::Status() {} - Status(tensorflow::error::Code code, tensorflow::StringPiece msg) - : tensorflow::Status(code, msg) {} -}; +using tensorflow::Status; } // namespace xla
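Since xla::Status is now a plain alias, code that previously leaned on the implicit conversion between the two Status types keeps compiling unchanged. A tiny sketch, where DoWork is a hypothetical helper:

```c++
tensorflow::Status DoWork();  // Hypothetical helper returning a TF status.

xla::Status CallDoWork() {
  // No wrapper hop between xla::Status and tensorflow::Status remains;
  // they are now the same type.
  TF_RETURN_IF_ERROR(DoWork());
  return tensorflow::Status::OK();
}
```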
#include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/platform/test.h" namespace xla { @@ -40,15 +40,15 @@ Status RetCheckSuccess() { TEST(StatusMacros, RetCheckFailing) { Status status = RetCheckFail(); EXPECT_EQ(status.code(), tensorflow::error::INTERNAL); - EXPECT_MATCH(status.error_message(), - xla::testing::ContainsRegex("RET_CHECK failure.*2 > 3")); + EXPECT_THAT(status.error_message(), + ::testing::ContainsRegex("RET_CHECK failure.*2 > 3")); } TEST(StatusMacros, RetCheckFailingWithExtraMessage) { Status status = RetCheckFailWithExtraMessage(); EXPECT_EQ(status.code(), tensorflow::error::INTERNAL); - EXPECT_MATCH(status.error_message(), - xla::testing::ContainsRegex("RET_CHECK.*2 > 3 extra message")); + EXPECT_THAT(status.error_message(), + ::testing::ContainsRegex("RET_CHECK.*2 > 3 extra message")); } TEST(StatusMacros, RetCheckSucceeding) { @@ -73,7 +73,7 @@ Status ReturnStatusError() { return (tensorflow::errors::Internal("foobar")); } using StatusReturningFunction = std::function; -StatusOr CallStatusReturningFunction(StatusReturningFunction func) { +StatusOr CallStatusReturningFunction(const StatusReturningFunction& func) { TF_RETURN_IF_ERROR(func()); return 42; } diff --git a/tensorflow/compiler/xla/statusor.h b/tensorflow/compiler/xla/statusor.h index 8046a2216fe..d8cd736238c 100644 --- a/tensorflow/compiler/xla/statusor.h +++ b/tensorflow/compiler/xla/statusor.h @@ -105,7 +105,6 @@ class StatusOr { // In optimized builds, passing Status::OK here will have the effect // of passing tensorflow::error::INTERNAL as a fallback. StatusOr(Status status); // NOLINT - StatusOr(tensorflow::Status status); // NOLINT // Construct a new StatusOr with the given value. If T is a plain pointer, // value must not be NULL. After calling this constructor, calls to @@ -196,8 +195,6 @@ class StatusOr : public StatusOr { : StatusOr::StatusOr(std::move(value)) {} StatusOr(Status status) // NOLINT : StatusOr::StatusOr(std::move(status)) {} - StatusOr(tensorflow::Status status) // NOLINT - : StatusOr::StatusOr(std::move(status)) {} template StatusOr(StatusOr&& other) // NOLINT : StatusOr::StatusOr(std::move(other)) {} @@ -245,14 +242,6 @@ inline StatusOr::StatusOr(Status status) } } -template -inline StatusOr::StatusOr(tensorflow::Status status) - : status_(status) { - if (status_.ok()) { - status_ = internal::StatusOrHelper::HandleInvalidStatusCtorArg(); - } -} - template inline StatusOr::StatusOr(const T& value) : value_(value) { diff --git a/tensorflow/compiler/xla/statusor_test.cc b/tensorflow/compiler/xla/statusor_test.cc index d98eb279336..f8555113f81 100644 --- a/tensorflow/compiler/xla/statusor_test.cc +++ b/tensorflow/compiler/xla/statusor_test.cc @@ -20,10 +20,10 @@ limitations under the License. 
diff --git a/tensorflow/compiler/xla/statusor_test.cc b/tensorflow/compiler/xla/statusor_test.cc index d98eb279336..f8555113f81 100644 --- a/tensorflow/compiler/xla/statusor_test.cc +++ b/tensorflow/compiler/xla/statusor_test.cc @@ -20,10 +20,10 @@ limitations under the License. #include <memory> #include <type_traits> +#include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/platform/macros.h" -#include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/test_benchmark.h" namespace xla { @@ -45,7 +45,7 @@ class Base2 { class Derived : public Base1, public Base2 { public: - virtual ~Derived() {} + ~Derived() override {} int evenmorepad; }; @@ -436,17 +436,17 @@ class BenchmarkFactory { } Status ArgumentFactoryFail(T** result) TF_ATTRIBUTE_NOINLINE { - *result = NULL; + *result = nullptr; return Status(tensorflow::error::CANCELLED, ""); } Status ArgumentFactoryFailShortMsg(T** result) TF_ATTRIBUTE_NOINLINE { - *result = NULL; + *result = nullptr; return Status(::tensorflow::error::INTERNAL, ""); } Status ArgumentFactoryFailLongMsg(T** result) TF_ATTRIBUTE_NOINLINE { - *result = NULL; + *result = nullptr; return Status(::tensorflow::error::INTERNAL, "a big string of message junk that will never be read"); } @@ -489,26 +489,30 @@ class BenchmarkType { // Calibrate the amount of time spent just calling DoWork, since each of our // tests will do this, we can subtract this out of benchmark results. -static void BM_CalibrateWorkLoop(int iters) { +void BM_CalibrateWorkLoop(int iters) { tensorflow::testing::StopTiming(); BenchmarkFactory<BenchmarkType> factory; BenchmarkType* result = factory.TrivialFactory(); tensorflow::testing::StartTiming(); for (int i = 0; i != iters; ++i) { - if (result != NULL) result->DoWork(); + if (result != nullptr) { + result->DoWork(); + } } } BENCHMARK(BM_CalibrateWorkLoop); // Measure the time taken to call into the factory, return the value, // determine that it is OK, and invoke a trivial function. -static void BM_TrivialFactory(int iters) { +void BM_TrivialFactory(int iters) { tensorflow::testing::StopTiming(); BenchmarkFactory<BenchmarkType> factory; tensorflow::testing::StartTiming(); for (int i = 0; i != iters; ++i) { BenchmarkType* result = factory.TrivialFactory(); - if (result != NULL) result->DoWork(); + if (result != nullptr) { + result->DoWork(); + } } } BENCHMARK(BM_TrivialFactory); @@ -516,14 +520,14 @@ BENCHMARK(BM_TrivialFactory); // Measure the time taken to call into the factory, providing an // out-param for the result, evaluating the status result and the // result pointer, and invoking the trivial function. -static void BM_ArgumentFactory(int iters) { +void BM_ArgumentFactory(int iters) { tensorflow::testing::StopTiming(); BenchmarkFactory<BenchmarkType> factory; tensorflow::testing::StartTiming(); for (int i = 0; i != iters; ++i) { - BenchmarkType* result = NULL; + BenchmarkType* result = nullptr; Status status = factory.ArgumentFactory(&result); - if (status.ok() && result != NULL) { + if (status.ok() && result != nullptr) { result->DoWork(); } } @@ -532,7 +536,7 @@ BENCHMARK(BM_ArgumentFactory); // Measure the time to use the StatusOr factory, evaluate the result, // and invoke the trivial function. -static void BM_StatusOrFactory(int iters) { +void BM_StatusOrFactory(int iters) { tensorflow::testing::StopTiming(); BenchmarkFactory<BenchmarkType> factory; tensorflow::testing::StartTiming(); @@ -548,14 +552,14 @@ BENCHMARK(BM_StatusOrFactory); // Measure the time taken to call into the factory, providing an // out-param for the result, evaluating the status result and the // result pointer, and invoking the trivial function.
-static void BM_ArgumentFactoryFail(int iters) { +void BM_ArgumentFactoryFail(int iters) { tensorflow::testing::StopTiming(); BenchmarkFactory<BenchmarkType> factory; tensorflow::testing::StartTiming(); for (int i = 0; i != iters; ++i) { - BenchmarkType* result = NULL; + BenchmarkType* result = nullptr; Status status = factory.ArgumentFactoryFail(&result); - if (status.ok() && result != NULL) { + if (status.ok() && result != nullptr) { result->DoWork(); } } @@ -564,7 +568,7 @@ BENCHMARK(BM_ArgumentFactoryFail); // Measure the time to use the StatusOr factory, evaluate the result, // and invoke the trivial function. -static void BM_StatusOrFactoryFail(int iters) { +void BM_StatusOrFactoryFail(int iters) { tensorflow::testing::StopTiming(); BenchmarkFactory<BenchmarkType> factory; tensorflow::testing::StartTiming(); @@ -580,14 +584,14 @@ BENCHMARK(BM_StatusOrFactoryFail); // Measure the time taken to call into the factory, providing an // out-param for the result, evaluating the status result and the // result pointer, and invoking the trivial function. -static void BM_ArgumentFactoryFailShortMsg(int iters) { +void BM_ArgumentFactoryFailShortMsg(int iters) { tensorflow::testing::StopTiming(); BenchmarkFactory<BenchmarkType> factory; tensorflow::testing::StartTiming(); for (int i = 0; i != iters; ++i) { - BenchmarkType* result = NULL; + BenchmarkType* result = nullptr; Status status = factory.ArgumentFactoryFailShortMsg(&result); - if (status.ok() && result != NULL) { + if (status.ok() && result != nullptr) { result->DoWork(); } } @@ -596,7 +600,7 @@ BENCHMARK(BM_ArgumentFactoryFailShortMsg); // Measure the time to use the StatusOr factory, evaluate the result, // and invoke the trivial function. -static void BM_StatusOrFactoryFailShortMsg(int iters) { +void BM_StatusOrFactoryFailShortMsg(int iters) { tensorflow::testing::StopTiming(); BenchmarkFactory<BenchmarkType> factory; tensorflow::testing::StartTiming(); @@ -612,14 +616,14 @@ BENCHMARK(BM_StatusOrFactoryFailShortMsg); // Measure the time taken to call into the factory, providing an // out-param for the result, evaluating the status result and the // result pointer, and invoking the trivial function. -static void BM_ArgumentFactoryFailLongMsg(int iters) { +void BM_ArgumentFactoryFailLongMsg(int iters) { tensorflow::testing::StopTiming(); BenchmarkFactory<BenchmarkType> factory; tensorflow::testing::StartTiming(); for (int i = 0; i != iters; ++i) { - BenchmarkType* result = NULL; + BenchmarkType* result = nullptr; Status status = factory.ArgumentFactoryFailLongMsg(&result); - if (status.ok() && result != NULL) { + if (status.ok() && result != nullptr) { result->DoWork(); } } @@ -628,7 +632,7 @@ BENCHMARK(BM_ArgumentFactoryFailLongMsg); // Measure the time to use the StatusOr factory, evaluate the result, // and invoke the trivial function. -static void BM_StatusOrFactoryFailLongMsg(int iters) { +void BM_StatusOrFactoryFailLongMsg(int iters) { tensorflow::testing::StopTiming(); BenchmarkFactory<BenchmarkType> factory; tensorflow::testing::StartTiming(); diff --git a/tensorflow/compiler/xla/test.h b/tensorflow/compiler/xla/test.h new file mode 100644 index 00000000000..87a8c5f3a52 --- /dev/null +++ b/tensorflow/compiler/xla/test.h @@ -0,0 +1,48 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_TEST_H_ +#define TENSORFLOW_COMPILER_XLA_TEST_H_ + +// This header includes gmock.h and enables the use of gmock matchers in tests +// in third_party/tensorflow/compiler/xla. +// +// Tests including this header can use the macros EXPECT_THAT(...) and +// ASSERT_THAT(...) in combination with gmock matchers. +// Example: +// std::vector<int> vec = Foo(); +// EXPECT_THAT(vec, ::testing::ElementsAre(1, 2, 3)); +// +// For more details on gmock matchers see: +// https://github.com/google/googletest/blob/master/googlemock/docs/CheatSheet.md#matchers +// +// The advantages of using gmock matchers instead of self-defined matchers are +// better error messages, more maintainable tests, and more test coverage. +// +// Note that while the use of gmock matchers is allowed in the xla project, the +// use of mocks is disallowed in the whole tensorflow project! + +#include "tensorflow/core/platform/platform.h" + +#if defined(PLATFORM_GOOGLE) || defined(PLATFORM_GOOGLE_ANDROID) +#include "testing/base/public/gmock.h" +#else +#include <gmock/gmock-generated-matchers.h> +#include <gmock/gmock-matchers.h> +#endif + +#include "tensorflow/core/platform/test.h" + +#endif  // TENSORFLOW_COMPILER_XLA_TEST_H_
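The new header is what lets the tests in this patch replace EXPECT_MATCH with standard gmock assertions. A short sketch in the style of the header's own example comment; the test name and values are made up:

```c++
#include "tensorflow/compiler/xla/test.h"

TEST(GmockMigrationExampleTest, MatchersWork) {
  std::vector<int> vec = {1, 2, 3};
  EXPECT_THAT(vec, ::testing::ElementsAre(1, 2, 3));
  EXPECT_THAT("RET_CHECK failure: 2 > 3",
              ::testing::ContainsRegex("2 > 3"));
}
```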
diff --git a/tensorflow/compiler/xla/test_helpers.cc b/tensorflow/compiler/xla/test_helpers.cc deleted file mode 100644 index 02abfdeab80..00000000000 --- a/tensorflow/compiler/xla/test_helpers.cc +++ /dev/null @@ -1,69 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/xla/test_helpers.h" -#include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/platform/regexp.h" - -namespace xla { -namespace testing { - -AssertionResult::AssertionResult(const AssertionResult& other) - : success_(other.success_), - message_(other.message_ != nullptr ? new std::string(*other.message_) - : static_cast<std::string*>(nullptr)) {} - -// Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE. -AssertionResult AssertionResult::operator!() const { - AssertionResult negation(!success_); - if (message_ != nullptr) negation << *message_; - return negation; -} - -AssertionResult& AssertionResult::operator=(const AssertionResult& ar) { - success_ = ar.success_; - message_.reset(ar.message_ != nullptr ? new std::string(*ar.message_) - : nullptr); - return *this; -} - -AssertionResult AssertionFailure() { return AssertionResult(false); } - -AssertionResult AssertionSuccess() { return AssertionResult(true); } - -std::function<bool(tensorflow::StringPiece)> ContainsRegex( - const tensorflow::StringPiece regex) { - return [regex](const tensorflow::StringPiece to_test) { - if (RE2::PartialMatch( - tensorflow::RegexpStringPiece(to_test.data(), to_test.size()), - tensorflow::RegexpStringPiece(regex.data(), regex.size()))) { - return true; - } else { - LOG(ERROR) << "Expected to find " << regex << " in " << to_test; - return false; - } - }; -} - -std::function<bool(tensorflow::StringPiece)> HasSubstr( - const tensorflow::StringPiece part) { - return [part](const tensorflow::StringPiece whole) { - return whole.contains(part); - }; -} - -} // namespace testing -} // namespace xla diff --git a/tensorflow/compiler/xla/test_helpers.h b/tensorflow/compiler/xla/test_helpers.h index f923d9f36c8..634cdb5aa29 100644 --- a/tensorflow/compiler/xla/test_helpers.h +++ b/tensorflow/compiler/xla/test_helpers.h @@ -39,286 +39,6 @@ class Literal; namespace testing { -class AssertionResult { - public: - explicit AssertionResult(bool success) : success_(success) {} - - // Returns true iff the assertion succeeded. - operator bool() const { return success_; } // NOLINT - - // Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE. - AssertionResult operator!() const; - - // Returns the text streamed into this AssertionResult. Test assertions - // use it when they fail (i.e., the predicate's outcome doesn't match the - // assertion's expectation). When nothing has been streamed into the - // object, returns an empty string. - const char* message() const { - return message_ != nullptr ? message_->c_str() : ""; - } - - // Streams a custom failure message into this object. - template <typename T> - AssertionResult& operator<<(const T& value) { - AppendMessage(::testing::Message() << value); - return *this; - } - - // Allows streaming basic output manipulators such as endl or flush into - // this object. - AssertionResult& operator<<( - std::ostream& (*basic_manipulator)(std::ostream& stream)) { - AppendMessage(::testing::Message() << basic_manipulator); - return *this; - } - - // Copy operator. - AssertionResult(const AssertionResult& ar); - - // Assignment operator. - AssertionResult& operator=(const AssertionResult&); - - private: - // Appends the contents of message to message_. - void AppendMessage(const ::testing::Message& a_message) { - if (message_ == nullptr) message_.reset(new std::string); - message_->append(a_message.GetString().c_str()); - } - - bool success_ = false; - - // Stores the message describing the condition in case the - // expectation construct is not satisfied with the predicate's - // outcome. Referenced via a pointer to avoid taking too much stack - // frame space with test assertions. - std::unique_ptr<std::string> message_; -}; - -AssertionResult AssertionFailure(); - -AssertionResult AssertionSuccess(); - -std::function<bool(tensorflow::StringPiece)> ContainsRegex( - const tensorflow::StringPiece regex); - -std::function<bool(tensorflow::StringPiece)> HasSubstr( - const tensorflow::StringPiece part); - -// Matcher for a vector of same-type values for which operator= is -// defined.
-template <typename T> -std::function<AssertionResult(const std::vector<T>& actual)> VectorMatcher( - const std::vector<T>& expected) { - return [expected](const std::vector<T>& actual) -> AssertionResult { - int len = expected.size(); - if (actual.size() != len) { - return AssertionFailure() << "Actual values len of " << actual.size() - << " != expected.size " << len; - } - for (int i = 0; i < len; ++i) { - if (actual[i] != expected[i]) { - return AssertionFailure() << "Element " << i << " actual " << actual[i] - << " != " << expected[i]; - } - } - return AssertionSuccess(); - }; -} - -// Approximate matcher for a vector of floats or similar. -template <typename T> -std::function<AssertionResult(const std::vector<T>& actual)> -ApproxVectorMatcher(const std::vector<T>& expected, float abs_diff, - float rel_diff) { - return [abs_diff, rel_diff, - expected](const std::vector<T>& actual) -> AssertionResult { - int len = expected.size(); - if (actual.size() != len) { - AssertionResult ar = AssertionFailure() << "Actual values len of " - << actual.size() - << " != expected.size " << len; - LOG(ERROR) << ar.message(); - return ar; - } - for (int i = 0; i < len; ++i) { - T diff = actual[i] - expected[i]; - if (diff < 0) { - diff *= -1; - } - if (diff > abs_diff) { - T rdiff = (expected[i] != 0 ? diff / expected[i] : 0.0 * expected[i]); - if (rdiff > rel_diff) { - AssertionResult ar = AssertionFailure() - << "Element " << i << " actual " << actual[i] - << " != " << expected[i] - << "( abs_diff = " << diff - << ", rel_diff = " << rdiff << ")"; - LOG(ERROR) << ar.message(); - return ar; - } - } - } - return AssertionSuccess(); - }; -} - -// Matches a vector of same-type values against another, succeeding so -// long as they have the same length and every value in 'actual' -// matches one in 'expected.' Does not verify an exhaustive -// one-to-one mapping between the two. -template <typename T> -std::function<AssertionResult(const std::vector<T>& actual)> -UnorderedElementsAre(const std::vector<T>& expected) { - return [expected](const std::vector<T>& actual) -> AssertionResult { - if (actual.size() != expected.size()) { - return AssertionFailure() << "sizes don't match"; - } - for (auto a : actual) { - bool found = false; - for (auto e : expected) { - if (a == e) { - found = true; - break; - } - } - if (!found) { - return AssertionFailure() << "actual element " << a - << " not in expected"; - } - } - return AssertionSuccess(); - }; -} - -// Overloaded cover functions for UnorderedElementsAre, for the numbers -// of values used in practice.
-template <typename T> -std::function<AssertionResult(const std::vector<T>& actual)> UnorderedMatcher( - T a) { - std::vector<T> expected; - expected.push_back(a); - return testing::UnorderedElementsAre(expected); -} - -template <typename T> -std::function<AssertionResult(const std::vector<T>& actual)> UnorderedMatcher( - T a, T b) { - std::vector<T> expected; - expected.push_back(a); - expected.push_back(b); - return testing::UnorderedElementsAre(expected); -} - -template <typename T> -std::function<AssertionResult(const std::vector<T>& actual)> UnorderedMatcher( - T a, T b, T c) { - std::vector<T> expected; - expected.push_back(a); - expected.push_back(b); - expected.push_back(c); - return testing::UnorderedElementsAre(expected); -} - -template <typename T> -std::function<AssertionResult(const std::vector<T>& actual)> UnorderedMatcher( - T a, T b, T c, T d) { - std::vector<T> expected; - expected.push_back(a); - expected.push_back(b); - expected.push_back(c); - expected.push_back(d); - return testing::UnorderedElementsAre(expected); -} - -template <typename T> -std::function<AssertionResult(const std::vector<T>& actual)> UnorderedMatcher( - T a, T b, T c, T d, T e) { - std::vector<T> expected; - expected.push_back(a); - expected.push_back(b); - expected.push_back(c); - expected.push_back(d); - expected.push_back(e); - return testing::UnorderedElementsAre(expected); -} - -template <typename T> -std::function<AssertionResult(const std::vector<T>& actual)> UnorderedMatcher( - T a, T b, T c, T d, T e, T f) { - std::vector<T> expected; - expected.push_back(a); - expected.push_back(b); - expected.push_back(c); - expected.push_back(d); - expected.push_back(e); - expected.push_back(f); - return testing::UnorderedElementsAre(expected); -} - -// Overloaded cover functions for VectorMatcher for the numbers of -// elements used in practice. -template <typename T> -std::function<AssertionResult(const std::vector<T>& actual)> OrderedMatcher( - T a) { - std::vector<T> expected; - expected.push_back(a); - return testing::VectorMatcher(expected); -} - -template <typename T> -std::function<AssertionResult(const std::vector<T>& actual)> OrderedMatcher( - T a, T b) { - std::vector<T> expected; - expected.push_back(a); - expected.push_back(b); - return testing::VectorMatcher(expected); -} - -template <typename T> -std::function<AssertionResult(const std::vector<T>& actual)> OrderedMatcher( - T a, T b, T c) { - std::vector<T> expected; - expected.push_back(a); - expected.push_back(b); - expected.push_back(c); - return testing::VectorMatcher(expected); -} - -template <typename T> -std::function<AssertionResult(const std::vector<T>& actual)> OrderedMatcher( - T a, T b, T c, T d) { - std::vector<T> expected; - expected.push_back(a); - expected.push_back(b); - expected.push_back(c); - expected.push_back(d); - return testing::VectorMatcher(expected); -} - -// Convert a RepeatedField to a flat vector. -template <typename T> -std::vector<T> PBToVec(const tensorflow::protobuf::RepeatedField<T> rf) { - return std::vector<T>(rf.begin(), rf.end()); -} - -// Convert a List to a flat vector. -template <typename T> -std::vector<T> ListToVec(const std::list<T>& l) { - return std::vector<T>(l.begin(), l.end()); -} - -// Convert a Set to a flat vector. -template <typename T> -std::vector<T> SetToVec(const std::set<T>& c) { - return std::vector<T>(c.begin(), c.end()); -} - -// Convert an Array to a flat vector. -template <typename T> -std::vector<T> Array2DToVec(const Array2D<T>& a) { - return std::vector<T>(a.data(), a.data() + a.num_elements()); -} - namespace internal_status { inline const ::tensorflow::Status& GetStatus( const ::tensorflow::Status& status) { @@ -347,9 +67,4 @@ inline const ::tensorflow::Status& GetStatus(const StatusOr<T>& status) { ASSERT_EQ(tensorflow::Status::OK(), \ xla::testing::internal_status::GetStatus(expression)) -// Macros that apply a Matcher to a Value, returning an -// AssertionResult which gets digested by a standard gunit macro.
-#define EXPECT_MATCH(V, M) EXPECT_TRUE((M)((V))) -#define ASSERT_MATCH(V, M) ASSERT_TRUE(M(V)) - #endif // TENSORFLOW_COMPILER_XLA_TEST_HELPERS_H_ diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD index 31d549cc421..13dd1a30b60 100644 --- a/tensorflow/compiler/xla/tests/BUILD +++ b/tensorflow/compiler/xla/tests/BUILD @@ -69,6 +69,7 @@ cc_library( "//tensorflow/compiler/xla:array4d", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", @@ -92,6 +93,7 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", "//tensorflow/compiler/xla/legacy_flags:hlo_test_base_flags", "//tensorflow/compiler/xla/service", "//tensorflow/compiler/xla/service:backend", @@ -101,8 +103,8 @@ cc_library( "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_execution_profile", "//tensorflow/compiler/xla/service:hlo_graph_dumper", - "//tensorflow/compiler/xla/service:hlo_module_config", "//tensorflow/compiler/xla/service:transfer_manager", + "//tensorflow/core:core_cpu_internal", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core:test", @@ -149,7 +151,7 @@ cc_library( "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/legacy_flags:hlo_pass_pipeline_flags", + "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:test_utils", "//tensorflow/core:lib", @@ -173,7 +175,6 @@ cc_library( "//tensorflow/compiler/xla/service:compiler", "//tensorflow/compiler/xla/service:executable", "//tensorflow/compiler/xla/service:hlo", - "//tensorflow/compiler/xla/service:hlo_module_config", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/core:lib", "//tensorflow/core:test", @@ -203,6 +204,7 @@ cc_library( "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", + "//third_party/eigen3", ], ) @@ -217,6 +219,7 @@ xla_test( "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags", + "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/core:lib", "//tensorflow/core:test", @@ -236,6 +239,7 @@ xla_test( "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags", + "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/core:test", ], @@ -252,6 +256,7 @@ xla_test( "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags", + "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/core:test", ], @@ -271,6 +276,7 @@ xla_test( "//tensorflow/compiler/xla/client:local_client", 
"//tensorflow/compiler/xla/client/lib:arithmetic", "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags", + "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", "//tensorflow/compiler/xla/service:platform_util", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", @@ -286,6 +292,7 @@ xla_test( "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags", + "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/core:test", @@ -309,6 +316,7 @@ xla_test( "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/lib:arithmetic", "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags", + "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:test_utils", @@ -332,6 +340,7 @@ xla_test( "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags", + "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/core:lib", @@ -346,7 +355,9 @@ xla_test( "//tensorflow/compiler/xla:array2d", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/lib:arithmetic", "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags", + "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/core:test", ], @@ -454,16 +465,18 @@ xla_test( "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags", + "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", + "//tensorflow/compiler/xla/legacy_flags:user_computation_flags", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/core:lib", - "//tensorflow/core:test", ], ) @@ -656,7 +669,9 @@ xla_test( }, shard_count = 30, deps = [ + "//tensorflow/compiler/xla:array3d", "//tensorflow/compiler/xla:array4d", + "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:reference_util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:computation_builder", @@ -886,6 +901,7 @@ xla_test( name = "copy_test", srcs = ["copy_test.cc"], deps = [ + ":client_library_test_base", "//tensorflow/compiler/xla:array2d", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:util", @@ -930,7 +946,6 @@ xla_test( "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/core:lib", - "//tensorflow/core:test", ], ) @@ -958,13 +973,13 @@ xla_test( 
"//tensorflow/compiler/xla:array4d", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla:test_helpers", + "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags", + "//tensorflow/compiler/xla/legacy_flags:user_computation_flags", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", - "//tensorflow/core:test", ], ) @@ -1147,6 +1162,7 @@ xla_test( "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags", + "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/core:lib", @@ -1201,12 +1217,11 @@ xla_test( "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/client:client_library", "//tensorflow/compiler/xla/client:computation", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", - "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags", - "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:test_utils", "//tensorflow/core:lib", @@ -1234,27 +1249,6 @@ xla_test( ], ) -xla_test( - name = "inprocess_service_test", - srcs = ["inprocess_service_test.cc"], - deps = [ - "//tensorflow/compiler/xla:array2d", - "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla:test_helpers", - "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client:computation", - "//tensorflow/compiler/xla/client:computation_builder", - "//tensorflow/compiler/xla/client:global_data", - "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags", - "//tensorflow/compiler/xla/tests:client_library_test_base", - "//tensorflow/compiler/xla/tests:literal_test_util", - "//tensorflow/core:lib", - "//tensorflow/core:test", - ], -) - xla_test( name = "replay_test", srcs = ["replay_test.cc"], @@ -1344,6 +1338,22 @@ cc_test( ], ) +cc_test( + name = "hlo_metadata_test", + srcs = [ + "hlo_metadata_test.cc", + ], + deps = [ + ":local_client_test_base", + "//tensorflow/compiler/xla:test_helpers", + "//tensorflow/compiler/xla/client:computation_builder", + "//tensorflow/compiler/xla/service:computation_tracker", + "//tensorflow/compiler/xla/service:local_service", + "//tensorflow/core:lib", + "//tensorflow/core:test_main", + ], +) + xla_test( name = "round_trip_transfer_test", srcs = ["round_trip_transfer_test.cc"], diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc index 23579088c9e..c07f2745fe9 100644 --- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc +++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc @@ -27,14 +27,17 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/user_computation_flags.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/lib/core/casts.h" #include "tensorflow/core/platform/types.h" namespace xla { @@ -82,6 +85,50 @@ TEST_F(ArrayElementwiseOpTest, NegConstantS32) { {}); } +XLA_TEST_F(ArrayElementwiseOpTest, IsFiniteZeroElementF32s) { + ComputationBuilder builder(client_, TestName()); + auto a = builder.ConstantR1({}); + auto result = builder.IsFinite(a); + + ComputeAndCompareR1(&builder, {}, {}); +} + +// A non-canonical quiet NaN value. +static const float kNonCanonicalNaN = tensorflow::bit_cast(0x7FD01234); + +XLA_TEST_F(ArrayElementwiseOpTest, IsFiniteScalarF32) { + ComputationBuilder builder(client_, TestName()); + auto result = builder.IsFinite(builder.ConstantR0(NAN)); + ComputeAndCompareR0(&builder, false, {}); + + EXPECT_TRUE(std::isnan(kNonCanonicalNaN)); + auto result_non_canonical = + builder.IsFinite(builder.ConstantR0(kNonCanonicalNaN)); + ComputeAndCompareR0(&builder, false, {}); + + const float inf = std::numeric_limits::infinity(); + auto result_inf = builder.IsFinite(builder.ConstantR0(inf)); + ComputeAndCompareR0(&builder, false, {}); + + auto result_neg_inf = builder.IsFinite(builder.ConstantR0(-inf)); + ComputeAndCompareR0(&builder, false, {}); + + auto result_zero = builder.IsFinite(builder.ConstantR0(0.0f)); + ComputeAndCompareR0(&builder, true, {}); +} + +XLA_TEST_F(ArrayElementwiseOpTest, IsFiniteR1F32s) { + ComputationBuilder builder(client_, TestName()); + const float inf = std::numeric_limits::infinity(); + EXPECT_TRUE(std::isnan(kNonCanonicalNaN)); + auto a = builder.ConstantR1( + {{NAN, 7.0f, kNonCanonicalNaN, -1.0f, inf, -inf}}); + auto result = builder.IsFinite(a); + + ComputeAndCompareR1(&builder, {false, true, false, true, false, false}, + {}); +} + TEST_F(ArrayElementwiseOpTest, AddTwoConstantF32s) { ComputationBuilder builder(client_, TestName()); auto a = builder.ConstantR1({-2.5f, 3.14f, 2.25f, -10.0f, 6.0f}); @@ -197,6 +244,150 @@ XLA_TEST_F(ArrayElementwiseOpTest, DivTwoConstantZeroElementF32s) { ComputeAndCompareR1(&builder, {}, {}, error_spec_); } +TEST_F(ArrayElementwiseOpTest, DivS32s) { + // clang-format off + // Some interesting values to test. + std::vector vals = { + INT32_MIN, INT32_MIN + 1, INT32_MIN + 2, -0x40000000, -0x3fffffff, + -271181, -1309, -17, -10, -5, -3, -2, -1, 0, 1, 2, 3, 5, 10, 17, 26, 101, + 7919, 0x40000000, INT32_MAX - 2, INT32_MAX - 1, INT32_MAX}; + // clang-format on + + std::vector dividends, divisors, quotients, remainders; + for (int32 divisor : vals) { + if (divisor != 0) { + for (int32 dividend : vals) { + // Avoid integer overflow. 
+        if (dividend != INT32_MIN || divisor != -1) {
+          dividends.push_back(dividend);
+          divisors.push_back(divisor);
+          quotients.push_back(dividend / divisor);
+          remainders.push_back(dividend % divisor);
+        }
+      }
+    }
+  }
+
+  {
+    ComputationBuilder builder(client_, TestName());
+    ComputationDataHandle dividend;
+    ComputationDataHandle divisor;
+    auto dividend_data =
+        CreateR1Parameter<int32>(dividends, 0, "dividend", &builder, &dividend);
+    auto divisor_data =
+        CreateR1Parameter<int32>(divisors, 1, "divisor", &builder, &divisor);
+    builder.Div(dividend, divisor);
+
+    ComputeAndCompareR1<int32>(&builder, quotients,
+                               {dividend_data.get(), divisor_data.get()});
+  }
+
+  // Test with a compile-time constant divisor.
+  {
+    ComputationBuilder builder(client_, TestName());
+    ComputationDataHandle dividend;
+    auto dividend_data =
+        CreateR1Parameter<int32>(dividends, 0, "dividend", &builder, &dividend);
+    builder.Div(dividend, builder.ConstantR1<int32>(divisors));
+
+    ComputeAndCompareR1<int32>(&builder, quotients, {dividend_data.get()});
+  }
+
+  {
+    ComputationBuilder builder(client_, TestName());
+    ComputationDataHandle dividend;
+    ComputationDataHandle divisor;
+    auto dividend_data =
+        CreateR1Parameter<int32>(dividends, 0, "dividend", &builder, &dividend);
+    auto divisor_data =
+        CreateR1Parameter<int32>(divisors, 1, "divisor", &builder, &divisor);
+    builder.Rem(dividend, divisor);
+
+    ComputeAndCompareR1<int32>(&builder, remainders,
+                               {dividend_data.get(), divisor_data.get()});
+  }
+
+  // Test with a compile-time constant divisor.
+  {
+    ComputationBuilder builder(client_, TestName());
+    ComputationDataHandle dividend;
+    auto dividend_data =
+        CreateR1Parameter<int32>(dividends, 0, "dividend", &builder, &dividend);
+    builder.Rem(dividend, builder.ConstantR1<int32>(divisors));
+
+    ComputeAndCompareR1<int32>(&builder, remainders, {dividend_data.get()});
+  }
+}
+
+TEST_F(ArrayElementwiseOpTest, DivU32s) {
+  // clang-format off
+  // Some interesting values to test.
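+  // (They cluster around 0, around the sign-bit boundary 0x80000000, and
+  // around UINT32_MAX, which are the edge cases for unsigned division.)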
+  std::vector<uint32> vals = {
+    0, 1, 2, 17, 101, 3333, 0x7FFFFFFF, 0xABCDEF12, 0xCAFEBEEF, 0x80000000,
+    0x80000001, UINT32_MAX - 2, UINT32_MAX - 1, UINT32_MAX};
+  // clang-format on
+
+  std::vector<uint32> dividends, divisors, quotients, remainders;
+  for (uint32 divisor : vals) {
+    if (divisor != 0) {
+      for (uint32 dividend : vals) {
+        dividends.push_back(dividend);
+        divisors.push_back(divisor);
+        quotients.push_back(dividend / divisor);
+        remainders.push_back(dividend % divisor);
+      }
+    }
+  }
+
+  {
+    ComputationBuilder builder(client_, TestName());
+    ComputationDataHandle dividend;
+    ComputationDataHandle divisor;
+    auto dividend_data = CreateR1Parameter<uint32>(dividends, 0, "dividend",
+                                                   &builder, &dividend);
+    auto divisor_data =
+        CreateR1Parameter<uint32>(divisors, 1, "divisor", &builder, &divisor);
+    builder.Div(dividend, divisor);
+
+    ComputeAndCompareR1<uint32>(&builder, quotients,
+                                {dividend_data.get(), divisor_data.get()});
+  }
+
+  {
+    ComputationBuilder builder(client_, TestName());
+    ComputationDataHandle dividend;
+    auto dividend_data = CreateR1Parameter<uint32>(dividends, 0, "dividend",
+                                                   &builder, &dividend);
+    builder.Div(dividend, builder.ConstantR1<uint32>(divisors));
+
+    ComputeAndCompareR1<uint32>(&builder, quotients, {dividend_data.get()});
+  }
+
+  {
+    ComputationBuilder builder(client_, TestName());
+    ComputationDataHandle dividend;
+    ComputationDataHandle divisor;
+    auto dividend_data = CreateR1Parameter<uint32>(dividends, 0, "dividend",
+                                                   &builder, &dividend);
+    auto divisor_data =
+        CreateR1Parameter<uint32>(divisors, 1, "divisor", &builder, &divisor);
+    builder.Rem(dividend, divisor);
+
+    ComputeAndCompareR1<uint32>(&builder, remainders,
+                                {dividend_data.get(), divisor_data.get()});
+  }
+
+  {
+    ComputationBuilder builder(client_, TestName());
+    ComputationDataHandle dividend;
+    auto dividend_data = CreateR1Parameter<uint32>(dividends, 0, "dividend",
+                                                   &builder, &dividend);
+    builder.Rem(dividend, builder.ConstantR1<uint32>(divisors));
+
+    ComputeAndCompareR1<uint32>(&builder, remainders, {dividend_data.get()});
+  }
+}
+
 XLA_TEST_F(ArrayElementwiseOpTest, RemF32s) {
   ComputationBuilder builder(client_, TestName());
   auto a = builder.ConstantR1<float>(
@@ -441,6 +632,18 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareEqZeroElementS32s) {
   ComputeAndCompareR1<bool>(&builder, {}, {});
 }

+TEST_F(ArrayElementwiseOpTest, CompareNeF32s) {
+  // Disable fast-math because we're operating on NaNs.
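+  // (Under fast-math the compiler may assume operands are never NaN and
+  // fold away the NaN comparisons; IEEE semantics require NaN != x to be
+  // true for every x, which is what this test checks.)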
+  SetFastMathDisabled(true);
+
+  ComputationBuilder builder(client_, TestName());
+  auto lhs = builder.ConstantR1<float>({-2.5f, 25.5f, 2.25f, NAN, 6.0f});
+  auto rhs = builder.ConstantR1<float>({10.0f, 25.5f, 1.0f, 10.0f, NAN});
+  auto compare = builder.Ne(lhs, rhs);
+
+  ComputeAndCompareR1<bool>(&builder, {true, false, true, true, true}, {});
+}
+
 TEST_F(ArrayElementwiseOpTest, CompareNeS32s) {
   const int32 min = std::numeric_limits<int32>::min();
   const int32 max = std::numeric_limits<int32>::max();
@@ -575,12 +778,14 @@ TEST_F(ArrayElementwiseOpTest, CompareLtU32s) {
 TEST_F(ArrayElementwiseOpTest, PowF32s) {
   SetFastMathDisabled(true);
   ComputationBuilder builder(client_, TestName());
-  auto lhs = builder.ConstantR1<float>({4.0f, 2.0f, 2.0f, NAN, 6.0f});
-  auto rhs = builder.ConstantR1<float>({2.0f, -2.0f, 3.0f, 10.0f, NAN});
+  auto lhs =
+      builder.ConstantR1<float>({4.0f, 2.0f, 2.0f, NAN, 6.0f, -2.0f, -2.0f});
+  auto rhs =
+      builder.ConstantR1<float>({2.0f, -2.0f, 3.0f, 10.0f, NAN, 3.0f, 4.0f});
   auto minimum = builder.Pow(lhs, rhs);

-  ComputeAndCompareR1<float>(&builder, {16.0f, 0.25f, 8.0f, NAN, NAN}, {},
-                             error_spec_);
+  ComputeAndCompareR1<float>(
+      &builder, {16.0f, 0.25f, 8.0f, NAN, NAN, -8.0f, 16.0f}, {}, error_spec_);
 }

 XLA_TEST_F(ArrayElementwiseOpTest, PowZeroElementF32s) {
@@ -625,6 +830,7 @@ TEST_P(ArrayElementwiseOpTestParamCount, SquareManyValues) {
   const int count = GetParam();
   ComputationBuilder builder(client_, TestName());
   std::vector<float> values;
+  values.reserve(count);
   for (int i = 0; i < count; ++i) {
     values.push_back(i / static_cast<float>(count));
   }
@@ -632,6 +838,7 @@ TEST_P(ArrayElementwiseOpTestParamCount, SquareManyValues) {
   auto exp = builder.Pow(x, builder.ConstantR0<float>(2.0f));

   std::vector<float> expected;
+  expected.reserve(values.size());
   for (float value : values) {
     expected.push_back(value * value);
   }
@@ -1584,7 +1791,7 @@ TEST_F(ArrayElementwiseOpTest, R4PlusR1InDim1) {
   ComputeAndCompareR4<float>(&builder, *expected_4d, {}, error_spec_);
 }

-TEST_F(ArrayElementwiseOpTest, R4_32x64x2x2_Plus_R1_64) {
+TEST_F(ArrayElementwiseOpTest, R4_16x16x2x2_Plus_R1_16) {
   constexpr int d0 = 16;
   constexpr int d1 = 16;
   constexpr int d2 = 2;
@@ -1622,9 +1829,9 @@ TEST_F(ArrayElementwiseOpTest, CannotAddOpaques) {
   auto concatenated = builder.Add(x, x);
   StatusOr<Computation> computation_status = builder.Build();
   ASSERT_FALSE(computation_status.ok());
-  EXPECT_MATCH(computation_status.status().ToString(),
-               testing::ContainsRegex(
-                   "Expected non-opaque argument for lhs of binary operation"));
+  EXPECT_THAT(computation_status.status().ToString(),
+              ::testing::ContainsRegex(
+                  "Expected non-opaque argument for lhs of binary operation"));
 }

 // Regression test for b/31927799.
"slice - y" is fused and requires implicit @@ -1638,7 +1845,7 @@ TEST_F(ArrayElementwiseOpTest, ImplictBroadcastInFusedExpressions) { auto x = builder.Parameter(0, x_literal->shape(), "x"); auto y = builder.Parameter(1, y_literal->shape(), "y"); - auto slice = builder.Slice(x, {1}, {2}); + auto slice = builder.Slice(x, {1}, {2}, {1}); builder.Sub(slice, y); ComputeAndCompareR1(&builder, {-2, -3}, {x_data.get(), y_data.get()}, @@ -1654,7 +1861,9 @@ INSTANTIATE_TEST_CASE_P(ArrayElementwiseOpTestParamCount, int main(int argc, char** argv) { std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); + xla::legacy_flags::AppendUserComputationFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); if (!parse_result) { diff --git a/tensorflow/compiler/xla/tests/axpy_simple_test.cc b/tensorflow/compiler/xla/tests/axpy_simple_test.cc index adffac09e36..a1ca1de584f 100644 --- a/tensorflow/compiler/xla/tests/axpy_simple_test.cc +++ b/tensorflow/compiler/xla/tests/axpy_simple_test.cc @@ -18,6 +18,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" @@ -74,6 +75,7 @@ TEST_F(AxpySimpleTest, AxpyTenValues) { int main(int argc, char** argv) { std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc b/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc index c7b533b80f1..ea58491038c 100644 --- a/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc +++ b/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc @@ -22,13 +22,13 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/statusor.h" -#include "tensorflow/compiler/xla/test_helpers.h" +#include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/test.h" namespace xla { namespace { @@ -45,8 +45,8 @@ TEST_F(BadRngShapeValidationTest, DefaultConstructedShapeCreatesError) { StatusOr computation = builder.Build(); EXPECT_FALSE(computation.ok()); LOG(INFO) << "status received: " << computation.status(); - EXPECT_MATCH(computation.status().error_message(), - testing::HasSubstr("shape has invalid")); + EXPECT_THAT(computation.status().error_message(), + ::testing::HasSubstr("shape has invalid")); } TEST_F(BadRngShapeValidationTest, ShapeWithoutLayoutIsOk) { @@ -69,6 +69,7 @@ TEST_F(BadRngShapeValidationTest, ShapeWithoutLayoutIsOk) { int main(int argc, char** argv) { std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/xla/tests/batch_normalization_test.cc b/tensorflow/compiler/xla/tests/batch_normalization_test.cc index 598fd69909b..6a47f1b718a 100644 --- a/tensorflow/compiler/xla/tests/batch_normalization_test.cc +++ b/tensorflow/compiler/xla/tests/batch_normalization_test.cc @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" @@ -194,6 +195,7 @@ TEST_F(BatchNormalizationTest, SpecComparisonForward) { int main(int argc, char** argv) { std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/xla/tests/binop_scaling_test.cc b/tensorflow/compiler/xla/tests/binop_scaling_test.cc index e825bd435b6..5e3b70702dd 100644 --- a/tensorflow/compiler/xla/tests/binop_scaling_test.cc +++ b/tensorflow/compiler/xla/tests/binop_scaling_test.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/reference_util.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" @@ -141,6 +142,7 @@ TEST_F(BinopScalingTest, R4PlusR0S32) { int main(int argc, char** argv) { std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/xla/tests/broadcast_simple_test.cc b/tensorflow/compiler/xla/tests/broadcast_simple_test.cc index 200d4d45634..25fe04a930e 100644 --- a/tensorflow/compiler/xla/tests/broadcast_simple_test.cc +++ b/tensorflow/compiler/xla/tests/broadcast_simple_test.cc @@ -22,18 +22,92 @@ limitations under the License. #include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/user_computation_flags.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/statusor.h" -#include "tensorflow/compiler/xla/test_helpers.h" +#include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" -#include "tensorflow/core/platform/test.h" namespace xla { namespace { -using BroadcastSimpleTest = ClientLibraryTestBase; +class BroadcastSimpleTest : public ClientLibraryTestBase { + public: + ComputationDataHandle BuildBinOp(HloOpcode op, + const ComputationDataHandle& lhs, + const ComputationDataHandle& rhs, + ComputationBuilder* builder) { + switch (op) { + case HloOpcode::kMinimum: { + return builder->Min(lhs, rhs); + } + case HloOpcode::kMaximum: { + return builder->Max(lhs, rhs); + } + case HloOpcode::kMultiply: { + return builder->Mul(lhs, rhs); + } + default: { + // Default to Add + return builder->Add(lhs, rhs); + } + } + } + + std::unique_ptr MakeR3Data( + tensorflow::gtl::ArraySlice bounds, + tensorflow::gtl::ArraySlice minor_to_major, Shape* r3_shape, + Array3D* r3_array, float start, float end, int seed) { + *r3_shape = ShapeUtil::MakeShapeWithLayout(F32, bounds, minor_to_major); + r3_array->FillRandom(start, end, seed); + auto r3_data = + LiteralUtil::Relayout(*LiteralUtil::CreateR3FromArray3D(*r3_array), + LayoutUtil::MakeLayout(minor_to_major)); + std::unique_ptr r3_global_data = + client_->TransferToServer(*r3_data).ConsumeValueOrDie(); + return r3_global_data; + } + + std::unique_ptr MakeR2Data( + tensorflow::gtl::ArraySlice bounds, + tensorflow::gtl::ArraySlice minor_to_major, Shape* r2_shape, + Array2D* r2_array, float start, float end, int seed) { + *r2_shape = ShapeUtil::MakeShapeWithLayout(F32, bounds, minor_to_major); + r2_array->FillRandom(start, end, seed); + auto r2_data = + LiteralUtil::Relayout(*LiteralUtil::CreateR2FromArray2D(*r2_array), + LayoutUtil::MakeLayout(minor_to_major)); + std::unique_ptr r2_global_data = + 
client_->TransferToServer(*r2_data).ConsumeValueOrDie(); + return r2_global_data; + } + + float ApplyOpToFloats(HloOpcode op, float lhs, float rhs) { + switch (op) { + case HloOpcode::kMinimum: { + return std::min(lhs, rhs); + } + case HloOpcode::kMaximum: { + return std::max(lhs, rhs); + } + case HloOpcode::kMultiply: { + return lhs * rhs; + } + case HloOpcode::kAdd: { + return lhs + rhs; + } + default: { + // Default to Add + CHECK(false); + } + } + } +}; + +using ::testing::HasSubstr; XLA_TEST_F(BroadcastSimpleTest, ScalarNoOpBroadcast) { ComputationBuilder b(client_, TestName()); @@ -48,6 +122,19 @@ XLA_TEST_F(BroadcastSimpleTest, ScalarTo2D_2x3) { ComputeAndCompareR2(&b, expected, {}, ErrorSpec(0.0001)); } +XLA_TEST_F(BroadcastSimpleTest, ScalarParamTo2D_2x3) { + ComputationBuilder b(client_, TestName()); + ComputationDataHandle src; + std::unique_ptr param_data = + CreateR0Parameter(2.25f, /*parameter_number=*/0, /*name=*/"src", + /*builder=*/&b, /*data_handle=*/&src); + + b.Broadcast(src, {2, 3}); + Array2D expected(2, 3, 2.25); + ComputeAndCompareR2(&b, expected, {param_data.get()}, + ErrorSpec(0.0001)); +} + XLA_TEST_F(BroadcastSimpleTest, ScalarTo2D_2x0) { ComputationBuilder b(client_, TestName()); b.Broadcast(b.ConstantR0(2.25), {2, 0}); @@ -76,6 +163,33 @@ XLA_TEST_F(BroadcastSimpleTest, 1DTo2D) { ComputeAndCompareR2(&b, expected, {}, ErrorSpec(0.0001)); } +// Tests implicit broadcasting of PREDs. +XLA_TEST_F(BroadcastSimpleTest, LogicalAnd2DTo3D_Pred) { + ComputationBuilder b(client_, TestName()); + + Array2D x_vals(2, 1); + x_vals(0, 0) = true; + x_vals(1, 0) = false; + Array3D y_vals(2, 2, 1); + y_vals(0, 0, 0) = false; + y_vals(0, 1, 0) = false; + y_vals(1, 0, 0) = true; + y_vals(1, 1, 0) = true; + + ComputationDataHandle x, y; + auto x_data = CreateR2Parameter(x_vals, 0, "x", &b, &x); + auto y_data = CreateR3Parameter(y_vals, 1, "y", &b, &y); + b.LogicalAnd(x, y, /*broadcast_dimensions=*/{1, 2}); + + Array3D expected(2, 2, 1); + expected(0, 0, 0) = false; + expected(0, 1, 0) = false; + expected(1, 0, 0) = true; + expected(1, 1, 0) = false; + + ComputeAndCompareR3(&b, expected, {x_data.get(), y_data.get()}); +} + XLA_TEST_F(BroadcastSimpleTest, ZeroElement_1DTo2D) { ComputationBuilder b(client_, TestName()); b.Broadcast(b.ConstantR1({}), {2}); @@ -114,6 +228,434 @@ XLA_TEST_F(BroadcastSimpleTest, InDimensionAndDegenerateBroadcasting) { ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001)); } +struct R3ImplicitBroadcastSpec { + std::array output_bounds; + std::array minor2major_layout; + std::array input_bounds; + HloOpcode op; +} kR3ImplicitBroadcastTestCases[] = { + {{{1, 1, 1}}, {{2, 1, 0}}, {{1, 1, 1}}, HloOpcode::kAdd}, + {{{3, 4, 5}}, {{2, 1, 0}}, {{1, 1, 5}}, HloOpcode::kMaximum}, + {{{3, 4, 5}}, {{2, 1, 0}}, {{1, 4, 1}}, HloOpcode::kMinimum}, + {{{3, 4, 5}}, {{2, 1, 0}}, {{3, 1, 1}}, HloOpcode::kMultiply}, + {{{3, 4, 5}}, {{2, 1, 0}}, {{1, 1, 1}}, HloOpcode::kAdd}, + {{{3, 4, 5}}, {{2, 1, 0}}, {{1, 4, 5}}, HloOpcode::kAdd}, + {{{3, 4, 5}}, {{2, 1, 0}}, {{3, 4, 1}}, HloOpcode::kAdd}, + {{{3, 4, 5}}, {{2, 1, 0}}, {{3, 1, 5}}, HloOpcode::kAdd}, + {{{3, 199, 5}}, {{2, 1, 0}}, {{1, 199, 1}}, HloOpcode::kMinimum}, + {{{3, 4, 199}}, {{2, 1, 0}}, {{1, 1, 199}}, HloOpcode::kAdd}, +}; + +class BroadcastR3ImplicitTest + : public BroadcastSimpleTest, + public ::testing::WithParamInterface {}; + +XLA_TEST_P(BroadcastR3ImplicitTest, Doit) { + const R3ImplicitBroadcastSpec& spec = GetParam(); + ComputationBuilder builder(client_, TestName()); + + Shape r3_shape, 
r3_implicit_shape; + Array3D r3_array(spec.output_bounds[0], spec.output_bounds[1], + spec.output_bounds[2]); + Array3D r3_implicit_array(spec.input_bounds[0], spec.input_bounds[1], + spec.input_bounds[2]); + + std::unique_ptr r3_global_data = + MakeR3Data(spec.output_bounds, spec.minor2major_layout, &r3_shape, + &r3_array, 1.0, 2.5, 56789); + std::unique_ptr r3_implicit_global_data = + MakeR3Data(spec.input_bounds, spec.minor2major_layout, &r3_implicit_shape, + &r3_implicit_array, 1.0, 0.2, 56789); + + auto r3_implicit_parameter = builder.Parameter(0, r3_implicit_shape, "input"); + auto r3_parameter = builder.Parameter(1, r3_shape, "input"); + ComputationDataHandle op = + BuildBinOp(spec.op, r3_implicit_parameter, r3_parameter, &builder); + + Array3D expected_array(spec.output_bounds[0], spec.output_bounds[1], + spec.output_bounds[2]); + auto Each = ([&](tensorflow::gtl::ArraySlice indices, float* value) { + float r3_implicit = r3_implicit_array(indices[0] % spec.input_bounds[0], + indices[1] % spec.input_bounds[1], + indices[2] % spec.input_bounds[2]); + float r3 = r3_array(indices[0], indices[1], indices[2]); + *value = ApplyOpToFloats(spec.op, r3_implicit, r3); + }); + + int n1 = expected_array.n1(); + int n2 = expected_array.n2(); + int n3 = expected_array.n3(); + for (int64 i = 0; i < n1; i++) { + for (int64 j = 0; j < n2; j++) { + for (int64 k = 0; k < n3; k++) { + Each({i, j, k}, &expected_array(i, j, k)); + } + } + } + auto expected = LiteralUtil::CreateR3FromArray3D(expected_array); + ComputeAndCompareLiteral( + &builder, *expected, + {r3_implicit_global_data.get(), r3_global_data.get()}, + ErrorSpec(1e-7, 1e-7)); +} + +INSTANTIATE_TEST_CASE_P(BroadcastR3ImplicitTestInstances, + BroadcastR3ImplicitTest, + ::testing::ValuesIn(kR3ImplicitBroadcastTestCases)); + +// r1 and r3's dim0 matches, and r1's dim1 and dim2 have size 1: +XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_1_2) { + ComputationBuilder b(client_, TestName()); + ComputationDataHandle r1h; + ComputationDataHandle r3h; + + Array3D r1d = {{{1}}, {{2}}}; + Array3D r3d = {{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}; + auto r1 = CreateR3Parameter(r1d, 1, "r1", &b, &r1h); + auto r3 = CreateR3Parameter(r3d, 0, "r3", &b, &r3h); + + b.Add(r3h, r1h); + + auto expected = + LiteralUtil::CreateR3({{{2, 3}, {4, 5}}, {{7, 8}, {9, 10}}}); + + ComputeAndCompareLiteral(&b, *expected, {r3.get(), r1.get()}, + ErrorSpec(0.0001)); +} + +XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0_1) { + ComputationBuilder b(client_, TestName()); + auto r1 = b.ConstantLiteral(*LiteralUtil::CreateR3({{{1, 2}}})); + auto r3 = b.ConstantLiteral( + *LiteralUtil::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); + b.Add(r3, r1); + + auto expected = + LiteralUtil::CreateR3({{{2, 4}, {4, 6}}, {{6, 8}, {8, 10}}}); + + ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001)); +} + +XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0_2) { + ComputationBuilder b(client_, TestName()); + auto r1 = b.ConstantLiteral(*LiteralUtil::CreateR3({{{1}, {2}}})); + auto r3 = b.ConstantLiteral( + *LiteralUtil::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); + b.Add(r3, r1); + + auto expected = + LiteralUtil::CreateR3({{{2, 3}, {5, 6}}, {{6, 7}, {9, 10}}}); + + ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001)); +} + +XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0) { + ComputationBuilder b(client_, TestName()); + auto r1 = + b.ConstantLiteral(*LiteralUtil::CreateR3({{{1, 2}, {3, 4}}})); + auto r3 = b.ConstantLiteral( + *LiteralUtil::CreateR3({{{1, 
2}, {3, 4}}, {{5, 6}, {7, 8}}})); + b.Add(r3, r1); + + auto expected = + LiteralUtil::CreateR3({{{2, 4}, {6, 8}}, {{6, 8}, {10, 12}}}); + + ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001)); +} + +XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_1) { + ComputationBuilder b(client_, TestName()); + auto r1 = + b.ConstantLiteral(*LiteralUtil::CreateR3({{{1, 2}}, {{3, 4}}})); + auto r3 = b.ConstantLiteral( + *LiteralUtil::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); + b.Add(r3, r1); + + auto expected = + LiteralUtil::CreateR3({{{2, 4}, {4, 6}}, {{8, 10}, {10, 12}}}); + + ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001)); +} + +XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_2) { + ComputationBuilder b(client_, TestName()); + auto r1 = b.ConstantLiteral( + *LiteralUtil::CreateR3({{{1}, {2}}, {{3}, {4}}})); + auto r3 = b.ConstantLiteral( + *LiteralUtil::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); + b.Add(r3, r1); + + auto expected = + LiteralUtil::CreateR3({{{2, 3}, {5, 6}}, {{8, 9}, {11, 12}}}); + + ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001)); +} + +XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0_1_2) { + ComputationBuilder b(client_, TestName()); + auto r1 = b.ConstantLiteral(*LiteralUtil::CreateR3({{{1}}})); + auto r3 = b.ConstantLiteral( + *LiteralUtil::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); + b.Add(r3, r1); + + auto expected = + LiteralUtil::CreateR3({{{2, 3}, {4, 5}}, {{6, 7}, {8, 9}}}); + + ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001)); +} + +struct R2ImplicitBroadcastSpec { + std::array output_bounds; + std::array minor2major_layout; + std::array input_bounds1; + std::array input_bounds2; + HloOpcode op1; + HloOpcode op2; +} kR2ImplicitBroadcastTestCases[] = { + {{{2, 3}}, {{1, 0}}, {{2, 1}}, {{2, 1}}, HloOpcode::kAdd, HloOpcode::kAdd}, + {{{2, 3}}, {{1, 0}}, {{2, 1}}, {{1, 3}}, HloOpcode::kAdd, HloOpcode::kAdd}, + {{{2, 3}}, + {{1, 0}}, + {{2, 1}}, + {{1, 1}}, + HloOpcode::kAdd, + HloOpcode::kMinimum}, + {{{2, 3}}, + {{1, 0}}, + {{1, 3}}, + {{1, 1}}, + HloOpcode::kAdd, + HloOpcode::kMinimum}, + {{{2, 3}}, + {{1, 0}}, + {{1, 1}}, + {{1, 1}}, + HloOpcode::kAdd, + HloOpcode::kMinimum}, + {{{2, 3}}, {{0, 1}}, {{2, 1}}, {{2, 1}}, HloOpcode::kAdd, HloOpcode::kAdd}, + {{{150, 150}}, + {{1, 0}}, + {{150, 1}}, + {{150, 1}}, + HloOpcode::kAdd, + HloOpcode::kAdd}, + {{{150, 150}}, + {{1, 0}}, + {{150, 1}}, + {{1, 150}}, + HloOpcode::kAdd, + HloOpcode::kAdd}, + {{{150, 150}}, + {{1, 0}}, + {{150, 1}}, + {{1, 1}}, + HloOpcode::kAdd, + HloOpcode::kAdd}, + {{{50, 150}}, + {{1, 0}}, + {{50, 1}}, + {{50, 1}}, + HloOpcode::kAdd, + HloOpcode::kAdd}, + {{{50, 150}}, + {{1, 0}}, + {{50, 1}}, + {{1, 150}}, + HloOpcode::kAdd, + HloOpcode::kAdd}, + {{{50, 150}}, + {{1, 0}}, + {{50, 1}}, + {{1, 1}}, + HloOpcode::kAdd, + HloOpcode::kAdd}, + {{{150, 50}}, + {{1, 0}}, + {{150, 1}}, + {{150, 1}}, + HloOpcode::kAdd, + HloOpcode::kAdd}, + {{{150, 50}}, + {{1, 0}}, + {{150, 1}}, + {{1, 50}}, + HloOpcode::kAdd, + HloOpcode::kAdd}, + {{{150, 50}}, + {{1, 0}}, + {{150, 1}}, + {{1, 1}}, + HloOpcode::kAdd, + HloOpcode::kAdd}}; + +class BroadcastR2ImplicitTest + : public BroadcastSimpleTest, + public ::testing::WithParamInterface {}; + +// Test r2 op1 r2_implicit_1 op2 r2_implicit_2 +// where R2 is a rank-2 operand, and r2_implicit_2 are two +// rank-2 operands with degenerate dimensions: +XLA_TEST_P(BroadcastR2ImplicitTest, Doit) { + const R2ImplicitBroadcastSpec& spec = GetParam(); + + ComputationBuilder builder(client_, 
TestName()); + + // Operands with degenerate dimensions require implicit broadcasting: + Shape r2_shape, r2_implicit_shape1, r2_implicit_shape2; + Array2D r2_array(spec.output_bounds[0], spec.output_bounds[1]); + Array2D r2_implicit_array1(spec.input_bounds1[0], + spec.input_bounds1[1]); + Array2D r2_implicit_array2(spec.input_bounds2[0], + spec.input_bounds2[1]); + + std::unique_ptr r2_global_data = + MakeR2Data(spec.output_bounds, spec.minor2major_layout, &r2_shape, + &r2_array, 1.0, 2.5, 56789); + std::unique_ptr r2_implicit_global_data1 = + MakeR2Data(spec.input_bounds1, spec.minor2major_layout, + &r2_implicit_shape1, &r2_implicit_array1, 1.0, 0.2, 56789); + std::unique_ptr r2_implicit_global_data2 = + MakeR2Data(spec.input_bounds2, spec.minor2major_layout, + &r2_implicit_shape2, &r2_implicit_array2, 0.8, 0.4, 56789); + + auto r2_implicit_parameter1 = + builder.Parameter(0, r2_implicit_shape1, "input0"); + auto r2_parameter = builder.Parameter(1, r2_shape, "input1"); + auto r2_implicit_parameter2 = + builder.Parameter(2, r2_implicit_shape2, "input2"); + + ComputationDataHandle op1 = + BuildBinOp(spec.op1, r2_implicit_parameter1, r2_parameter, &builder); + ComputationDataHandle op2 = + BuildBinOp(spec.op2, op1, r2_implicit_parameter2, &builder); + + Array2D expected_array(spec.output_bounds[0], spec.output_bounds[1]); + + expected_array.Each([&](int64 i, int64 j, float* v) { + float v1 = r2_implicit_array1(i % spec.input_bounds1[0], + j % spec.input_bounds1[1]); + float v2 = r2_array(i, j); + float v3 = r2_implicit_array2(i % spec.input_bounds2[0], + j % spec.input_bounds2[1]); + float tmp = ApplyOpToFloats(spec.op1, v1, v2); + *v = ApplyOpToFloats(spec.op2, tmp, v3); + }); + + auto expected = LiteralUtil::CreateR2FromArray2D(expected_array); + ComputeAndCompareLiteral( + &builder, *expected, + {r2_implicit_global_data1.get(), r2_global_data.get(), + r2_implicit_global_data2.get()}, + ErrorSpec(1e-6, 1e-6)); +} + +INSTANTIATE_TEST_CASE_P(BroadcastR2ImplicitTestInstances, + BroadcastR2ImplicitTest, + ::testing::ValuesIn(kR2ImplicitBroadcastTestCases)); + +XLA_TEST_F(BroadcastSimpleTest, Add2DTo2DDegenerate_0) { + ComputationBuilder b(client_, TestName()); + auto r1 = b.ConstantLiteral(*LiteralUtil::CreateR2({{1, 2}})); + auto r2 = b.ConstantLiteral(*LiteralUtil::CreateR2({{1, 2}, {3, 4}})); + b.Add(r2, r1); + + auto expected = LiteralUtil::CreateR2({{2, 4}, {4, 6}}); + + ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001)); +} + +XLA_TEST_F(BroadcastSimpleTest, Add2DTo2DDegenerate_1) { + ComputationBuilder b(client_, TestName()); + auto r1 = b.ConstantLiteral(*LiteralUtil::CreateR2({{1}, {2}})); + auto r2 = b.ConstantLiteral(*LiteralUtil::CreateR2({{1, 2}, {3, 4}})); + b.Add(r2, r1); + + auto expected = LiteralUtil::CreateR2({{2, 3}, {5, 6}}); + + ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001)); +} + +XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDim0) { + ComputationBuilder b(client_, TestName()); + auto r1 = b.ConstantR1({10, 20}); + auto r3 = b.ConstantLiteral( + *LiteralUtil::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); + b.Add(r3, r1, {0}); + + auto expected = LiteralUtil::CreateR3( + {{{11, 12}, {13, 14}}, {{25, 26}, {27, 28}}}); + + ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001)); +} + +XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDim1) { + ComputationBuilder b(client_, TestName()); + auto r1 = b.ConstantR1({10, 20}); + auto r3 = b.ConstantLiteral( + *LiteralUtil::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); + b.Add(r1, r3, {1}); + + auto 
expected = LiteralUtil::CreateR3( + {{{11, 12}, {23, 24}}, {{15, 16}, {27, 28}}}); + + ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001)); +} + +XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDim2) { + ComputationBuilder b(client_, TestName()); + auto r1 = b.ConstantR1({10, 20}); + auto r3 = b.ConstantLiteral( + *LiteralUtil::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); + b.Add(r1, r3, {2}); + + auto expected = LiteralUtil::CreateR3( + {{{11, 22}, {13, 24}}, {{15, 26}, {17, 28}}}); + + ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001)); +} + +XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDimAll) { + ComputationBuilder b(client_, TestName()); + auto r1_0 = b.ConstantR1({1000, 2000}); + auto r1_1 = b.ConstantR1({100, 200}); + auto r1_2 = b.ConstantR1({10, 20}); + auto r3 = b.ConstantLiteral( + *LiteralUtil::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); + for (int i = 0; i < 3; ++i) { + r3 = b.Add(r1_0, r3, {0}); + r3 = b.Add(r3, r1_1, {1}); + r3 = b.Add(r1_2, r3, {2}); + } + r3 = b.Mul(r3, b.ConstantR0(-2)); + + auto expected = LiteralUtil::CreateR3( + {{{-6 * 1110 - 2, -6 * 1120 - 4}, {-6 * 1210 - 6, -6 * 1220 - 8}}, + {{-6 * 2110 - 10, -6 * 2120 - 12}, {-6 * 2210 - 14, -6 * 2220 - 16}}}); + + ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001)); +} + +XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDimAllWithScalarBroadcast) { + ComputationBuilder b(client_, TestName()); + auto r1_0 = b.ConstantR1({1000, 2000}); + auto r1_1 = b.ConstantR1({100, 200}); + auto r1_2 = b.ConstantR1({10, 20}); + auto r0 = b.ConstantR0(3); + auto r3 = b.Broadcast(r0, {2, 2, 2}); + for (int i = 0; i < 3; ++i) { + r3 = b.Add(r1_0, r3, {0}); + r3 = b.Add(r3, r1_1, {1}); + r3 = b.Add(r1_2, r3, {2}); + } + r3 = b.Mul(r3, b.ConstantR0(-1)); + + auto expected = LiteralUtil::CreateR3( + {{{-3 * 1110 - 3, -3 * 1120 - 3}, {-3 * 1210 - 3, -3 * 1220 - 3}}, + {{-3 * 2110 - 3, -3 * 2120 - 3}, {-3 * 2210 - 3, -3 * 2220 - 3}}}); + + ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001)); +} + XLA_TEST_F(BroadcastSimpleTest, InvalidBinaryAndDegenerateBroadcasting) { // Binary dimension broadcasting of the smaller lhs ([2, 2] up to [2, 2, 2]) // results in a shape incompatible with the lhs [2, 3, 1]. 
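The three Invalid* hunks that follow pin down when XLA rejects a broadcast. As a quick reference, the two broadcast forms these tests exercise look like this in the `ComputationBuilder` API used above (a sketch distilled from the tests in this file, not new patch content; it assumes the `client_` member and headers already present here):

```c++
ComputationBuilder b(client_, "broadcast_forms");
auto r3 = b.ConstantLiteral(
    *LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));

// In-dimension broadcasting: an explicit broadcast_dimensions argument maps
// the rank-1 operand onto dimension 0 of the rank-3 operand.
auto r1 = b.ConstantR1<float>({10, 20});
b.Add(r3, r1, /*broadcast_dimensions=*/{0});

// Degenerate-dimension (implicit) broadcasting: same-rank operands, with the
// size-1 dimensions of the smaller shape [1, 2, 1] stretched to match
// [2, 2, 2]. No broadcast_dimensions argument is needed.
auto degenerate =
    b.ConstantLiteral(*LiteralUtil::CreateR3<float>({{{1}, {2}}}));
b.Add(r3, degenerate);
```

The Invalid* tests check the failure side of both forms: mismatched non-degenerate dimensions must produce an error rather than silently stretching.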
@@ -126,8 +668,8 @@ XLA_TEST_F(BroadcastSimpleTest, InvalidBinaryAndDegenerateBroadcasting) { auto result_status = Execute(&b, {}); EXPECT_FALSE(result_status.ok()); - EXPECT_MATCH(result_status.status().error_message(), - testing::ContainsRegex("broadcast dimension 0 mismatch")); + EXPECT_THAT(result_status.status().error_message(), + HasSubstr("broadcast dimension 0 mismatch")); } XLA_TEST_F(BroadcastSimpleTest, InvalidInDimensionBroadcasting) { @@ -139,9 +681,8 @@ XLA_TEST_F(BroadcastSimpleTest, InvalidInDimensionBroadcasting) { auto result_status = Execute(&b, {}); EXPECT_FALSE(result_status.ok()); - EXPECT_MATCH( - result_status.status().error_message(), - testing::ContainsRegex("binary op BINOP_ADD with incompatible shapes")); + EXPECT_THAT(result_status.status().error_message(), + HasSubstr("binary op BINOP_ADD with incompatible shapes")); } XLA_TEST_F(BroadcastSimpleTest, InvalidDegenerateBroadcasting) { @@ -153,9 +694,8 @@ XLA_TEST_F(BroadcastSimpleTest, InvalidDegenerateBroadcasting) { auto result_status = Execute(&b, {}); EXPECT_FALSE(result_status.ok()); - EXPECT_MATCH( - result_status.status().error_message(), - testing::ContainsRegex("binary op BINOP_ADD with incompatible shapes")); + EXPECT_THAT(result_status.status().error_message(), + HasSubstr("binary op BINOP_ADD with incompatible shapes")); } } // namespace @@ -163,7 +703,9 @@ XLA_TEST_F(BroadcastSimpleTest, InvalidDegenerateBroadcasting) { int main(int argc, char** argv) { std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); + xla::legacy_flags::AppendUserComputationFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); if (!parse_result) { diff --git a/tensorflow/compiler/xla/tests/broadcast_test.cc b/tensorflow/compiler/xla/tests/broadcast_test.cc index 1796a732e54..96a329a9bd8 100644 --- a/tensorflow/compiler/xla/tests/broadcast_test.cc +++ b/tensorflow/compiler/xla/tests/broadcast_test.cc @@ -17,6 +17,7 @@ limitations under the License. #include #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" @@ -43,7 +44,7 @@ XLA_TEST_F(BroadcastTest, BroadcastScalarToScalar) { ShapeUtil::MakeShape(F32, {}), input, {})); // Create HLO module, compile, and execute. - auto hlo_module = MakeUnique(TestName()); + auto hlo_module = CreateNewModule(); hlo_module->AddEntryComputation(builder.Build()); auto result = ExecuteAndTransfer(std::move(hlo_module), {}); @@ -59,7 +60,7 @@ XLA_TEST_F(BroadcastTest, BroadcastScalarTo2D) { ShapeUtil::MakeShape(F32, {2, 2}), input, {})); // Create HLO module, compile, and execute. - auto hlo_module = MakeUnique(TestName()); + auto hlo_module = CreateNewModule(); hlo_module->AddEntryComputation(builder.Build()); auto result = ExecuteAndTransfer(std::move(hlo_module), {}); @@ -82,7 +83,7 @@ XLA_TEST_F(BroadcastTest, BroadcastVectorTo2D) { builder.AddInstruction(HloInstruction::CreateTuple({element1, element2})); // Create HLO module, compile, and execute. 
- auto hlo_module = MakeUnique(TestName()); + auto hlo_module = CreateNewModule(); hlo_module->AddEntryComputation(builder.Build()); auto result = ExecuteAndTransfer(std::move(hlo_module), {}); @@ -103,7 +104,7 @@ XLA_TEST_F(BroadcastTest, Broadcast2DTo2D) { ShapeUtil::MakeShape(F32, {2, 2}), input, {0, 1})); // Create HLO module, compile, and execute. - auto hlo_module = MakeUnique(TestName()); + auto hlo_module = CreateNewModule(); hlo_module->AddEntryComputation(builder.Build()); auto result = ExecuteAndTransfer(std::move(hlo_module), {}); @@ -122,7 +123,7 @@ XLA_TEST_F(BroadcastTest, Broadcast2DTo2DTranspose) { ShapeUtil::MakeShape(F32, {2, 2}), input, {1, 0})); // Create HLO module, compile, and execute. - auto hlo_module = MakeUnique(TestName()); + auto hlo_module = CreateNewModule(); hlo_module->AddEntryComputation(builder.Build()); auto result = ExecuteAndTransfer(std::move(hlo_module), {}); @@ -139,7 +140,7 @@ XLA_TEST_F(BroadcastTest, Broadcast2DTo3D) { ShapeUtil::MakeShape(F32, {2, 3, 2}), input, {0, 2})); // Create HLO module, compile, and execute. - auto hlo_module = MakeUnique(TestName()); + auto hlo_module = CreateNewModule(); hlo_module->AddEntryComputation(builder.Build()); auto result = ExecuteAndTransfer(std::move(hlo_module), {}); @@ -159,7 +160,7 @@ TEST_F(BroadcastTest, Broadcast_R1_2_To_R4_2x2x3x3) { ShapeUtil::MakeShape(F32, {2, 2, 3, 3}), input, {1})); // Create HLO module, compile, and execute. - auto hlo_module = MakeUnique(TestName()); + auto hlo_module = CreateNewModule(); hlo_module->AddEntryComputation(builder.Build()); auto result = ExecuteAndTransfer(std::move(hlo_module), {}); @@ -184,12 +185,12 @@ TEST_F(BroadcastTest, Broadcast_R1_1025_To_R4_3x3x3x1025) { ShapeUtil::MakeShape(F32, {3, 3, 3, r1_size}), input, {3})); // Create HLO module, compile, and execute. - auto hlo_module = MakeUnique(TestName()); + auto hlo_module = CreateNewModule(); hlo_module->AddEntryComputation(builder.Build()); auto result = ExecuteAndTransfer(std::move(hlo_module), {}); Array4D expected(3, 3, 3, 1025); - Array2D yx(/*height=*/3, /*width=*/r1_size); + Array2D yx(3, r1_size); for (int64 y = 0; y < 3; ++y) { for (int64 x = 0; x < r1_size; ++x) { yx(y, x) = input_data[x]; @@ -215,7 +216,7 @@ XLA_TEST_F(BroadcastTest, Broadcast_R1_64_To_R4_32x64x7x7) { ShapeUtil::MakeShape(F32, {32, 64, 7, 7}), input, {1})); // Create HLO module, compile, and execute. - auto hlo_module = MakeUnique(TestName()); + auto hlo_module = CreateNewModule(); hlo_module->AddEntryComputation(builder.Build()); auto result = ExecuteAndTransfer(std::move(hlo_module), {}); @@ -231,7 +232,7 @@ TEST_F(BroadcastTest, Broadcast_R0_to_R4_64x64x3x3) { ShapeUtil::MakeShape(F32, {64, 64, 3, 3}), input, {})); // Create HLO module, compile, and execute. - auto hlo_module = MakeUnique(TestName()); + auto hlo_module = CreateNewModule(); hlo_module->AddEntryComputation(builder.Build()); LOG(INFO) << hlo_module->ToString(); auto result = ExecuteAndTransfer(std::move(hlo_module), {}); @@ -254,7 +255,7 @@ TEST_F(BroadcastTest, Broadcast_R2_2x2_To_R4_3x3x2x2) { ShapeUtil::MakeShape(F32, {3, 3, 2, 2}), input, {2, 3})); // Create HLO module, compile, and execute. 
- auto hlo_module = MakeUnique(TestName()); + auto hlo_module = CreateNewModule(); hlo_module->AddEntryComputation(builder.Build()); auto result = ExecuteAndTransfer(std::move(hlo_module), {}); @@ -265,12 +266,44 @@ TEST_F(BroadcastTest, Broadcast_R2_2x2_To_R4_3x3x2x2) { *LiteralUtil::CreateR4FromArray4D(expected), *result, error_spec_); } +TEST_F(BroadcastTest, Broadcast_R3_2x3x4_to_R4_2x3x4x5) { + auto builder = HloComputation::Builder(TestName()); + Array3D input_vals(2, 3, 4); + input_vals.FillRandom(1.0); + + Array4D expected(2, 3, 4, 5); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 4; ++k) { + for (int m = 0; m < 5; ++m) { + expected(i, j, k, m) = input_vals(i, j, k); + } + } + } + } + auto input = builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR3FromArray3D(input_vals))); + + // Broadcast vector in dimensions 2 and 3. + builder.AddInstruction(HloInstruction::CreateBroadcast( + ShapeUtil::MakeShape(F32, {2, 3, 4, 5}), input, {0, 1, 2})); + + // Create HLO module, compile, and execute. + auto hlo_module = CreateNewModule(); + hlo_module->AddEntryComputation(builder.Build()); + auto result = ExecuteAndTransfer(std::move(hlo_module), {}); + + LiteralTestUtil::ExpectNear( + *LiteralUtil::CreateR4FromArray4D(expected), *result, error_spec_); +} + } // namespace } // namespace xla int main(int argc, char** argv) { std::vector flag_list; xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); if (!parse_result) { diff --git a/tensorflow/compiler/xla/tests/build_defs.bzl b/tensorflow/compiler/xla/tests/build_defs.bzl index 2c7eeb820d3..1f61743451a 100644 --- a/tensorflow/compiler/xla/tests/build_defs.bzl +++ b/tensorflow/compiler/xla/tests/build_defs.bzl @@ -102,7 +102,7 @@ def xla_test(name, elif backend == "cpu_parallel": backend_deps = ["//tensorflow/compiler/xla/service:cpu_plugin"] backend_deps += ["//tensorflow/compiler/xla/tests:test_macros_cpu"] - this_backend_args += ["--xla_cpu_parallel=true"] + this_backend_args += ["--xla_backend_extra_options=\"xla_cpu_parallel\""] elif backend == "gpu": backend_deps = ["//tensorflow/compiler/xla/service:gpu_plugin"] backend_deps += ["//tensorflow/compiler/xla/tests:test_macros_gpu"] diff --git a/tensorflow/compiler/xla/tests/call_test.cc b/tensorflow/compiler/xla/tests/call_test.cc index 0b5e6d51277..55701c62db2 100644 --- a/tensorflow/compiler/xla/tests/call_test.cc +++ b/tensorflow/compiler/xla/tests/call_test.cc @@ -19,8 +19,10 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/computation.h" #include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" @@ -117,6 +119,7 @@ XLA_TEST_F(CallOpTest, DISABLED_ON_GPU(CallR0F32Tuple)) { int main(int argc, char** argv) { std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/xla/tests/check_execution_arity_test.cc b/tensorflow/compiler/xla/tests/check_execution_arity_test.cc index 675c9fccb00..4825eaf19dc 100644 --- a/tensorflow/compiler/xla/tests/check_execution_arity_test.cc +++ b/tensorflow/compiler/xla/tests/check_execution_arity_test.cc @@ -19,18 +19,21 @@ limitations under the License. #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/test_macros.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/platform/test.h" namespace xla { namespace { +using ::testing::ContainsRegex; + class CheckExecutionArityTest : public ClientLibraryTestBase {}; TEST_F(CheckExecutionArityTest, TwoParamComputationNumArguments) { @@ -60,15 +63,15 @@ TEST_F(CheckExecutionArityTest, TwoParamComputationNumArguments) { ASSERT_FALSE(result_one_arg.ok()); ASSERT_EQ(result_one_arg.status().code(), tensorflow::error::INVALID_ARGUMENT); - ASSERT_MATCH(result_one_arg.status().error_message(), - testing::ContainsRegex("takes 2")); + ASSERT_THAT(result_one_arg.status().error_message(), + ContainsRegex("takes 2")); auto result_zero_args = client_->Execute(computation, {}); ASSERT_FALSE(result_zero_args.ok()); ASSERT_EQ(result_zero_args.status().code(), tensorflow::error::INVALID_ARGUMENT); - ASSERT_MATCH(result_zero_args.status().error_message(), - testing::ContainsRegex("takes 2")); + ASSERT_THAT(result_zero_args.status().error_message(), + ContainsRegex("takes 2")); } XLA_TEST_F(CheckExecutionArityTest, CheckArgumentShapes) { @@ -99,22 +102,22 @@ XLA_TEST_F(CheckExecutionArityTest, CheckArgumentShapes) { status = client_->Execute(computation, {f32_4_data.get(), f32_4_data.get()}); ASSERT_FALSE(status.ok()); ASSERT_EQ(status.status().code(), tensorflow::error::INVALID_ARGUMENT); - ASSERT_MATCH(status.status().error_message(), - testing::ContainsRegex("expects parameter 0")); + ASSERT_THAT(status.status().error_message(), + ContainsRegex("expects parameter 0")); // Shape mismatch in parameter 1 (rank) status = 
client_->Execute(computation, {f32_data.get(), f32_data.get()}); ASSERT_FALSE(status.ok()); ASSERT_EQ(status.status().code(), tensorflow::error::INVALID_ARGUMENT); - ASSERT_MATCH(status.status().error_message(), - testing::ContainsRegex("expects parameter 1")); + ASSERT_THAT(status.status().error_message(), + ContainsRegex("expects parameter 1")); // Shape mismatch in parameter 1 (element type) status = client_->Execute(computation, {f32_data.get(), u8_4_data.get()}); ASSERT_FALSE(status.ok()); ASSERT_EQ(status.status().code(), tensorflow::error::INVALID_ARGUMENT); - ASSERT_MATCH(status.status().error_message(), - testing::ContainsRegex("expects parameter 1")); + ASSERT_THAT(status.status().error_message(), + ContainsRegex("expects parameter 1")); } } // namespace @@ -122,6 +125,7 @@ XLA_TEST_F(CheckExecutionArityTest, CheckArgumentShapes) { int main(int argc, char** argv) { std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.cc b/tensorflow/compiler/xla/tests/client_library_test_base.cc index 7bf1168dc39..b96bb8f8469 100644 --- a/tensorflow/compiler/xla/tests/client_library_test_base.cc +++ b/tensorflow/compiler/xla/tests/client_library_test_base.cc @@ -20,7 +20,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/computation.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/legacy_flags/hlo_pass_pipeline_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -44,14 +44,19 @@ Client* GetOrCreateLocalClientOrDie(se::Platform* platform) { } } // namespace -ClientLibraryTestBase::ClientLibraryTestBase( - se::Platform* platform, - tensorflow::gtl::ArraySlice disabled_pass_names) +ClientLibraryTestBase::ClientLibraryTestBase(se::Platform* platform) : client_(GetOrCreateLocalClientOrDie(platform)) { - legacy_flags::HloPassPipelineFlags* flags = - legacy_flags::GetHloPassPipelineFlags(); - flags->xla_disable_hlo_passes = - tensorflow::str_util::Join(disabled_pass_names, ","); + *(execution_options_.mutable_debug_options()) = + legacy_flags::GetDebugOptionsFromFlags(); + + // Disabling constant_folding so that tests (usually written using Constants) + // will exercise the intended code paths, instead of being constant folded. + // + // TODO(b/38354253): Constant folding is currently disabled. Change tests to + // use Parameters instead of Constants, and re-enable constant folding by + // default. 
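+  // (Tests that do want folding can drop the entry again through
+  // mutable_debug_options()->clear_xla_disable_hlo_passes(); the clear_*
+  // accessor is generated for every repeated proto field.)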
+ execution_options_.mutable_debug_options()->add_xla_disable_hlo_passes( + "constant_folding"); } string ClientLibraryTestBase::TestName() const { @@ -179,7 +184,7 @@ void ClientLibraryTestBase::ComputeAndCompareR1U8( VLOG(1) << "expected: " << LiteralUtil::ToString(*expected_literal); VLOG(1) << "actual: " << LiteralUtil::ToString(*actual); - EXPECT_EQ(expected, actual->u8s()); + EXPECT_EQ(expected, actual->u8s_string()); } void ClientLibraryTestBase::ComputeAndCompareTuple( diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h index 026f487c2df..f9e1082ebb4 100644 --- a/tensorflow/compiler/xla/tests/client_library_test_base.h +++ b/tensorflow/compiler/xla/tests/client_library_test_base.h @@ -46,14 +46,22 @@ namespace xla { class ClientLibraryTestBase : public ::testing::Test { protected: explicit ClientLibraryTestBase( - perftools::gputools::Platform* platform = nullptr, - tensorflow::gtl::ArraySlice disabled_pass_names = {}); + perftools::gputools::Platform* platform = nullptr); // Returns the name of the test currently being run. string TestName() const; void SetFastMathDisabled(bool disabled) { - execution_options_.set_disable_fast_math(disabled); + execution_options_.mutable_debug_options()->set_xla_enable_fast_math( + !disabled); + } + + void SetSeed(uint64 seed) { execution_options_.set_seed(seed); } + + // Provides mutable access to the execution DebugOptions field; this lets + // tests tweak the options that will be used to compile/run the graph. + DebugOptions* mutable_debug_options() { + return execution_options_.mutable_debug_options(); } // TODO(b/25566808): Add helper that populates a literal from a testdata file. @@ -216,6 +224,16 @@ class ClientLibraryTestBase : public ::testing::Test { const int rows, const int cols, const int rows_padded, const int cols_padded); + // Create a parameter instruction that wraps a given value and then stores + // into "data_handle" the global handle for that parameter. + // + // "parameter_number" is the parameter number. + // "name" is the name of the parameter instruction. + template + std::unique_ptr CreateR0Parameter( + NativeT value, int64 parameter_number, const string& name, + ComputationBuilder* builder, ComputationDataHandle* data_handle); + // Create a parameter instruction that wraps the given values and then stores // into "data_handle" the global handle for that parameter. // @@ -370,6 +388,17 @@ void ClientLibraryTestBase::ComputeAndCompareR4( arguments, error); } +template +std::unique_ptr ClientLibraryTestBase::CreateR0Parameter( + NativeT value, int64 parameter_number, const string& name, + ComputationBuilder* builder, ComputationDataHandle* data_handle) { + std::unique_ptr literal = LiteralUtil::CreateR0(value); + std::unique_ptr data = + client_->TransferToServer(*literal).ConsumeValueOrDie(); + *data_handle = builder->Parameter(parameter_number, literal->shape(), name); + return data; +} + template std::unique_ptr ClientLibraryTestBase::CreateR1Parameter( tensorflow::gtl::ArraySlice values, int64 parameter_number, diff --git a/tensorflow/compiler/xla/tests/client_test.cc b/tensorflow/compiler/xla/tests/client_test.cc index 86ce636ee56..1247804dae0 100644 --- a/tensorflow/compiler/xla/tests/client_test.cc +++ b/tensorflow/compiler/xla/tests/client_test.cc @@ -20,6 +20,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/test_helpers.h" @@ -113,6 +114,7 @@ TEST_F(ClientTest, ExecuteWithTupleLayout) { int main(int argc, char** argv) { std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/xla/tests/codegen_test_base.cc b/tensorflow/compiler/xla/tests/codegen_test_base.cc index 81c0568ff92..cc3eb0e8d46 100644 --- a/tensorflow/compiler/xla/tests/codegen_test_base.cc +++ b/tensorflow/compiler/xla/tests/codegen_test_base.cc @@ -21,7 +21,6 @@ limitations under the License. #include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/backend.h" #include "tensorflow/compiler/xla/service/compiler.h" -#include "tensorflow/compiler/xla/service/hlo_module_config.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/io/path.h" @@ -43,33 +42,33 @@ void CodegenTestBase::CompileAndVerifyIr(std::unique_ptr hlo_module, std::unique_ptr CodegenTestBase::CompileToExecutable( std::unique_ptr hlo_module) { - auto module_config = MakeUnique( - hlo_module->entry_computation()->ComputeProgramShape()); - module_config->set_fast_math_disabled(fast_math_disabled_); return backend_->compiler() - ->Compile(std::move(hlo_module), std::move(module_config), - test_hlo_dumper_, backend_->default_stream_executor()) + ->Compile(std::move(hlo_module), test_hlo_dumper_, + backend_->default_stream_executor()) .ConsumeValueOrDie(); } void CodegenTestBase::RunFileCheck(const string& input, const string& pattern) { + using tensorflow::io::JoinPath; + // Write input to a temporary file. char tempdir_template[] = "/tmp/ir_testXXXXXX"; char* tempdir_name = mkdtemp(tempdir_template); CHECK_NOTNULL(tempdir_name); - string pattern_path = - tensorflow::io::JoinPath(tempdir_name, "xla_hlo_test_ir_pattern"); + string pattern_path = JoinPath(tempdir_name, "xla_hlo_test_ir_pattern"); TF_CHECK_OK(tensorflow::WriteStringToFile(tensorflow::Env::Default(), pattern_path, pattern)); // Invoke FileCheck to check whether input matches `pattern`. 
- tensorflow::SubProcess file_check_process; - const char* test_srcdir = getenv("TEST_SRCDIR"); - if (test_srcdir == nullptr) { - test_srcdir = "."; + const char* file_check_path_suffix = "external/llvm/FileCheck"; + string file_check_path; + if (const char* test_srcdir = getenv("TEST_SRCDIR")) { + file_check_path = JoinPath(test_srcdir, file_check_path_suffix); + } else { + file_check_path = file_check_path_suffix; } - string file_check_path = tensorflow::io::JoinPath( - test_srcdir, "external/llvm/FileCheck"); + + tensorflow::SubProcess file_check_process; file_check_process.SetProgram(file_check_path, {file_check_path, pattern_path}); file_check_process.SetChannelAction(tensorflow::CHAN_STDIN, diff --git a/tensorflow/compiler/xla/tests/codegen_test_base.h b/tensorflow/compiler/xla/tests/codegen_test_base.h index ba32aac8e4b..50c04531070 100644 --- a/tensorflow/compiler/xla/tests/codegen_test_base.h +++ b/tensorflow/compiler/xla/tests/codegen_test_base.h @@ -41,9 +41,6 @@ class CodegenTestBase : public HloTestBase { void CompileAndVerifyIr(std::unique_ptr hlo_module, const string& pattern); - // Sets the fast-math-disabled flag on the config we use when compiling. - void set_fast_math_disabled(bool disabled) { fast_math_disabled_ = disabled; } - protected: // Compiles hlo_module to an executable, CHECK-failing if this fails. std::unique_ptr CompileToExecutable( @@ -52,8 +49,6 @@ class CodegenTestBase : public HloTestBase { // Runs FileCheck with the given pattern over the given string and EXPECTs // that FileCheck succeeded in matching the input. void RunFileCheck(const string& input, const string& pattern); - - bool fast_math_disabled_ = false; }; } // namespace xla diff --git a/tensorflow/compiler/xla/tests/compilation_cache_test.cc b/tensorflow/compiler/xla/tests/compilation_cache_test.cc index 1d0df615824..18ea9714d1a 100644 --- a/tensorflow/compiler/xla/tests/compilation_cache_test.cc +++ b/tensorflow/compiler/xla/tests/compilation_cache_test.cc @@ -22,6 +22,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" @@ -203,6 +204,7 @@ XLA_TEST_F(CompilationCacheTest, MutatedComputation) { int main(int argc, char** argv) { std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/xla/tests/compute_constant_test.cc b/tensorflow/compiler/xla/tests/compute_constant_test.cc index 709ce5029c8..13c78fb1633 100644 --- a/tensorflow/compiler/xla/tests/compute_constant_test.cc +++ b/tensorflow/compiler/xla/tests/compute_constant_test.cc @@ -17,43 +17,75 @@ limitations under the License. 
#include #include +#include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/computation.h" #include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/global_data.h" -#include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/statusor.h" -#include "tensorflow/compiler/xla/test_helpers.h" -#include "tensorflow/compiler/xla/tests/client_library_test_base.h" +#include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" #include "tensorflow/compiler/xla/tests/test_utils.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" namespace xla { namespace { -class ComputeConstantTest : public ClientLibraryTestBase { +// An enumerator for the client types that we want to iterate over in +// the various tests. +enum class ClientType { kLocal, kCompileOnly }; +ClientType client_types[] = {ClientType::kLocal, ClientType::kCompileOnly}; + +class ComputeConstantTest : public ::testing::Test { public: + explicit ComputeConstantTest( + perftools::gputools::Platform* platform = nullptr) + : platform_(platform) {} + + string TestName() const { + return ::testing::UnitTest::GetInstance()->current_test_info()->name(); + } + + Client* ClientOrDie(::perftools::gputools::Platform* platform, + ClientType client_type) { + if (client_type == ClientType::kLocal) { + StatusOr result = + ClientLibrary::GetOrCreateLocalClient(platform); + TF_CHECK_OK(result.status()) + << "could not create LocalClient for testing"; + return result.ValueOrDie(); + } else if (client_type == ClientType::kCompileOnly) { + StatusOr result = + ClientLibrary::GetOrCreateCompileOnlyClient(platform); + TF_CHECK_OK(result.status()) + << "could not create CompileOnlyClient for testing"; + return result.ValueOrDie(); + } + LOG(FATAL) << "invalid client_type value"; + } + StatusOr> ComputeConstantLiteral( - ComputationDataHandle operand, ComputationBuilder* builder, - Layout* output_layout = nullptr) { + Client* client, const ComputationDataHandle& operand, + ComputationBuilder* builder, Layout* output_layout = nullptr) { TF_ASSIGN_OR_RETURN(auto remote_computed, builder->ComputeConstant(operand, output_layout)); - TF_ASSIGN_OR_RETURN(auto computed, client_->Transfer(*remote_computed)); + TF_ASSIGN_OR_RETURN(auto computed, client->Transfer(*remote_computed)); return std::move(computed); } template - StatusOr ComputeConstantScalar(ComputationDataHandle operand, + StatusOr ComputeConstantScalar(Client* client, + const ComputationDataHandle& operand, ComputationBuilder* builder) { - TF_ASSIGN_OR_RETURN(auto literal, ComputeConstantLiteral(operand, builder)); + TF_ASSIGN_OR_RETURN(auto literal, + ComputeConstantLiteral(client, operand, builder)); return LiteralUtil::Get(*literal, {}); } @@ -64,140 +96,162 @@ class ComputeConstantTest : public ClientLibraryTestBase { return result.ok() ? 
result.ValueOrDie() : false; } - template - void ExpectConstantComputedScalar(ComputationDataHandle operand, - Scalar expected, - ComputationBuilder* builder) { - Scalar computed = ComputeConstantScalar(operand, builder); - ASSERT_TRUE(computed.ok()) << computed.status(); - std::unique_ptr expected_literal = LiteralUtil::CreateR0(expected); - LiteralTestUtil::ExpectEqual(*expected_literal, *computed); - } + perftools::gputools::Platform* platform_; }; TEST_F(ComputeConstantTest, ScalarInt32Literal) { - ComputationBuilder b(client_, TestName()); - auto computation = b.ConstantR0(42); - EXPECT_TRUE(IsConstant(computation, &b)); + for (ClientType client_type : client_types) { + Client* client = ClientOrDie(platform_, client_type); + ComputationBuilder b(client, TestName()); + auto computation = b.ConstantR0(42); + EXPECT_TRUE(IsConstant(computation, &b)); - auto value = ComputeConstantScalar(computation, &b); - ASSERT_TRUE(value.ok()) << value.status(); - EXPECT_EQ(value.ValueOrDie(), 42); + auto value = ComputeConstantScalar(client, computation, &b); + ASSERT_TRUE(value.ok()) << value.status(); + EXPECT_EQ(value.ValueOrDie(), 42); + } } TEST_F(ComputeConstantTest, ScalarFloatAdd) { - ComputationBuilder b(client_, TestName()); - auto computation = - b.Add(b.ConstantR0(42.5f), b.ConstantR0(1.5f)); - EXPECT_TRUE(IsConstant(computation, &b)); + for (ClientType client_type : client_types) { + Client* client = ClientOrDie(platform_, client_type); + ComputationBuilder b(client, TestName()); + auto computation = + b.Add(b.ConstantR0(42.5f), b.ConstantR0(1.5f)); + EXPECT_TRUE(IsConstant(computation, &b)); - auto value = ComputeConstantScalar(computation, &b); - ASSERT_TRUE(value.ok()) << value.status(); - EXPECT_EQ(value.ValueOrDie(), 44.0f); + auto value = ComputeConstantScalar(client, computation, &b); + ASSERT_TRUE(value.ok()) << value.status(); + EXPECT_EQ(value.ValueOrDie(), 44.0f); + } } TEST_F(ComputeConstantTest, ScalarRng) { - ComputationBuilder b(client_, TestName()); - auto computation = - b.RngUniform(b.ConstantR0(1.1f), b.ConstantR0(2.1f), - ShapeUtil::MakeShape(F32, {})); - EXPECT_FALSE(IsConstant(computation, &b)); + for (ClientType client_type : client_types) { + Client* client = ClientOrDie(platform_, client_type); + ComputationBuilder b(client, TestName()); + auto computation = + b.RngUniform(b.ConstantR0(1.1f), b.ConstantR0(2.1f), + ShapeUtil::MakeShape(F32, {})); + EXPECT_FALSE(IsConstant(computation, &b)); - auto value = ComputeConstantScalar(computation, &b); - ASSERT_FALSE(value.ok()) - << "computing a RNG value should not be considered a constant"; + auto value = ComputeConstantScalar(client, computation, &b); + ASSERT_FALSE(value.ok()) + << "computing a RNG value should not be considered a constant"; + } } TEST_F(ComputeConstantTest, DirectParam) { - ComputationBuilder b(client_, TestName()); - auto computation = b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "param"); - EXPECT_FALSE(IsConstant(computation, &b)); + for (ClientType client_type : client_types) { + Client* client = ClientOrDie(platform_, client_type); + ComputationBuilder b(client, TestName()); + auto computation = b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "param"); + EXPECT_FALSE(IsConstant(computation, &b)); - auto value = ComputeConstantScalar(computation, &b); - EXPECT_TRUE(tensorflow::StringPiece(value.status().ToString()) - .contains("depends on parameter")) - << value.status(); + auto value = ComputeConstantScalar(client, computation, &b); + 
EXPECT_TRUE(tensorflow::StringPiece(value.status().ToString()) + .contains("depends on parameter")) + << value.status(); + } } TEST_F(ComputeConstantTest, IndirectParam) { - ComputationBuilder b(client_, TestName()); - auto computation = - b.Add(b.ConstantR0(1.0f), - b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "param")); - EXPECT_FALSE(IsConstant(computation, &b)); + for (ClientType client_type : client_types) { + Client* client = ClientOrDie(platform_, client_type); + ComputationBuilder b(client, TestName()); + auto computation = + b.Add(b.ConstantR0(1.0f), + b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "param")); + EXPECT_FALSE(IsConstant(computation, &b)); - auto value = ComputeConstantScalar(computation, &b); - EXPECT_TRUE(tensorflow::StringPiece(value.status().ToString()) - .contains("depends on parameter")) - << value.status(); + auto value = ComputeConstantScalar(client, computation, &b); + EXPECT_TRUE(tensorflow::StringPiece(value.status().ToString()) + .contains("depends on parameter")) + << value.status(); + } } // Test computation of an expression interspersed with param nodes but // the expression does not depend on the param nodes. TEST_F(ComputeConstantTest, UnrelatedParam) { - ComputationBuilder b(client_, TestName()); + for (ClientType client_type : client_types) { + Client* client = ClientOrDie(platform_, client_type); + ComputationBuilder b(client, TestName()); - auto param_a = b.Parameter(10, ShapeUtil::MakeShape(F32, {}), "param0"); - auto constant_4 = b.Add(b.ConstantR0(2.5f), b.ConstantR0(1.5f)); - auto not_constant_a = b.Add(constant_4, param_a); + auto param_a = b.Parameter(10, ShapeUtil::MakeShape(F32, {}), "param0"); + auto constant_4 = + b.Add(b.ConstantR0(2.5f), b.ConstantR0(1.5f)); + auto not_constant_a = b.Add(constant_4, param_a); - auto param_b = b.Parameter(1, ShapeUtil::MakeShape(F32, {}), "param1"); - auto constant_9 = b.Mul(b.ConstantR0(2.0f), b.ConstantR0(4.5f)); - auto not_constant_b = b.Add(param_b, constant_9); + auto param_b = b.Parameter(1, ShapeUtil::MakeShape(F32, {}), "param1"); + auto constant_9 = + b.Mul(b.ConstantR0(2.0f), b.ConstantR0(4.5f)); + auto not_constant_b = b.Add(param_b, constant_9); - auto constant_13 = b.Add(constant_4, constant_9); - b.Add(not_constant_b, b.Add(constant_13, not_constant_a)); + auto constant_13 = b.Add(constant_4, constant_9); + b.Add(not_constant_b, b.Add(constant_13, not_constant_a)); - EXPECT_TRUE(IsConstant(constant_13, &b)); + EXPECT_TRUE(IsConstant(constant_13, &b)); - auto value = ComputeConstantScalar(constant_13, &b); - ASSERT_TRUE(value.ok()) << value.status(); - EXPECT_EQ(value.ValueOrDie(), 13.0f); + auto value = ComputeConstantScalar(client, constant_13, &b); + ASSERT_TRUE(value.ok()) << value.status(); + EXPECT_EQ(value.ValueOrDie(), 13.0f); + } } TEST_F(ComputeConstantTest, NonScalarAdd) { - ComputationBuilder b(client_, TestName()); + for (ClientType client_type : client_types) { + Client* client = ClientOrDie(platform_, client_type); + ComputationBuilder b(client, TestName()); - auto computation = - b.Add(b.ConstantR1({1, 2}), b.ConstantR1({3, 4})); - EXPECT_TRUE(IsConstant(computation, &b)); + auto computation = + b.Add(b.ConstantR1({1, 2}), b.ConstantR1({3, 4})); + EXPECT_TRUE(IsConstant(computation, &b)); - auto computed = ComputeConstantLiteral(computation, &b); - ASSERT_TRUE(computed.ok()) << computed.status(); - std::unique_ptr expected_literal = - LiteralUtil::CreateR1({4, 6}); - LiteralTestUtil::ExpectEqual(*expected_literal, *computed.ValueOrDie()); + auto computed = 
ComputeConstantLiteral(client, computation, &b); + ASSERT_TRUE(computed.ok()) << computed.status(); + std::unique_ptr expected_literal = + LiteralUtil::CreateR1({4, 6}); + LiteralTestUtil::ExpectEqual(*expected_literal, *computed.ValueOrDie()); + } } TEST_F(ComputeConstantTest, IntegerDivide) { - ComputationBuilder b(client_, TestName()); - auto computation = b.Div(b.ConstantR0(15), b.ConstantR0(3)); - EXPECT_TRUE(IsConstant(computation, &b)); + for (ClientType client_type : client_types) { + Client* client = ClientOrDie(platform_, client_type); + ComputationBuilder b(client, TestName()); + auto computation = b.Div(b.ConstantR0(15), b.ConstantR0(3)); + EXPECT_TRUE(IsConstant(computation, &b)); - auto computed = ComputeConstantLiteral(computation, &b); - ASSERT_TRUE(computed.ok()) << computed.status(); - std::unique_ptr expected_literal = LiteralUtil::CreateR0(5); - LiteralTestUtil::ExpectEqual(*expected_literal, *computed.ValueOrDie()); + auto computed = ComputeConstantLiteral(client, computation, &b); + ASSERT_TRUE(computed.ok()) << computed.status(); + std::unique_ptr expected_literal = LiteralUtil::CreateR0(5); + LiteralTestUtil::ExpectEqual(*expected_literal, *computed.ValueOrDie()); + } } XLA_TEST_F(ComputeConstantTest, Layout) { - ComputationBuilder b(client_, TestName()); + for (ClientType client_type : client_types) { + Client* client = ClientOrDie(platform_, client_type); + ComputationBuilder b(client, TestName()); - std::vector> layouts = {{0, 1}, {1, 0}}; - for (const std::vector& layout : layouts) { - auto layout_proto = LayoutUtil::MakeLayout(layout); - auto computed = - ComputeConstantLiteral(b.Add(b.ConstantR2({{1, 2}, {3, 4}}), - b.ConstantR2({{10, 20}, {30, 40}})), - &b, &layout_proto); - ASSERT_TRUE(computed.ok()) << computed.status(); + std::vector> layouts = {{0, 1}, {1, 0}}; + for (const std::vector& layout : layouts) { + auto layout_proto = LayoutUtil::MakeLayout(layout); + auto computed = ComputeConstantLiteral( + client, + b.Add(b.ConstantR2({{1, 2}, {3, 4}}), + b.ConstantR2({{10, 20}, {30, 40}})), + &b, &layout_proto); + ASSERT_TRUE(computed.ok()) << computed.status(); - std::unique_ptr expected_literal = - test_utils::CreateR2LiteralWithLayout({{11, 22}, {33, 44}}, - layout); - LiteralTestUtil::AssertEqualShapesAndLayouts( - expected_literal->shape(), computed.ValueOrDie()->shape()); - LiteralTestUtil::ExpectEqual(*expected_literal, *computed.ValueOrDie()); + std::unique_ptr expected_literal = + test_utils::CreateR2LiteralWithLayout({{11, 22}, {33, 44}}, + layout); + LiteralTestUtil::AssertEqualShapesAndLayouts( + expected_literal->shape(), computed.ValueOrDie()->shape()); + LiteralTestUtil::ExpectEqual(*expected_literal, *computed.ValueOrDie()); + } } } @@ -207,25 +261,28 @@ XLA_TEST_F(ComputeConstantTest, Layout) { TEST_F(ComputeConstantTest, DISABLED_ON_CPU(ReuseComputedConstant)) { // Compute a trivial constant, then try to use the value in an Execute // call. This should fail because the constant resides on the CPU and the - // Execute call is executed on a different backend. - ComputationBuilder constant_b(client_, TestName()); + // Execute call is executed on a different backend. This test only makes + // sense with LocalClient, since CompileOnlyClient does not support + // execution. 
+ Client* client = ClientOrDie(platform_, ClientType::kLocal); + ComputationBuilder constant_b(client, TestName()); auto constant = constant_b.ConstantR0(42); auto handle = constant_b.ComputeConstant(constant).ConsumeValueOrDie(); - auto literal = client_->Transfer(*handle).ConsumeValueOrDie(); + auto literal = client->Transfer(*handle).ConsumeValueOrDie(); LiteralTestUtil::ExpectR0Equal(42, *literal); // Build trivial computation which takes one parameter. - ComputationBuilder b(client_, TestName()); + ComputationBuilder b(client, TestName()); b.Neg(b.Parameter(0, ShapeUtil::MakeShape(S32, {}), "param0")); auto computation = b.Build().ConsumeValueOrDie(); // Try to use value from ComputeConstant in Execute. - auto execute_status = client_->Execute(computation, {handle.get()}); + auto execute_status = client->Execute(computation, {handle.get()}); EXPECT_FALSE(execute_status.ok()); - EXPECT_MATCH( + EXPECT_THAT( execute_status.status().error_message(), - testing::ContainsRegex("argument 0 is on device Host:0 but computation " - "will be executed on device")); + ::testing::ContainsRegex("argument 0 is on device Host:0 but computation " + "will be executed on device")); } } // namespace @@ -233,6 +290,7 @@ TEST_F(ComputeConstantTest, DISABLED_ON_CPU(ReuseComputedConstant)) { int main(int argc, char** argv) { std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/xla/tests/concat_test.cc b/tensorflow/compiler/xla/tests/concat_test.cc index 9a48b19b96a..a7034930bc9 100644 --- a/tensorflow/compiler/xla/tests/concat_test.cc +++ b/tensorflow/compiler/xla/tests/concat_test.cc @@ -22,8 +22,10 @@ limitations under the License. #include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/reference_util.h" #include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" @@ -34,6 +36,7 @@ namespace xla { namespace { using ConcatTest = ClientLibraryTestBase; +using ::testing::HasSubstr; // Concatenate expects at least one argument. XLA_TEST_F(ConcatTest, Concat_Nothing) { @@ -41,9 +44,8 @@ XLA_TEST_F(ConcatTest, Concat_Nothing) { auto concatenated = builder.ConcatInDim({}, 0); StatusOr computation_status = builder.Build(); ASSERT_FALSE(computation_status.ok()); - EXPECT_MATCH( - computation_status.status().ToString(), - testing::ContainsRegex("Concatenate expects at least one argument")); + EXPECT_THAT(computation_status.status().ToString(), + HasSubstr("Concatenate expects at least one argument")); } // Concatenate with one argument works. 
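The matcher changes running through these files replace XLA's bespoke `EXPECT_MATCH`/`ASSERT_MATCH` helpers with standard googlemock assertions pulled in via `tensorflow/compiler/xla/test.h`. A self-contained sketch of the new style (the test name and message below are invented for illustration):

```c++
#include <string>

#include "tensorflow/compiler/xla/test.h"

using ::testing::ContainsRegex;
using ::testing::HasSubstr;

TEST(MatcherStyleSketch, StatusMessage) {
  std::string message = "dimension to concatenate along out of bounds: 0";
  // A fixed substring no longer needs to be phrased as a regex:
  EXPECT_THAT(message, HasSubstr("out of bounds"));
  // ContainsRegex remains available when part of the message varies:
  EXPECT_THAT(message, ContainsRegex("out of bounds: [0-9]+"));
}
```

Exact substrings are preferred where possible, since regex metacharacters in error text then no longer need escaping.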
@@ -56,6 +58,15 @@ XLA_TEST_F(ConcatTest, Concat_R1_With_Nothing) { ComputeAndCompareR1(&builder, expected, {}, ErrorSpec(0.0001)); } +XLA_TEST_F(ConcatTest, Concat_R1_L0_With_Nothing) { + ComputationBuilder builder(client_, TestName()); + auto a = builder.ConstantR1({}); + auto concatenated = builder.ConcatInDim({a}, 0); + + std::vector expected = {}; + ComputeAndCompareR1(&builder, expected, {}, ErrorSpec(0.0001)); +} + // Show that we can't concatenate R0 with R0 because we can't name the dimension // to concatenate on. XLA_TEST_F(ConcatTest, CannotConcatR0WithR0) { @@ -65,9 +76,8 @@ XLA_TEST_F(ConcatTest, CannotConcatR0WithR0) { auto concatenated = builder.ConcatInDim({a, b}, 0); StatusOr computation_status = builder.Build(); ASSERT_FALSE(computation_status.ok()); - EXPECT_MATCH(computation_status.status().ToString(), - testing::ContainsRegex( - "dimension to concatenate along out of bounds: 0")); + EXPECT_THAT(computation_status.status().ToString(), + HasSubstr("dimension to concatenate along out of bounds: 0")); } XLA_TEST_F(ConcatTest, Concat_R1_L0_With_R1_L0) { @@ -404,10 +414,9 @@ XLA_TEST_F(ConcatTest, CannotConcatOpaques) { auto concatenated = builder.ConcatInDim({x, y}, 0); StatusOr computation_status = builder.Build(); ASSERT_FALSE(computation_status.ok()); - EXPECT_MATCH( + EXPECT_THAT( computation_status.status().ToString(), - testing::ContainsRegex( - "Expected non-opaque argument for operand of concatenation")); + HasSubstr("Expected non-opaque argument for operand of concatenation")); } XLA_TEST_F(ConcatTest, ConcatSeveralBoxedPredicates) { @@ -434,6 +443,39 @@ XLA_TEST_F(ConcatTest, ConcatSeveralR1S32s) { ComputeAndCompareR1(&builder, expected, {}); } +XLA_TEST_F(ConcatTest, ConcatR3WeirdDims) { + ComputationBuilder builder(client_, TestName()); + + Array3D arr0(9, 17, 1); + arr0.Fill(1); + + Array3D arr1(9, 17, 256); + arr1.Fill(2); + + Array3D expected(9, 17, arr0.n3() + arr1.n3()); + for (int64 i = 0; i < expected.n1(); ++i) { + for (int64 j = 0; j < expected.n2(); ++j) { + int64 kk = 0; + for (const Array3D& arr : {arr0, arr1}) { + for (int64 k = 0; k < arr.n3(); ++k, ++kk) { + expected(i, j, kk) = arr(i, j, k); + } + } + } + } + + ComputationDataHandle h0; + auto p0 = CreateR3Parameter(arr0, /*parameter_number=*/0, "p0", + &builder, &h0); + ComputationDataHandle h1; + auto p1 = CreateR3Parameter(arr1, /*parameter_number=*/1, "p1", + &builder, &h1); + + auto concatenated = builder.ConcatInDim({h0, h1}, 2); + + ComputeAndCompareR3(&builder, expected, {p0.get(), p1.get()}); +} + // Describes a binary rank-2 concatenation test. struct R2BinarySpec { int64 lhs_dim0; @@ -494,6 +536,63 @@ XLA_TEST_F(ConcatTest, ConcatOperandsOfSameOperand) { {x_data.get(), y_data.get()}, ErrorSpec(1e-4)); } +// Test that the HLO optimization to replace a concat of a broadcasted scalar +// produces the correct result in rank 1.
+XLA_TEST_F(ConcatTest, ConcatBroadcastArgument) { + auto f32_scalar = ShapeUtil::MakeShape(xla::F32, {}); + auto x_literal = LiteralUtil::CreateR1({2.0f, 3.0f, 5.0f, 6.0f}); + auto y_literal = LiteralUtil::CreateR0(1.5f); + auto z_literal = LiteralUtil::CreateR0(5.5f); + auto x_data = client_->TransferToServer(*x_literal).ConsumeValueOrDie(); + auto y_data = client_->TransferToServer(*y_literal).ConsumeValueOrDie(); + auto z_data = client_->TransferToServer(*z_literal).ConsumeValueOrDie(); + + ComputationBuilder builder(client_, TestName()); + auto x = builder.Parameter(0, x_literal->shape(), "x"); + auto y = builder.Parameter(1, f32_scalar, "y"); + auto z = builder.Parameter(2, f32_scalar, "z"); + auto bcast = builder.Broadcast(y, {5}); + auto bcast2 = builder.Broadcast(z, {3}); + auto concat = builder.ConcatInDim({bcast, x}, /*dimension=*/0); + builder.ConcatInDim({concat, bcast2}, /*dimension=*/0); + + ComputeAndCompareR1( + &builder, + {1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 2.0f, 3.0f, 5.0f, 6.0f, 5.5f, 5.5f, 5.5f}, + {x_data.get(), y_data.get(), z_data.get()}, ErrorSpec(1e-4)); +} + +// Test that the HLO optimization to replace a concat of a broadcasted scalar +// produces the correct result in rank 3 with both high and low padding in +// different dimensions. +XLA_TEST_F(ConcatTest, ConcatBroadcastArgumentR3) { + auto f32_scalar = ShapeUtil::MakeShape(xla::F32, {}); + Array3D x3d(3, 5, 7, 3.14f); + auto x_literal = LiteralUtil::CreateR3FromArray3D(x3d); + auto y_literal = LiteralUtil::CreateR0(1.5f); + auto z_literal = LiteralUtil::CreateR0(5.5f); + auto x_data = client_->TransferToServer(*x_literal).ConsumeValueOrDie(); + auto y_data = client_->TransferToServer(*y_literal).ConsumeValueOrDie(); + auto z_data = client_->TransferToServer(*z_literal).ConsumeValueOrDie(); + + ComputationBuilder builder(client_, TestName()); + auto x = builder.Parameter(0, x_literal->shape(), "x"); + auto y = builder.Parameter(1, f32_scalar, "y"); + auto z = builder.Parameter(2, f32_scalar, "z"); + auto y_bcast = builder.Broadcast(y, {1, 5, 7}); + auto z_bcast = builder.Broadcast(z, {4, 1, 7}); + auto concat = builder.ConcatInDim({y_bcast, x}, /*dimension=*/0); + builder.ConcatInDim({concat, z_bcast}, /*dimension=*/1); + Array3D y_bcast3d(1, 5, 7, 1.5f); + Array3D z_bcast3d(4, 1, 7, 5.5f); + auto concat0 = ReferenceUtil::Concat3D(y_bcast3d, x3d, 0); + auto concat1 = ReferenceUtil::Concat3D(*concat0, z_bcast3d, 1); + + ComputeAndCompareR3(&builder, *concat1, + {x_data.get(), y_data.get(), z_data.get()}, + ErrorSpec(1e-4)); +} + INSTANTIATE_TEST_CASE_P(ConcatR2BinaryTestInstantiation, ConcatR2BinaryTest, ::testing::Values(R2BinarySpec{1, 1, 1, 1, 0}, R2BinarySpec{1, 1, 1, 1, 1}, @@ -507,6 +606,7 @@ INSTANTIATE_TEST_CASE_P(ConcatR2BinaryTestInstantiation, ConcatR2BinaryTest, int main(int argc, char** argv) { std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/xla/tests/constants_test.cc b/tensorflow/compiler/xla/tests/constants_test.cc index 58d52ac1168..1c065de8ba7 100644 --- a/tensorflow/compiler/xla/tests/constants_test.cc +++ b/tensorflow/compiler/xla/tests/constants_test.cc @@ -24,6 +24,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" @@ -177,6 +178,7 @@ TEST_F(ConstantsTest, DISABLED_TupleConstant) { int main(int argc, char** argv) { std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/xla/tests/convert_test.cc b/tensorflow/compiler/xla/tests/convert_test.cc index 9f8c3a9aeb7..6d379797250 100644 --- a/tensorflow/compiler/xla/tests/convert_test.cc +++ b/tensorflow/compiler/xla/tests/convert_test.cc @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" @@ -36,8 +37,10 @@ namespace { class ConvertTest : public ClientLibraryTestBase { public: explicit ConvertTest(perftools::gputools::Platform* platform = nullptr) - : ClientLibraryTestBase(platform, - /*disabled_pass_names=*/{"algsimp", "inline"}) {} + : ClientLibraryTestBase(platform) { + mutable_debug_options()->add_xla_disable_hlo_passes("algsimp"); + mutable_debug_options()->add_xla_disable_hlo_passes("inline"); + } }; TEST_F(ConvertTest, ConvertR1S32ToR1S32) { @@ -195,6 +198,7 @@ TEST_F(ConvertTest, ConvertReshape) { int main(int argc, char** argv) { std::vector flag_list; xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); if (!parse_result) { diff --git a/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc b/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc index 9f38dc4b365..0b09416a747 100644 --- a/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc +++ b/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc @@ -22,15 +22,15 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/padding.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/reference_util.h" #include "tensorflow/compiler/xla/statusor.h" -#include "tensorflow/compiler/xla/test_helpers.h" +#include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" #include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" namespace xla { @@ -43,8 +43,8 @@ TEST_F(ConvolutionDimensionNumbersTest, InvalidInputDimensionNumbers) { auto dimension_numbers_status = ComputationBuilder::CreateConvDimensionNumbers(0, 2, 2, 3, 0, 1, 2, 3); ASSERT_FALSE(dimension_numbers_status.ok()); - ASSERT_MATCH(dimension_numbers_status.status().error_message(), - testing::ContainsRegex("input are not unique")); + ASSERT_THAT(dimension_numbers_status.status().error_message(), + ::testing::HasSubstr("input are not unique")); } // Tests the convolution operation with invalid weight dimension numbers. @@ -52,8 +52,8 @@ TEST_F(ConvolutionDimensionNumbersTest, InvalidWeightDimensionNumbers) { auto dimension_numbers_status = ComputationBuilder::CreateConvDimensionNumbers(0, 1, 2, 3, 2, 3, 2, 3); ASSERT_FALSE(dimension_numbers_status.ok()); - ASSERT_MATCH(dimension_numbers_status.status().error_message(), - testing::ContainsRegex("weight are not unique")); + ASSERT_THAT(dimension_numbers_status.status().error_message(), + ::testing::HasSubstr("weight are not unique")); } XLA_TEST_F(ConvolutionDimensionNumbersTest, @@ -101,6 +101,7 @@ XLA_TEST_F(ConvolutionDimensionNumbersTest, int main(int argc, char** argv) { std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/xla/tests/convolution_test.cc b/tensorflow/compiler/xla/tests/convolution_test.cc index ffbda89b948..ec19469fa66 100644 --- a/tensorflow/compiler/xla/tests/convolution_test.cc +++ b/tensorflow/compiler/xla/tests/convolution_test.cc @@ -26,6 +26,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/padding.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/reference_util.h" @@ -48,7 +49,7 @@ class ConvolutionTest : public ClientLibraryTestBase { #if XLA_TEST_BACKEND_GPU // XLA:GPU sometimes uses FFT convolution which isn't as precise as spatial // convolution. So relax the absolute error threshold. - ErrorSpec error_spec_ = ErrorSpec(1e-3); + ErrorSpec error_spec_ = ErrorSpec(1e-2); #else ErrorSpec error_spec_ = ErrorSpec(1e-4); #endif @@ -256,8 +257,7 @@ TEST_F(ConvolutionTest, Convolve_1x1x4x4_1x1x3x3_Same) { error_spec_); } -// TODO(b/32873825): implement 1D convolution on GPU. 
-XLA_TEST_F(ConvolutionTest, DISABLED_ON_GPU(Convolve1D_1x2x5_1x2x2_Valid)) { +XLA_TEST_F(ConvolutionTest, Convolve1D_1x2x5_1x2x2_Valid) { ComputationBuilder builder(client_, TestName()); { Shape input_shape = ShapeUtil::MakeShape(F32, {1, 2, 5}); @@ -284,9 +284,7 @@ XLA_TEST_F(ConvolutionTest, DISABLED_ON_GPU(Convolve1D_1x2x5_1x2x2_Valid)) { error_spec_); } -// TODO(b/32873825): implement 3D convolution on GPU. -XLA_TEST_F(ConvolutionTest, - DISABLED_ON_GPU(Convolve3D_1x4x2x3x3_2x2x2x3x3_Valid)) { +XLA_TEST_F(ConvolutionTest, Convolve3D_1x4x2x3x3_2x2x2x3x3_Valid) { ComputationBuilder builder(client_, TestName()); std::vector input_dims = {1, 4, 2, 3, 3}; std::vector filter_dims = {2, 2, 2, 3, 3}; @@ -345,6 +343,7 @@ XLA_TEST_F(ConvolutionTest, int main(int argc, char** argv) { std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/xla/tests/convolution_variants_test.cc b/tensorflow/compiler/xla/tests/convolution_variants_test.cc index b599f9b95bc..b5afc2498da 100644 --- a/tensorflow/compiler/xla/tests/convolution_variants_test.cc +++ b/tensorflow/compiler/xla/tests/convolution_variants_test.cc @@ -23,11 +23,14 @@ limitations under the License. #include #include +#include "tensorflow/compiler/xla/array3d.h" #include "tensorflow/compiler/xla/array4d.h" #include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/padding.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" +#include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/reference_util.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" @@ -1273,11 +1276,100 @@ TEST_F(ConvolutionVariantsTest, BackwardFilterEvenPadding) { ComputeAndCompareR4(&builder, {{{{13, 24, 130}}}}, {}, error_spec_); } +TEST_F(ConvolutionVariantsTest, BackwardInputEvenPadding1D) { + ComputationBuilder builder(client_, TestName()); + + auto gradients = builder.ConstantR3FromArray3D( + Array3D(1, 1, 1, /*value=*/1)); + auto weights = + builder.ConstantR3FromArray3D(Array3D({{{1, 10, 100}}})); + auto mirrored_weights = builder.Rev(weights, {2}); + builder.ConvWithGeneralPadding(gradients, mirrored_weights, + /*window_strides=*/{1}, + /*padding=*/{{1, 1}}); + ComputeAndCompareR3(&builder, {{{10}}}, {}, error_spec_); +} + +TEST_F(ConvolutionVariantsTest, BackwardFilterEvenPadding1D) { + ComputationBuilder builder(client_, TestName()); + + auto activations = + builder.ConstantR3FromArray3D(Array3D({{{1, 2, 3, 4}}})); + auto gradients = + builder.ConstantR3FromArray3D(Array3D({{{100, 10, 1}}})); + auto forward_conv = builder.ConvGeneralDilated( + activations, gradients, + /*window_strides=*/{1}, + /*padding=*/{{2, 1}}, + /*lhs_dilation=*/{}, /*rhs_dilation=*/{2}, + ComputationBuilder::CreateDefaultConvDimensionNumbers( + /*num_spatial_dims=*/1)); + builder.Transpose(forward_conv, {0, 1, 2}); + + ComputeAndCompareR3(&builder, {{{13, 24, 130}}}, {}, error_spec_); +} + +TEST_F(ConvolutionVariantsTest, BackwardInputEvenPadding3D) { + ComputationBuilder builder(client_, TestName()); + + auto gradients_flat = 
LiteralUtil::CreateR1({1}); + auto gradients_literal = + LiteralUtil::Reshape(*gradients_flat, {1, 1, 1, 1, 1}) + .ConsumeValueOrDie(); + auto gradients = builder.ConstantLiteral(*gradients_literal); + + auto weights_flat = LiteralUtil::CreateR1({1, 10, 100}); + auto weights_literal = + LiteralUtil::Reshape(*weights_flat, {1, 1, 1, 1, 3}).ConsumeValueOrDie(); + auto weights = builder.ConstantLiteral(*weights_literal); + + auto expected_flat = LiteralUtil::CreateR1({10}); + auto expected_literal = + LiteralUtil::Reshape(*expected_flat, {1, 1, 1, 1, 1}).ConsumeValueOrDie(); + + auto mirrored_weights = builder.Rev(weights, {2, 3, 4}); + builder.ConvWithGeneralPadding(gradients, mirrored_weights, + /*window_strides=*/{1, 1, 1}, + /*padding=*/{{0, 0}, {0, 0}, {1, 1}}); + ComputeAndCompareLiteral(&builder, *expected_literal, {}, error_spec_); +} + +TEST_F(ConvolutionVariantsTest, BackwardFilterEvenPadding3D) { + ComputationBuilder builder(client_, TestName()); + + auto activations_flat = LiteralUtil::CreateR1({1, 2, 3, 4}); + auto activations_literal = + LiteralUtil::Reshape(*activations_flat, {1, 1, 1, 1, 4}) + .ConsumeValueOrDie(); + auto activations = builder.ConstantLiteral(*activations_literal); + + auto gradients_flat = LiteralUtil::CreateR1({100, 10, 1}); + auto gradients_literal = + LiteralUtil::Reshape(*gradients_flat, {1, 1, 1, 1, 3}) + .ConsumeValueOrDie(); + auto gradients = builder.ConstantLiteral(*gradients_literal); + + auto expected_flat = LiteralUtil::CreateR1({13, 24, 130}); + auto expected_literal = + LiteralUtil::Reshape(*expected_flat, {1, 1, 1, 1, 3}).ConsumeValueOrDie(); + + auto forward_conv = builder.ConvGeneralDilated( + activations, gradients, + /*window_strides=*/{1, 1, 1}, + /*padding=*/{{0, 0}, {0, 0}, {2, 1}}, + /*lhs_dilation=*/{}, /*rhs_dilation=*/{1, 1, 2}, + ComputationBuilder::CreateDefaultConvDimensionNumbers( + /*num_spatial_dims=*/3)); + builder.Transpose(forward_conv, {0, 1, 2, 3, 4}); + ComputeAndCompareLiteral(&builder, *expected_literal, {}, error_spec_); +} + } // namespace } // namespace xla int main(int argc, char** argv) { std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/xla/tests/copy_test.cc b/tensorflow/compiler/xla/tests/copy_test.cc index 29e29505333..4c2413d0fe4 100644 --- a/tensorflow/compiler/xla/tests/copy_test.cc +++ b/tensorflow/compiler/xla/tests/copy_test.cc @@ -18,12 +18,14 @@ limitations under the License. 
#include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" @@ -44,11 +46,10 @@ class CopyOpTest : public HloTestBase { builder.AddInstruction(HloInstruction::CreateUnary( constant->shape(), HloOpcode::kCopy, constant)); auto computation = builder.Build(); - auto hlo_module = MakeUnique("test_module"); - hlo_module->AddEntryComputation(std::move(computation)); + auto module = CreateNewModule(); + module->AddEntryComputation(std::move(computation)); - std::unique_ptr result = - ExecuteAndTransfer(std::move(hlo_module), {}); + std::unique_ptr result = ExecuteAndTransfer(std::move(module), {}); LiteralTestUtil::ExpectEqual(literal, *result); } @@ -100,11 +101,11 @@ TEST_F(CopyOpTest, CopyParameterScalar) { auto computation = builder.Build(); - auto hlo_module = MakeUnique("test_module"); - hlo_module->AddEntryComputation(std::move(computation)); + auto module = CreateNewModule(); + module->AddEntryComputation(std::move(computation)); std::unique_ptr result = - ExecuteAndTransfer(std::move(hlo_module), {constant_device_base}); + ExecuteAndTransfer(std::move(module), {constant_device_base}); LiteralTestUtil::ExpectR0Near(42.0f, *result, error_spec_); } @@ -122,10 +123,9 @@ TEST_F(CopyOpTest, CopyConstantR2Twice) { auto computation = builder.Build(); - auto hlo_module = MakeUnique("test_module"); - hlo_module->AddEntryComputation(std::move(computation)); - std::unique_ptr result = - ExecuteAndTransfer(std::move(hlo_module), {}); + auto module = CreateNewModule(); + module->AddEntryComputation(std::move(computation)); + std::unique_ptr result = ExecuteAndTransfer(std::move(module), {}); LiteralTestUtil::ExpectR2Near({{1.0, 2.0}, {3.0, 4.0}}, *result, error_spec_); } @@ -148,10 +148,9 @@ TEST_F(CopyOpTest, CopyConstantR2DifferentLayouts) { std::unique_ptr computation = builder.Build(); - auto hlo_module = MakeUnique("test_module"); - hlo_module->AddEntryComputation(std::move(computation)); - std::unique_ptr result = - ExecuteAndTransfer(std::move(hlo_module), {}); + auto module = CreateNewModule(); + module->AddEntryComputation(std::move(computation)); + std::unique_ptr result = ExecuteAndTransfer(std::move(module), {}); // The result of the computation has the default layout, which is the inverse // of the layout of the source literal. 
@@ -181,15 +180,10 @@ void CopyOpTest::TestCopyConstantLayout021(size_t n1, size_t n2, size_t n3) { std::unique_ptr computation = builder.Build(); - auto hlo_module = MakeUnique("test_module"); - auto config = MakeUnique(computation->ComputeProgramShape()); - *config->mutable_entry_computation_layout()->mutable_result_layout() = - ShapeLayout(ShapeUtil::MakeShapeWithLayout( - constant->shape().element_type(), - AsInt64Slice(constant->shape().dimensions()), {1, 2, 0})); - hlo_module->AddEntryComputation(std::move(computation)); - std::unique_ptr result = - ExecuteAndTransfer(std::move(hlo_module), std::move(config), {}); + auto module = CreateNewModule(); + module->AddEntryComputation(std::move(computation)); + ForceResultLayout(module.get(), LayoutUtil::MakeLayout({1, 2, 0})); + std::unique_ptr result = ExecuteAndTransfer(std::move(module), {}); LiteralTestUtil::ExpectR3EqualArray3D(a, *result); } @@ -220,18 +214,10 @@ void CopyOpTest::TestCopyConstantLayoutR4( std::unique_ptr computation = builder.Build(); - auto hlo_module = MakeUnique("test_module"); - auto config = MakeUnique(computation->ComputeProgramShape()); - *config->mutable_entry_computation_layout()->mutable_result_layout() = - ShapeLayout(ShapeUtil::MakeShapeWithLayout( - constant->shape().element_type(), - AsInt64Slice(constant->shape().dimensions()), ({ - std::vector p(permutation.rbegin(), permutation.rend()); - p; - }))); - hlo_module->AddEntryComputation(std::move(computation)); - std::unique_ptr result = - ExecuteAndTransfer(std::move(hlo_module), std::move(config), {}); + auto module = CreateNewModule(); + module->AddEntryComputation(std::move(computation)); + ForceResultLayout(module.get(), LayoutUtil::MakeLayout(permutation)); + std::unique_ptr result = ExecuteAndTransfer(std::move(module), {}); LiteralTestUtil::ExpectR4EqualArray4D(a, *result); } @@ -256,12 +242,29 @@ XLA_TEST_F(CopyOpTest, CopyConstantR4Layout0312_MultipleTilesPerLayer) { TestCopyConstantLayoutR4(2, 14, 5, 35, {0, 3, 1, 2}); } +using CopyOpClientTest = ClientLibraryTestBase; + +XLA_TEST_F(CopyOpClientTest, Copy0x0) { + Shape in_shape = ShapeUtil::MakeShapeWithLayout(F32, {0, 0}, {0, 1}); + Shape out_shape = ShapeUtil::MakeShapeWithLayout(F32, {0, 0}, {1, 0}); + auto empty = LiteralUtil::CreateFromShape(in_shape); + + ComputationBuilder builder(client_, TestName()); + auto param0 = builder.Parameter(0, in_shape, "input"); + auto input_data = client_->TransferToServer(*empty).ConsumeValueOrDie(); + + auto actual = ExecuteAndTransfer(&builder, {input_data.get()}, &out_shape) + .ConsumeValueOrDie(); + LiteralTestUtil::ExpectEqual(*empty, *actual); +} + } // namespace } // namespace xla int main(int argc, char** argv) { std::vector flag_list; xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); if (!parse_result) { diff --git a/tensorflow/compiler/xla/tests/custom_call_test.cc b/tensorflow/compiler/xla/tests/custom_call_test.cc index dc54c9defec..32232acf6e3 100644 --- a/tensorflow/compiler/xla/tests/custom_call_test.cc +++ b/tensorflow/compiler/xla/tests/custom_call_test.cc @@ -17,6 +17,7 @@ limitations under the License. 
#include #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" @@ -29,23 +30,22 @@ limitations under the License. #include "tensorflow/compiler/xla/tests/test_macros.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/dynamic_annotations.h" +#include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/test.h" -extern "C" void __attribute__((visibility("default"))) -R0F32Add2(float* out, float** in) { + +extern "C" void TF_EXPORT R0F32Add2(float* out, float** in) { TF_ANNOTATE_MEMORY_IS_INITIALIZED(in, sizeof(float*)); *out = **in + 2.0f; } -extern "C" void __attribute__((visibility("default"))) -R2F32ReduceSum(float* out, float** in) { +extern "C" void TF_EXPORT R2F32ReduceSum(float* out, float** in) { TF_ANNOTATE_MEMORY_IS_INITIALIZED(in, sizeof(float) * 4); float* array = in[0]; *out = array[0] + array[1] + array[2] + array[3]; } -extern "C" void __attribute__((visibility("default"))) -Add1ToValues(float* out, float** in) { +extern "C" void TF_EXPORT Add1ToValues(float* out, float** in) { TF_ANNOTATE_MEMORY_IS_INITIALIZED(in, sizeof(float) * 4); float* array = in[0]; out[0] = array[0] + 1; @@ -64,7 +64,7 @@ class CustomCallTest : public HloTestBase { }; XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(CustomCallR0F32Add2)) { - auto hlo_module = MakeUnique("test_module"); + auto module = CreateNewModule(); auto builder = HloComputation::Builder(TestName()); auto constant = builder.AddInstruction( @@ -72,15 +72,14 @@ XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(CustomCallR0F32Add2)) { builder.AddInstruction( HloInstruction::CreateCustomCall(r0f32_, {constant}, "R0F32Add2")); - hlo_module->AddEntryComputation(builder.Build()); + module->AddEntryComputation(builder.Build()); - std::unique_ptr result = - ExecuteAndTransfer(std::move(hlo_module), {}); + std::unique_ptr result = ExecuteAndTransfer(std::move(module), {}); LiteralTestUtil::ExpectR0Near(44.0f, *result, error_spec_); } XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(CustomCallR2F32Reduce)) { - auto hlo_module = MakeUnique("test_module"); + auto module = CreateNewModule(); auto builder = HloComputation::Builder(TestName()); Array2D array(2, 2); @@ -94,16 +93,15 @@ XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(CustomCallR2F32Reduce)) { builder.AddInstruction( HloInstruction::CreateCustomCall(r0f32_, {constant}, "R2F32ReduceSum")); - hlo_module->AddEntryComputation(builder.Build()); + module->AddEntryComputation(builder.Build()); - std::unique_ptr result = - ExecuteAndTransfer(std::move(hlo_module), {}); + std::unique_ptr result = ExecuteAndTransfer(std::move(module), {}); LiteralTestUtil::ExpectR0Near(10.0f, *result, error_spec_); } XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(CustomCall_UsedInOtherComputations)) { - auto hlo_module = MakeUnique("test_module"); + auto module = CreateNewModule(); auto b = HloComputation::Builder(TestName()); auto input = b.AddInstruction( @@ -119,10 +117,9 @@ XLA_TEST_F(CustomCallTest, HloInstruction::CreateConcatenate(ShapeUtil::MakeShape(F32, {2, 2, 2}), {incremented, incremented_again}, 0)); - hlo_module->AddEntryComputation(b.Build()); + module->AddEntryComputation(b.Build()); - std::unique_ptr result = - ExecuteAndTransfer(std::move(hlo_module), {}); + std::unique_ptr result = ExecuteAndTransfer(std::move(module), {}); 
LiteralTestUtil::ExpectR3EqualArray3D( Array3D{{{2, 3}, {4, 5}}, {{3, 4}, {5, 6}}}, *result); } @@ -133,6 +130,7 @@ XLA_TEST_F(CustomCallTest, int main(int argc, char** argv) { std::vector flag_list; xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); if (!parse_result) { diff --git a/tensorflow/compiler/xla/tests/deallocation_test.cc b/tensorflow/compiler/xla/tests/deallocation_test.cc index 528efd2942b..074753bf6f8 100644 --- a/tensorflow/compiler/xla/tests/deallocation_test.cc +++ b/tensorflow/compiler/xla/tests/deallocation_test.cc @@ -20,16 +20,19 @@ limitations under the License. #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/test_macros.h" #include "tensorflow/core/lib/gtl/array_slice.h" -#include "tensorflow/core/platform/test.h" namespace xla { namespace { +using ::testing::HasSubstr; + class DeallocationTest : public ClientLibraryTestBase { protected: // Build and execute the given computation then verify the results can be @@ -50,7 +53,7 @@ TEST_F(DeallocationTest, DeallocateScalar) { builder.ConstantR0(42.0); auto global_data = ExecuteAndCheckTransfer(&builder, {}); - // A result can be transfered an arbitrary number of times. Add an extra + // A result can be transferred an arbitrary number of times. Add an extra // transfer here so we're not just testing that a second call to Transfer // fails. 
ASSERT_IS_OK(client_->Transfer(*global_data).status()); @@ -59,8 +62,8 @@ TEST_F(DeallocationTest, DeallocateScalar) { auto transfer_status = client_->Transfer(*global_data); ASSERT_FALSE(transfer_status.ok()); - ASSERT_MATCH(transfer_status.status().error_message(), - testing::HasSubstr("was previously deallocated")); + ASSERT_THAT(transfer_status.status().error_message(), + HasSubstr("was previously deallocated")); } TEST_F(DeallocationTest, DeallocateVector) { @@ -72,8 +75,8 @@ TEST_F(DeallocationTest, DeallocateVector) { auto transfer_status = client_->Transfer(*global_data); ASSERT_FALSE(transfer_status.ok()); - ASSERT_MATCH(transfer_status.status().error_message(), - testing::HasSubstr("was previously deallocated")); + ASSERT_THAT(transfer_status.status().error_message(), + HasSubstr("was previously deallocated")); } TEST_F(DeallocationTest, DeallocateEmptyVector) { @@ -85,8 +88,8 @@ TEST_F(DeallocationTest, DeallocateEmptyVector) { auto transfer_status = client_->Transfer(*global_data); ASSERT_FALSE(transfer_status.ok()); - ASSERT_MATCH(transfer_status.status().error_message(), - testing::HasSubstr("was previously deallocated")); + ASSERT_THAT(transfer_status.status().error_message(), + HasSubstr("was previously deallocated")); } XLA_TEST_F(DeallocationTest, DeallocateTuple) { @@ -99,8 +102,8 @@ XLA_TEST_F(DeallocationTest, DeallocateTuple) { auto transfer_status = client_->Transfer(*global_data); ASSERT_FALSE(transfer_status.ok()); - ASSERT_MATCH(transfer_status.status().error_message(), - testing::HasSubstr("was previously deallocated")); + ASSERT_THAT(transfer_status.status().error_message(), + HasSubstr("was previously deallocated")); } XLA_TEST_F(DeallocationTest, DeallocateTupleWithRepeatedElements) { @@ -114,8 +117,8 @@ XLA_TEST_F(DeallocationTest, DeallocateTupleWithRepeatedElements) { auto transfer_status = client_->Transfer(*global_data); ASSERT_FALSE(transfer_status.ok()); - ASSERT_MATCH(transfer_status.status().error_message(), - testing::HasSubstr("was previously deallocated")); + ASSERT_THAT(transfer_status.status().error_message(), + HasSubstr("was previously deallocated")); } XLA_TEST_F(DeallocationTest, DeallocateNestedTuple) { @@ -130,8 +133,8 @@ XLA_TEST_F(DeallocationTest, DeallocateNestedTuple) { auto transfer_status = client_->Transfer(*global_data); ASSERT_FALSE(transfer_status.ok()); - ASSERT_MATCH(transfer_status.status().error_message(), - testing::HasSubstr("was previously deallocated")); + ASSERT_THAT(transfer_status.status().error_message(), + HasSubstr("was previously deallocated")); } } // namespace @@ -139,6 +142,7 @@ XLA_TEST_F(DeallocationTest, DeallocateNestedTuple) { int main(int argc, char** argv) { std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/xla/tests/deconstruct_tuple_test.cc b/tensorflow/compiler/xla/tests/deconstruct_tuple_test.cc index 57a7c61b141..fcddffc1e13 100644 --- a/tensorflow/compiler/xla/tests/deconstruct_tuple_test.cc +++ b/tensorflow/compiler/xla/tests/deconstruct_tuple_test.cc @@ -21,9 +21,11 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/test_macros.h" @@ -34,6 +36,9 @@ limitations under the License. namespace xla { namespace { +using ::testing::ContainsRegex; +using ::testing::HasSubstr; + class DeconstructTupleTest : public ClientLibraryTestBase { protected: // Build and execute the given computation then verify the results can be @@ -61,11 +66,11 @@ TEST_F(DeconstructTupleTest, DeconstructTuple) { // Try copying the elements back and comparing it auto handles = result_status.ConsumeValueOrDie(); - std::vector copy(4); - ASSERT_IS_OK(client_->TransferInProcess(*handles[0], ©[0])); - EXPECT_MATCH(copy, testing::VectorMatcher({1.0, 2.0, 3.0, 4.0})); - ASSERT_IS_OK(client_->TransferInProcess(*handles[1], ©[0])); - EXPECT_MATCH(copy, testing::VectorMatcher({2.0, 4.0, 6.0, 8.0})); + std::unique_ptr literal; + TF_ASSIGN_OR_ASSERT_OK(literal, client_->Transfer(*handles[0])); + LiteralTestUtil::ExpectR1Equal({1.0, 2.0, 3.0, 4.0}, *literal); + TF_ASSIGN_OR_ASSERT_OK(literal, client_->Transfer(*handles[1])); + LiteralTestUtil::ExpectR1Equal({2.0, 4.0, 6.0, 8.0}, *literal); } TEST_F(DeconstructTupleTest, DeconstructTupleTwice) { @@ -82,19 +87,20 @@ TEST_F(DeconstructTupleTest, DeconstructTupleTwice) { auto handles1 = result_status1.ConsumeValueOrDie(); auto handles2 = result_status2.ConsumeValueOrDie(); - std::vector copy(4); - ASSERT_IS_OK(client_->TransferInProcess(*handles1[0], ©[0])); - EXPECT_MATCH(copy, testing::VectorMatcher({1.0, 2.0, 3.0, 4.0})); - ASSERT_IS_OK(client_->TransferInProcess(*handles1[1], ©[0])); - EXPECT_MATCH(copy, testing::VectorMatcher({2.0, 4.0, 6.0, 8.0})); + std::unique_ptr literal; + TF_ASSIGN_OR_ASSERT_OK(literal, client_->Transfer(*handles1[0])); + LiteralTestUtil::ExpectR1Equal({1.0, 2.0, 3.0, 4.0}, *literal); + TF_ASSIGN_OR_ASSERT_OK(literal, client_->Transfer(*handles1[1])); + LiteralTestUtil::ExpectR1Equal({2.0, 4.0, 6.0, 8.0}, *literal); + handles1[0].reset(); handles1[1].reset(); - ASSERT_IS_OK(client_->TransferInProcess(*handles2[0], ©[0])); - EXPECT_MATCH(copy, testing::VectorMatcher({1.0, 2.0, 3.0, 4.0})); - ASSERT_IS_OK(client_->TransferInProcess(*handles2[1], ©[0])); - EXPECT_MATCH(copy, testing::VectorMatcher({2.0, 4.0, 6.0, 8.0})); + TF_ASSIGN_OR_ASSERT_OK(literal, client_->Transfer(*handles2[0])); + LiteralTestUtil::ExpectR1Equal({1.0, 2.0, 3.0, 4.0}, *literal); + TF_ASSIGN_OR_ASSERT_OK(literal, client_->Transfer(*handles2[1])); + LiteralTestUtil::ExpectR1Equal({2.0, 4.0, 6.0, 8.0}, *literal); } XLA_TEST_F(DeconstructTupleTest, DeconstructTupleRepeatedElement) { @@ -112,15 +118,15 @@ XLA_TEST_F(DeconstructTupleTest, DeconstructTupleRepeatedElement) { // the same as handle[3] and handle[1] should be the same as handle[2]. 
auto handles = result_status.ConsumeValueOrDie(); - std::vector copy(4); - ASSERT_IS_OK(client_->TransferInProcess(*handles[0], &copy[0])); - EXPECT_MATCH(copy, testing::VectorMatcher({1.0, 2.0, 3.0, 4.0})); - ASSERT_IS_OK(client_->TransferInProcess(*handles[1], &copy[0])); - EXPECT_MATCH(copy, testing::VectorMatcher({2.0, 4.0, 6.0, 8.0})); - ASSERT_IS_OK(client_->TransferInProcess(*handles[2], &copy[0])); - EXPECT_MATCH(copy, testing::VectorMatcher({2.0, 4.0, 6.0, 8.0})); - ASSERT_IS_OK(client_->TransferInProcess(*handles[3], &copy[0])); - EXPECT_MATCH(copy, testing::VectorMatcher({1.0, 2.0, 3.0, 4.0})); + std::unique_ptr literal; + TF_ASSIGN_OR_ASSERT_OK(literal, client_->Transfer(*handles[0])); + LiteralTestUtil::ExpectR1Equal({1.0, 2.0, 3.0, 4.0}, *literal); + TF_ASSIGN_OR_ASSERT_OK(literal, client_->Transfer(*handles[1])); + LiteralTestUtil::ExpectR1Equal({2.0, 4.0, 6.0, 8.0}, *literal); + TF_ASSIGN_OR_ASSERT_OK(literal, client_->Transfer(*handles[2])); + LiteralTestUtil::ExpectR1Equal({2.0, 4.0, 6.0, 8.0}, *literal); + TF_ASSIGN_OR_ASSERT_OK(literal, client_->Transfer(*handles[3])); + LiteralTestUtil::ExpectR1Equal({1.0, 2.0, 3.0, 4.0}, *literal); } TEST_F(DeconstructTupleTest, DeconstructTupleThenDeallocate) { @@ -138,19 +144,19 @@ TEST_F(DeconstructTupleTest, DeconstructTupleThenDeallocate) { // should not have been deallocated because of reference counting. global_data.reset(); - std::vector copy(4); - ASSERT_IS_OK(client_->TransferInProcess(*handles[0], &copy[0])); - EXPECT_MATCH(copy, testing::VectorMatcher({1.0, 2.0, 3.0, 4.0})); - ASSERT_IS_OK(client_->TransferInProcess(*handles[1], &copy[0])); - EXPECT_MATCH(copy, testing::VectorMatcher({2.0, 4.0, 6.0, 8.0})); - ASSERT_IS_OK(client_->TransferInProcess(*handles[2], &copy[0])); - EXPECT_MATCH(copy, testing::VectorMatcher({1.0, 2.0, 3.0, 4.0})); + std::unique_ptr literal; + TF_ASSIGN_OR_ASSERT_OK(literal, client_->Transfer(*handles[0])); + LiteralTestUtil::ExpectR1Equal({1.0, 2.0, 3.0, 4.0}, *literal); + TF_ASSIGN_OR_ASSERT_OK(literal, client_->Transfer(*handles[1])); + LiteralTestUtil::ExpectR1Equal({2.0, 4.0, 6.0, 8.0}, *literal); + TF_ASSIGN_OR_ASSERT_OK(literal, client_->Transfer(*handles[2])); + LiteralTestUtil::ExpectR1Equal({1.0, 2.0, 3.0, 4.0}, *literal); /// Try deallocating one of the repeated elements, then copy handles[0].reset(); - ASSERT_IS_OK(client_->TransferInProcess(*handles[2], &copy[0])); - EXPECT_MATCH(copy, testing::VectorMatcher({1.0, 2.0, 3.0, 4.0})); + TF_ASSIGN_OR_ASSERT_OK(literal, client_->Transfer(*handles[2])); + LiteralTestUtil::ExpectR1Equal({1.0, 2.0, 3.0, 4.0}, *literal); } TEST_F(DeconstructTupleTest, DeconstructNonTuple) { @@ -160,8 +166,8 @@ TEST_F(DeconstructTupleTest, DeconstructNonTuple) { auto result_status = client_->DeconstructTuple(*global_data); EXPECT_FALSE(result_status.ok()); - EXPECT_MATCH(result_status.status().error_message(), - testing::ContainsRegex("global data handle .* is not a tuple")); + EXPECT_THAT(result_status.status().error_message(), + ContainsRegex("global data handle .* is not a tuple")); } XLA_TEST_F(DeconstructTupleTest, DeconstructTupleFromParam) { @@ -189,9 +195,8 @@ XLA_TEST_F(DeconstructTupleTest, DeconstructNestedTuple) { auto result_status = client_->DeconstructTuple(*global_data); EXPECT_FALSE(result_status.ok()); - EXPECT_MATCH( - result_status.status().error_message(), - testing::ContainsRegex("deconstructing nested tuples not yet supported")); + EXPECT_THAT(result_status.status().error_message(), + HasSubstr("deconstructing nested tuples not yet supported")); } } // namespace @@
@@ -199,6 +204,7 @@ XLA_TEST_F(DeconstructTupleTest, DeconstructNestedTuple) {
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
+  xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
   xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
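The rewrite running through deconstruct_tuple_test.cc above is mechanical: every raw `TransferInProcess` copy into a caller-owned buffer becomes a `Transfer` call that returns a self-describing `Literal`. Side by side, with `handle` standing in for any deconstructed element handle (a sketch assembled from the hunks above, not an additional change in this patch):

```c++
// Before: copy device data into a preallocated host vector, then match values.
std::vector<float> copy(4);
ASSERT_IS_OK(client_->TransferInProcess(*handle, &copy[0]));
EXPECT_MATCH(copy, testing::VectorMatcher<float>({1.0, 2.0, 3.0, 4.0}));

// After: fetch a Literal (shape plus data) and compare via LiteralTestUtil.
std::unique_ptr<Literal> literal;
TF_ASSIGN_OR_ASSERT_OK(literal, client_->Transfer(*handle));
LiteralTestUtil::ExpectR1Equal<float>({1.0, 2.0, 3.0, 4.0}, *literal);
```

The `Literal` form also spares the caller from knowing the element count up front.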
diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc
index 197a8f86cb0..754eec1b1ed 100644
--- a/tensorflow/compiler/xla/tests/dot_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/cpu_runtime_flags.h"
+#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/layout_util_flags.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
@@ -54,6 +55,8 @@ class DotOperationTest : public ClientLibraryTestBase {
   template <typename Element>
   void TestNonsquareMatrixDot(bool lhs_row_major = false,
                               bool rhs_row_major = false);
+  void TestMatrixDot(int M, int K, int N, bool lhs_row_major = false,
+                     bool rhs_row_major = false);
 };
 
 XLA_TEST_F(DotOperationTest, ZeroElementVectorDotF32) {
@@ -65,6 +68,15 @@ XLA_TEST_F(DotOperationTest, ZeroElementVectorDotF32) {
   ComputeAndCompareR0<float>(&builder, 0.0, {}, error_spec_);
 }
 
+XLA_TEST_F(DotOperationTest, TrivialMatrixVectorDotF32) {
+  ComputationBuilder builder(client_, TestName());
+  auto lhs = builder.ConstantR2<float>({{3.0, 4.0}});
+  auto rhs = builder.ConstantR1<float>({3.0, 4.0});
+  auto result = builder.Dot(lhs, rhs);
+
+  ComputeAndCompareR1<float>(&builder, {25.0}, {}, error_spec_);
+}
+
 template <typename Element>
 void DotOperationTest::TestOneElementVectorDot() {
   ComputationBuilder builder(client_, TestName());
@@ -170,6 +182,84 @@ void DotOperationTest::TestSquareMatrixDot(bool lhs_row_major,
       &builder, expected, {lhs_handle.get(), rhs_handle.get()}, error_spec_);
 }
 
+void DotOperationTest::TestMatrixDot(int M, int K, int N, bool lhs_row_major,
+                                     bool rhs_row_major) {
+  std::unique_ptr<Array2D<float>> lhs_data =
+      MakeLinspaceArray2D(0.0, 1.0, M, K);
+  std::unique_ptr<Literal> lhs_lit = LiteralUtil::CreateR2FromArray2DWithLayout(
+      *lhs_data,
+      LayoutUtil::MakeLayout(MinorToMajorForIsRowMajor(lhs_row_major)));
+  auto lhs_handle = client_->TransferToServer(*lhs_lit).ConsumeValueOrDie();
+
+  std::unique_ptr<Array2D<float>> rhs_data =
+      MakeLinspaceArray2D(0.0, 1.0, K, N);
+  std::unique_ptr<Literal> rhs_lit = LiteralUtil::CreateR2FromArray2DWithLayout(
+      *rhs_data,
+      LayoutUtil::MakeLayout(MinorToMajorForIsRowMajor(rhs_row_major)));
+  auto rhs_handle = client_->TransferToServer(*rhs_lit).ConsumeValueOrDie();
+
+  ComputationBuilder builder(client_, TestName());
+  auto prim_type = primitive_util::NativeToPrimitiveType<float>();
+  auto result = builder.Dot(
+      builder.Parameter(0, ShapeUtil::MakeShape(prim_type, {M, K}), "lhs"),
+      builder.Parameter(1, ShapeUtil::MakeShape(prim_type, {K, N}), "rhs"));
+
+  std::unique_ptr<Array2D<float>> expected =
+      ReferenceUtil::MatmulArray2D(*lhs_data, *rhs_data);
+
+  ComputeAndCompareR2<float>(&builder, *expected,
+                             {lhs_handle.get(), rhs_handle.get()},
+                             ErrorSpec(0.3, 3e-3));
+}
+
+XLA_TEST_F(DotOperationTest, MatrixDotF32_12_117_7_MinorToMajorTF) {
+  TestMatrixDot(12, 117, 7, true, false);
+}
+
+XLA_TEST_F(DotOperationTest, MatrixDotF32_12_117_7_MinorToMajorFT) {
+  TestMatrixDot(12, 117, 7, false, true);
+}
+
+XLA_TEST_F(DotOperationTest, MatrixDotF32_12_117_7_MinorToMajorTT) {
+  TestMatrixDot(12, 117, 7, true, true);
+}
+
+XLA_TEST_F(DotOperationTest, MatrixDotF32_12_117_7_MinorToMajorFF) {
+  TestMatrixDot(12, 117, 7, false, false);
+}
+
+XLA_TEST_F(DotOperationTest, MatrixDotF32_270_270_520_MinorToMajorTT) {
+  TestMatrixDot(270, 270, 520, true, true);
+}
+
+XLA_TEST_F(DotOperationTest, MatrixDotF32_270_270_520_MinorToMajorTF) {
+  TestMatrixDot(270, 270, 520, true, false);
+}
+
+XLA_TEST_F(DotOperationTest, MatrixDotF32_270_270_520_MinorToMajorFT) {
+  TestMatrixDot(270, 270, 520, false, true);
+}
+
+XLA_TEST_F(DotOperationTest, MatrixDotF32_270_270_520_MinorToMajorFF) {
+  TestMatrixDot(270, 270, 520, false, false);
+}
+
+XLA_TEST_F(DotOperationTest, MatrixDotF32_260_3_520_MinorToMajorTT) {
+  TestMatrixDot(269, 3, 520, true, true);
+}
+
+XLA_TEST_F(DotOperationTest, MatrixDotF32_260_3_520_MinorToMajorTF) {
+  TestMatrixDot(260, 3, 520, true, false);
+}
+
+XLA_TEST_F(DotOperationTest, MatrixDotF32_260_3_520_MinorToMajorFT) {
+  TestMatrixDot(260, 3, 520, false, true);
+}
+
+XLA_TEST_F(DotOperationTest, MatrixDotF32_260_3_520_MinorToMajorFF) {
+  TestMatrixDot(260, 3, 520, false, false);
+}
+
 XLA_TEST_F(DotOperationTest, SquareMatrixDotF32MinorToMajorFF) {
   constexpr bool kLhsRowMajor = false;
   constexpr bool kRhsRowMajor = false;
@@ -277,9 +367,9 @@ XLA_TEST_F(DotOperationTest, BatchMatMul) {
   std::vector<ComputationDataHandle> out_slices;
   for (int i = 0; i < 4; ++i) {
     // Slice off individual matrices and reshape to 2D tensors.
-    auto x_slice = builder.Slice(x_flat, {i, 0, 0}, {i + 1, 2, 2});
+    auto x_slice = builder.Slice(x_flat, {i, 0, 0}, {i + 1, 2, 2}, {1, 1, 1});
     x_slice = builder.Reshape(x_slice, {0, 1, 2}, {2, 2});
-    auto y_slice = builder.Slice(y_flat, {i, 0, 0}, {i + 1, 2, 2});
+    auto y_slice = builder.Slice(y_flat, {i, 0, 0}, {i + 1, 2, 2}, {1, 1, 1});
     y_slice = builder.Reshape(y_slice, {0, 1, 2}, {2, 2});
 
     auto out = builder.Dot(x_slice, y_slice);
@@ -371,6 +461,7 @@ TEST_F(DotOperationTest, TransposeFolding) {
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendLayoutUtilFlags(&flag_list);
+  xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
   xla::legacy_flags::AppendCpuRuntimeFlags(&flag_list);
   xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
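About the `MinorToMajorTF`/`FT`/`TT`/`FF` suffixes on the new tests: the two booleans passed to `TestMatrixDot` choose row-major (`true`) or column-major (`false`) layouts for the LHS and RHS operands. `MinorToMajorForIsRowMajor` is defined elsewhere in the test infrastructure and is not shown in this diff, so the body below is an assumption sketched for orientation only:

```c++
// For a 2D array, row-major means dimension 1 varies fastest (is most minor),
// giving minor_to_major = {1, 0}; column-major is the reverse, {0, 1}.
std::vector<int64> MinorToMajorForIsRowMajor(bool row_major) {
  return row_major ? std::vector<int64>{1, 0} : std::vector<int64>{0, 1};
}
```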
#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/reference_util.h" #include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/local_service.h" #include "tensorflow/compiler/xla/service/platform_util.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" #include "tensorflow/compiler/xla/service/transfer_manager.h" +#include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" @@ -57,6 +59,8 @@ class DynamicSliceTest : public ClientLibraryTestBase { // Slice at dimension boundaries, but with sizes that cause indices to wrap. RunR1({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}, {6}, {4}, {6.0, 7.0, 0.0, 1.0}); + // Zero element slice. + RunR1({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}, {2}, {0}, {}); } template @@ -74,6 +78,12 @@ class DynamicSliceTest : public ClientLibraryTestBase { RunR2({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}, {1, 1}, {3, 3}, {{5.0f, 6.0f, 4.0f}, {8.0f, 9.0f, 7.0f}, {2.0f, 3.0f, 1.0f}}); + // Zero element slice: 2x0. + RunR2({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}, + {0, 0}, {2, 0}, {{}, {}}); + // Zero element slice: 0x2. + RunR2({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}, + {0, 0}, {0, 2}, Array2D(0, 2)); } template @@ -108,7 +118,7 @@ class DynamicSliceTest : public ClientLibraryTestBase { template void RunR1(const std::vector& input_values, const std::vector slice_starts, - const std::vector slice_sizes, + const std::vector& slice_sizes, const std::vector& expected_values) { ComputationBuilder builder(client_, TestName()); // Initialize and transfer dynamic slice start indices parameter. @@ -126,7 +136,7 @@ class DynamicSliceTest : public ClientLibraryTestBase { template void RunR2(const Array2D& input_values, const std::vector slice_starts, - const std::vector slice_sizes, + const std::vector& slice_sizes, const Array2D& expected_values) { ComputationBuilder builder(client_, TestName()); // Initialize and transfer dynamic slice start indices parameter. @@ -144,7 +154,7 @@ class DynamicSliceTest : public ClientLibraryTestBase { template void RunR3(const Array3D& input_values, const std::vector slice_starts, - const std::vector slice_sizes, + const std::vector& slice_sizes, const Array3D& expected_values) { ComputationBuilder builder(client_, TestName()); // Initialize and transfer dynamic slice start indices parameter. @@ -199,6 +209,10 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase { RunR1({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}, {8.0, 9.0, 10.0}, {6}, {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 8.0, 9.0}); + // Zero-sized update. + RunR1({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}, + {}, {2}, + {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}); // clang-format on } @@ -225,6 +239,11 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase { {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}, {{10.0f, 11.0f}}, {2, 2}, {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 10.0f}}); + // Zero-sized update. 
+ RunR2( + {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}, + {{}}, {2, 1}, + {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}); // clang-format on } @@ -474,19 +493,23 @@ void BM_DynamicSlice(int num_iters) { executors[device_ordinal], *start_indices_literal, buffer->mutable_buffer({}))); + std::unique_ptr executable = + client->Compile(computation, {&buffer->shape()}, ExecutableBuildOptions()) + .ConsumeValueOrDie(); + // Run some warm-up executions. - LocalExecuteOptions options; + ExecutableRunOptions options; options.set_allocator(&allocator); const int kWarmups = 2; for (int i = 0; i < kWarmups; ++i) { - auto result = client->ExecuteLocally(computation, {buffer.get()}, options); + auto result = executable->Run({buffer.get()}, options); ASSERT_TRUE(result.ok()); } // Run benchmark. tensorflow::testing::StartTiming(); for (int i = 0; i < num_iters; ++i) { - auto result = client->ExecuteLocally(computation, {buffer.get()}, options); + auto result = executable->Run({buffer.get()}, options); ASSERT_TRUE(result.ok()); } } @@ -497,6 +520,7 @@ BENCHMARK(BM_DynamicSlice); int main(int argc, char** argv) { std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/xla/tests/floor_ceil_test.cc b/tensorflow/compiler/xla/tests/floor_ceil_test.cc index 8e300630858..80267e5459d 100644 --- a/tensorflow/compiler/xla/tests/floor_ceil_test.cc +++ b/tensorflow/compiler/xla/tests/floor_ceil_test.cc @@ -19,6 +19,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" @@ -112,6 +113,7 @@ TEST_F(FloorCeilTest, R0Ceil) { int main(int argc, char** argv) { std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/xla/tests/fmax_test.cc b/tensorflow/compiler/xla/tests/fmax_test.cc index 2835038c90c..ee4e92505d9 100644 --- a/tensorflow/compiler/xla/tests/fmax_test.cc +++ b/tensorflow/compiler/xla/tests/fmax_test.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/core/platform/test.h" @@ -45,6 +46,7 @@ TEST_F(FmaxSimpleTest, FmaxTenValues) { int main(int argc, char** argv) { std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/xla/tests/fusion_test.cc b/tensorflow/compiler/xla/tests/fusion_test.cc index 7bddbfa894c..fa36381267e 100644 --- a/tensorflow/compiler/xla/tests/fusion_test.cc +++ b/tensorflow/compiler/xla/tests/fusion_test.cc @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/primitive_util.h" #include "tensorflow/compiler/xla/ptr_util.h" @@ -36,7 +37,6 @@ limitations under the License. #include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/protobuf.h" -#include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" using tensorflow::gtl::ArraySlice; @@ -74,7 +74,7 @@ class FusionTest : public HloTestBase { } auto builder = HloComputation::Builder(TestName()); - auto hlo_module = MakeUnique(TestName()); + auto hlo_module = CreateNewModule(); auto prim_type = primitive_util::NativeToPrimitiveType(); @@ -176,7 +176,7 @@ XLA_TEST_F(FusionTest, Test) { // (-{{1.0, 1.0, 1.0}, {0.0, 0.0, 0.0}}), // {{0.5, 0.5, 0.5}, {0.5, 0.5, 0.5}})) = {{0.5}, {2.72}} auto builder = HloComputation::Builder(TestName()); - auto hlo_module = MakeUnique(TestName()); + auto hlo_module = CreateNewModule(); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( LiteralUtil::CreateR2({{1.0}, {2.0}, {3.0}}))); auto const1 = builder.AddInstruction(HloInstruction::CreateConstant( @@ -204,7 +204,7 @@ XLA_TEST_F(FusionTest, Test) { HloInstruction::CreateTernary(ShapeUtil::MakeShape(F32, {2, 3}), HloOpcode::kSelect, const10, add8, const9)); auto slice12 = builder.AddInstruction(HloInstruction::CreateSlice( - ShapeUtil::MakeShape(F32, {2, 1}), select11, {0, 1}, {2, 2})); + ShapeUtil::MakeShape(F32, {2, 1}), select11, {0, 1}, {2, 2}, {1, 1})); // CreateFusionInstruction needs the `instructions_to_fuse` argument in // reverse topological order, so the first element in `instructions_to_fuse` // must be the root. @@ -224,7 +224,7 @@ XLA_TEST_F(FusionTest, Parameter) { // Build a computation and fuse part of it so the fusion instruction has an // operand parameter. 
auto builder = HloComputation::Builder(TestName()); - auto hlo_module = MakeUnique(TestName()); + auto hlo_module = CreateNewModule(); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( LiteralUtil::CreateR2({{1.0, 2.0, 3.0}}))); auto copy1 = builder.AddInstruction(HloInstruction::CreateUnary( @@ -247,7 +247,7 @@ XLA_TEST_F(FusionTest, Parameter) { XLA_TEST_F(FusionTest, BroadcastIntoBinaryOp) { auto builder = HloComputation::Builder(TestName()); - auto hlo_module = MakeUnique(TestName()); + auto hlo_module = CreateNewModule(); auto const_vector = builder.AddInstruction(HloInstruction::CreateConstant( LiteralUtil::CreateR1({1.0, 2.0, 3.0}))); auto const_array = builder.AddInstruction(HloInstruction::CreateConstant( @@ -271,7 +271,7 @@ XLA_TEST_F(FusionTest, BroadcastIntoBinaryOp) { XLA_TEST_F(FusionTest, ReshapeToScalar) { auto builder = HloComputation::Builder(TestName()); - auto hlo_module = MakeUnique(TestName()); + auto hlo_module = CreateNewModule(); auto single_element_array = builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR2({{5}}))); auto reshape = builder.AddInstruction(HloInstruction::CreateReshape( @@ -285,7 +285,7 @@ XLA_TEST_F(FusionTest, ReshapeToScalar) { XLA_TEST_F(FusionTest, Reshape_3by2_1by2by3) { auto builder = HloComputation::Builder(TestName()); - auto hlo_module = MakeUnique(TestName()); + auto hlo_module = CreateNewModule(); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( LiteralUtil::CreateR2({{1, 2}, {3, 4}, {5, 6}}))); auto reshape1 = builder.AddInstruction(HloInstruction::CreateReshape( @@ -300,7 +300,7 @@ XLA_TEST_F(FusionTest, Reshape_3by2_1by2by3) { XLA_TEST_F(FusionTest, Reshape_1by2by3_3by2) { auto builder = HloComputation::Builder(TestName()); - auto hlo_module = MakeUnique(TestName()); + auto hlo_module = CreateNewModule(); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( LiteralUtil::CreateR3({{{1, 2, 3}, {4, 5, 6}}}))); auto reshape1 = builder.AddInstruction( @@ -315,7 +315,7 @@ XLA_TEST_F(FusionTest, Reshape_1by2by3_3by2) { XLA_TEST_F(FusionTest, Reshape_1by1by1_) { auto builder = HloComputation::Builder(TestName()); - auto hlo_module = MakeUnique(TestName()); + auto hlo_module = CreateNewModule(); auto const0 = builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR3({{{7}}}))); auto reshape1 = builder.AddInstruction( @@ -329,7 +329,7 @@ XLA_TEST_F(FusionTest, Reshape_1by1by1_) { XLA_TEST_F(FusionTest, Reshape__1by1by1) { auto builder = HloComputation::Builder(TestName()); - auto hlo_module = MakeUnique(TestName()); + auto hlo_module = CreateNewModule(); auto const0 = builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR0(7))); auto reshape1 = builder.AddInstruction(HloInstruction::CreateReshape( @@ -343,7 +343,7 @@ XLA_TEST_F(FusionTest, Reshape__1by1by1) { XLA_TEST_F(FusionTest, Reshape__) { auto builder = HloComputation::Builder(TestName()); - auto hlo_module = MakeUnique(TestName()); + auto hlo_module = CreateNewModule(); auto const0 = builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR0(7))); auto reshape1 = builder.AddInstruction( @@ -357,7 +357,7 @@ XLA_TEST_F(FusionTest, Reshape__) { XLA_TEST_F(FusionTest, Reshape_3by3_3by3) { auto builder = HloComputation::Builder(TestName()); - auto hlo_module = MakeUnique(TestName()); + auto hlo_module = CreateNewModule(); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}))); auto 
   auto reshape1 = builder.AddInstruction(
@@ -372,7 +372,7 @@ XLA_TEST_F(FusionTest, Reshape_3by3_3by3) {
 
 XLA_TEST_F(FusionTest, Transpose_2by3) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = MakeUnique<HloModule>(TestName());
+  auto hlo_module = CreateNewModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}})));
   auto reshape1 = builder.AddInstruction(HloInstruction::CreateTranspose(
@@ -387,7 +387,7 @@ XLA_TEST_F(FusionTest, Transpose_2by3) {
 
 XLA_TEST_F(FusionTest, Transpose_3by3) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = MakeUnique<HloModule>(TestName());
+  auto hlo_module = CreateNewModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}})));
   auto reshape1 = builder.AddInstruction(HloInstruction::CreateTranspose(
@@ -402,7 +402,7 @@ XLA_TEST_F(FusionTest, Transpose_3by3) {
 
 XLA_TEST_F(FusionTest, Reverse) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = MakeUnique<HloModule>(TestName());
+  auto hlo_module = CreateNewModule();
   auto const0 = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({1, 2, 3})));
   auto reverse1 = builder.AddInstruction(HloInstruction::CreateReverse(
@@ -427,7 +427,7 @@ std::unique_ptr<HloComputation> MakeReduceTestComputation() {
 }
 
 XLA_TEST_F(FusionTest, DISABLED_ON_CPU(Reduce)) {
-  auto hlo_module = MakeUnique<HloModule>(TestName());
+  auto hlo_module = CreateNewModule();
 
   auto builder = HloComputation::Builder(TestName());
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
@@ -446,7 +446,7 @@ XLA_TEST_F(FusionTest, DISABLED_ON_CPU(Reduce)) {
 }
 
 XLA_TEST_F(FusionTest, DISABLED_ON_CPU(ReduceImplicitBroadcast)) {
-  auto hlo_module = MakeUnique<HloModule>(TestName());
+  auto hlo_module = CreateNewModule();
 
   auto builder = HloComputation::Builder(TestName());
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
@@ -468,7 +468,7 @@ XLA_TEST_F(FusionTest, DISABLED_ON_CPU(ReduceImplicitBroadcast)) {
 
 XLA_TEST_F(FusionTest, DISABLED_ON_CPU(ReduceWindow)) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = MakeUnique<HloModule>(TestName());
+  auto hlo_module = CreateNewModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR2<int32>({{2, 3, 5}, {7, 11, 13}, {17, 19, 23}})));
   auto const1 = builder.AddInstruction(
@@ -574,6 +574,7 @@ XLA_TEST_F(FusionTest, Clamp2D) {
 
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
+  xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
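Every `MakeUnique<HloModule>(TestName())` call site replaced in fusion_test.cc above now goes through `CreateNewModule()`, which is added to `HloTestBase` later in this patch; the point is that each test module picks up the debug options parsed from command-line flags instead of a blank config. The resulting test skeleton, as a sketch (builder contents elided):

```c++
auto builder = HloComputation::Builder(TestName());
// ... add HLO instructions to the builder ...
auto hlo_module = CreateNewModule();  // named after the test, flags applied
hlo_module->AddEntryComputation(builder.Build());
```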
diff --git a/tensorflow/compiler/xla/tests/hlo_metadata_test.cc b/tensorflow/compiler/xla/tests/hlo_metadata_test.cc
new file mode 100644
index 00000000000..f54fa2256e2
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/hlo_metadata_test.cc
@@ -0,0 +1,89 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/service/computation_tracker.h"
+#include "tensorflow/compiler/xla/service/local_service.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/local_client_test_base.h"
+
+namespace xla {
+namespace {
+
+class HloMetadataTest : public LocalClientTestBase {
+ protected:
+  HloMetadataTest() {
+    metadata_.set_op_type("add");
+    metadata_.set_op_name("my_sum_op");
+  }
+
+  void BuildAddComputation(ComputationBuilder* builder) {
+    auto x = builder->Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
+    auto y = builder->Parameter(1, ShapeUtil::MakeShape(F32, {}), "y");
+    builder->Add(x, y);
+  }
+
+  OpMetadata metadata_;
+};
+
+TEST_F(HloMetadataTest, MetadataPropagation) {
+  ComputationBuilder builder(local_client_, "add");
+  builder.SetOpMetadata(metadata_);
+  BuildAddComputation(&builder);
+  builder.ClearOpMetadata();
+
+  Shape argument_layout = ShapeUtil::MakeShape(F32, {});
+  TF_ASSIGN_OR_ASSERT_OK(
+      std::unique_ptr<LocalExecutable> executable,
+      local_client_->Compile(builder.Build().ValueOrDie(),
+                             {&argument_layout, &argument_layout},
+                             ExecutableBuildOptions()));
+
+  auto instruction = executable->executable()
+                         ->module()
+                         .entry_computation()
+                         ->root_instruction();
+  EXPECT_EQ("add", instruction->metadata().op_type());
+  EXPECT_EQ("my_sum_op", instruction->metadata().op_name());
+}
+
+TEST_F(HloMetadataTest, MetadataClearing) {
+  ComputationBuilder builder(local_client_, "add");
+  builder.SetOpMetadata(metadata_);
+  // Some other pretend computation here.
+  builder.ClearOpMetadata();
+  BuildAddComputation(&builder);
+
+  Shape argument_layout = ShapeUtil::MakeShape(F32, {});
+  auto executable_status = local_client_->Compile(
+      builder.Build().ValueOrDie(), {&argument_layout, &argument_layout},
+      ExecutableBuildOptions());
+  ASSERT_IS_OK(executable_status);
+
+  std::unique_ptr<LocalExecutable> executable =
+      executable_status.ConsumeValueOrDie();
+
+  auto instruction = executable->executable()
+                         ->module()
+                         .entry_computation()
+                         ->root_instruction();
+  // We expect these to be empty (no metadata set).
+  EXPECT_EQ("", instruction->metadata().op_type());
+  EXPECT_EQ("", instruction->metadata().op_name());
+}
+
+}  // namespace
+}  // namespace xla
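The new hlo_metadata_test.cc pins down the scoping rule for op metadata on `ComputationBuilder`: metadata attaches to every op built while it is set and stops applying once cleared. In outline (a sketch reusing the test's names; `Mul` is just an illustrative second op):

```c++
OpMetadata metadata;
metadata.set_op_type("add");
metadata.set_op_name("my_sum_op");

builder.SetOpMetadata(metadata);
auto sum = builder.Add(x, y);      // carries op_type "add", op_name "my_sum_op"
builder.ClearOpMetadata();
auto product = builder.Mul(x, y);  // built after the clear: empty metadata
```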
#include "tensorflow/compiler/xla/service/hlo_execution_profile.h" #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" -#include "tensorflow/compiler/xla/service/hlo_module_config.h" #include "tensorflow/compiler/xla/service/transfer_manager.h" #include "tensorflow/compiler/xla/shape_layout.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -40,6 +40,7 @@ limitations under the License. #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/common_runtime/eigen_thread_pool.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" namespace se = ::perftools::gputools; @@ -55,6 +56,8 @@ struct HloTestBase::EigenThreadPoolWrapper { HloTestBase::HloTestBase() : backend_(Backend::CreateDefaultBackend().ConsumeValueOrDie()) { + // TODO(b/62411181): get rid of this flag entirely when the usual debug flags + // are piped to all HLO tests. test_hlo_dumper_ = [](const HloModule& module, const string& label) { legacy_flags::HloTestBaseFlags* flags = legacy_flags::GetHloTestBaseFlags(); if (flags->xla_hlo_test_generate_hlo_graph) { @@ -74,30 +77,21 @@ HloTestBase::~HloTestBase() { } } +std::unique_ptr HloTestBase::CreateNewModule() { + HloModuleConfig config; + config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags()); + return MakeUnique(TestName(), VersionedComputationHandle(), + config); +} + StatusOr HloTestBase::Execute( std::unique_ptr module, tensorflow::gtl::ArraySlice arguments, Shape* result_shape) { - auto module_config = MakeUnique( - module->entry_computation()->ComputeProgramShape()); - return Execute(std::move(module), std::move(module_config), arguments, - result_shape); -} - -StatusOr HloTestBase::Execute( - std::unique_ptr hlo_module, - std::unique_ptr module_config, - tensorflow::gtl::ArraySlice arguments, - Shape* result_shape) { - VLOG(3) << "module_config layout " - << LayoutUtil::HumanString(module_config->entry_computation_layout() - .result_layout() - .layout()); TF_ASSIGN_OR_RETURN( std::unique_ptr executable, - backend_->compiler()->Compile(std::move(hlo_module), - std::move(module_config), test_hlo_dumper_, + backend_->compiler()->Compile(std::move(module), test_hlo_dumper_, backend_->default_stream_executor())); se::Stream stream(backend_->default_stream_executor()); @@ -111,9 +105,13 @@ StatusOr HloTestBase::Execute( backend_->eigen_intra_op_thread_pool_device()); HloExecutionProfile hlo_execution_profile; - TF_ASSIGN_OR_RETURN(se::DeviceMemoryBase result, - executable->ExecuteOnStream(&run_options, arguments, - &hlo_execution_profile)); + ServiceExecutableRunOptions service_run_options( + run_options, backend_->StreamBorrower(), + backend_->inter_op_thread_pool()); + TF_ASSIGN_OR_RETURN( + se::DeviceMemoryBase result, + executable->ExecuteOnStream(&service_run_options, arguments, + &hlo_execution_profile)); TF_RET_CHECK(stream.BlockHostUntilDone()); allocations_.push_back(result); @@ -133,6 +131,7 @@ StatusOr HloTestBase::Execute( std::set added_opaques; for (auto element_buffer : element_buffers) { if (added_opaques.count(element_buffer.opaque()) == 0) { + CHECK(element_buffer.opaque() != nullptr); added_opaques.insert(element_buffer.opaque()); allocations_.push_back(element_buffer); } @@ -175,20 +174,26 @@ std::unique_ptr HloTestBase::ExecuteAndTransfer( return TransferFromDevice(result_shape, device_base); } -std::unique_ptr HloTestBase::ExecuteAndTransfer( - std::unique_ptr module, - 
std::unique_ptr module_config, - tensorflow::gtl::ArraySlice arguments) { - Shape result_shape; - se::DeviceMemoryBase device_base = - Execute(std::move(module), std::move(module_config), arguments, - &result_shape) - .ValueOrDie(); - return TransferFromDevice(result_shape, device_base); -} - -string HloTestBase::TestName() const { +/* static */ +string HloTestBase::TestName() { return ::testing::UnitTest::GetInstance()->current_test_info()->name(); } +int ParseDebugOptionsFlagsAndRunTests(int argc, char** argv) { + std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); + xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); + const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); + if (!parse_result) { + LOG(ERROR) << "\n" << usage; + return 2; + } + ::testing::InitGoogleTest(&argc, argv); + if (argc > 1) { + LOG(ERROR) << "Unknown argument " << argv[1] << "\n" << usage; + return 2; + } + return RUN_ALL_TESTS(); +} + } // namespace xla diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h index 91fc9b87cd5..98bc35ae528 100644 --- a/tensorflow/compiler/xla/tests/hlo_test_base.h +++ b/tensorflow/compiler/xla/tests/hlo_test_base.h @@ -24,7 +24,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/compiler.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_module.h" -#include "tensorflow/compiler/xla/service/hlo_module_config.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/types.h" @@ -45,6 +44,12 @@ class HloTestBase : public ::testing::Test { ~HloTestBase() override; + // Creates a new HLO module for a test. The module created will have + // TestName() for its name; it will also automatically populate its debug + // options from command-line flags. It's recommended to use this method to + // create all HloModules for tests. + std::unique_ptr CreateNewModule(); + // Executes the given module and returns a global data handle. StatusOr Execute( std::unique_ptr module, @@ -52,20 +57,11 @@ class HloTestBase : public ::testing::Test { arguments, Shape* result_shape); - // Variation of Execute which takes a custom module_config instead of creating - // a default one. - StatusOr Execute( - std::unique_ptr module, - std::unique_ptr module_config, - tensorflow::gtl::ArraySlice - arguments, - Shape* result_shape); - // Transfers the given literal to the device and returns the data handle. perftools::gputools::DeviceMemoryBase TransferToDevice( const Literal& literal); - // Transfers the array refered to by the given handle from the device and + // Transfers the array referred to by the given handle from the device and // returns as a Literal. std::unique_ptr TransferFromDevice( const Shape& shape, perftools::gputools::DeviceMemoryBase device_base); @@ -76,15 +72,35 @@ class HloTestBase : public ::testing::Test { tensorflow::gtl::ArraySlice arguments); - // Variation of ExecuteAndTransfer which takes a custom module_config instead - // of creating a default one. - std::unique_ptr ExecuteAndTransfer( - std::unique_ptr module, - std::unique_ptr module_config, - tensorflow::gtl::ArraySlice - arguments); + // Convenience method to force the layout of a given parameter in a module. + // The layout of parameter number 'param_no' in the 'module' is set to + // 'layout'. 
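`ParseDebugOptionsFlagsAndRunTests` exists so individual test binaries can stop hand-rolling the flag-parsing boilerplate repeated in the `main()` hunks earlier in this patch. Per its comment, a test `main()` reduces to:

```c++
int main(int argc, char** argv) {
  return xla::ParseDebugOptionsFlagsAndRunTests(argc, argv);
}
```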
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h
index 91fc9b87cd5..98bc35ae528 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.h
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.h
@@ -24,7 +24,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
-#include "tensorflow/compiler/xla/service/hlo_module_config.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -45,6 +44,12 @@ class HloTestBase : public ::testing::Test {
 
   ~HloTestBase() override;
 
+  // Creates a new HLO module for a test. The module created will have
+  // TestName() for its name; it will also automatically populate its debug
+  // options from command-line flags. It's recommended to use this method to
+  // create all HloModules for tests.
+  std::unique_ptr<HloModule> CreateNewModule();
+
   // Executes the given module and returns a global data handle.
   StatusOr<perftools::gputools::DeviceMemoryBase> Execute(
       std::unique_ptr<HloModule> module,
@@ -52,20 +57,11 @@ class HloTestBase : public ::testing::Test {
           arguments,
       Shape* result_shape);
 
-  // Variation of Execute which takes a custom module_config instead of creating
-  // a default one.
-  StatusOr<perftools::gputools::DeviceMemoryBase> Execute(
-      std::unique_ptr<HloModule> module,
-      std::unique_ptr<HloModuleConfig> module_config,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments,
-      Shape* result_shape);
-
   // Transfers the given literal to the device and returns the data handle.
   perftools::gputools::DeviceMemoryBase TransferToDevice(
       const Literal& literal);
 
-  // Transfers the array refered to by the given handle from the device and
+  // Transfers the array referred to by the given handle from the device and
   // returns as a Literal.
   std::unique_ptr<Literal> TransferFromDevice(
       const Shape& shape, perftools::gputools::DeviceMemoryBase device_base);
@@ -76,15 +72,35 @@ class HloTestBase : public ::testing::Test {
       tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
          arguments);
 
-  // Variation of ExecuteAndTransfer which takes a custom module_config instead
-  // of creating a default one.
-  std::unique_ptr<Literal> ExecuteAndTransfer(
-      std::unique_ptr<HloModule> module,
-      std::unique_ptr<HloModuleConfig> module_config,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments);
+  // Convenience method to force the layout of a given parameter in a module.
+  // The layout of parameter number 'param_no' in the 'module' is set to
+  // 'layout'.
+  void ForceParameterLayout(HloModule* module, int64 param_no,
+                            const Layout& layout) {
+    ASSERT_LT(param_no,
+              module->mutable_entry_computation_layout()->parameter_count());
+    module->mutable_entry_computation_layout()
+        ->mutable_parameter_layout(param_no)
+        ->ResetLayout(layout);
+  }
 
-  string TestName() const;
+  // Convenience method to force the layout of the computation result in a
+  // module. The result layout of 'module' is set to 'layout'.
+  void ForceResultLayout(HloModule* module, const Layout& layout) {
+    module->mutable_entry_computation_layout()
+        ->mutable_result_layout()
+        ->ResetLayout(layout);
+  }
+
+  // Convenience method to clear the layout of the computation result in
+  // 'module'.
+  void ForceClearResultLayout(HloModule* module) {
+    module->mutable_entry_computation_layout()
+        ->mutable_result_layout()
+        ->Clear();
+  }
+
+  static string TestName();
 
   std::unique_ptr<Backend> backend_;
 
@@ -99,6 +115,11 @@ class HloTestBase : public ::testing::Test {
   std::unique_ptr<EigenThreadPoolWrapper> thread_pool_wrapper_;
 };
 
+// Convenience function that parses XLA debug options flags from argc/argv,
+// calls InitGoogleTest and then calls and returns RUN_ALL_TESTS. Intended to be
+// invoked from a test main() function.
+int ParseDebugOptionsFlagsAndRunTests(int argc, char** argv);
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_TESTS_HLO_TEST_BASE_H_
diff --git a/tensorflow/compiler/xla/tests/inprocess_service_test.cc b/tensorflow/compiler/xla/tests/inprocess_service_test.cc
deleted file mode 100644
index ea0be07872f..00000000000
--- a/tensorflow/compiler/xla/tests/inprocess_service_test.cc
+++ /dev/null
@@ -1,205 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "tensorflow/compiler/xla/array2d.h"
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
-#include "tensorflow/compiler/xla/client/global_data.h"
-#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/layout_util.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
-#include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/compiler/xla/test_helpers.h"
-#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
-#include "tensorflow/compiler/xla/tests/literal_test_util.h"
-#include "tensorflow/compiler/xla/tests/test_macros.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/gtl/array_slice.h"
-#include "tensorflow/core/platform/test.h"
-#include "tensorflow/core/platform/types.h"
-
-namespace xla {
-namespace {
-
-// Tests which exercise the "InProcess" methods of xla::Client. The
-// "InProcess" methods require that the client and server share the same
-// process.
-class InProcessServiceTest : public ClientLibraryTestBase {
- protected:
-  std::unique_ptr<GlobalData> ExecuteR2F32Constant(
-      std::initializer_list<std::initializer_list<float>> values,
-      tensorflow::gtl::ArraySlice<int64> minor_to_major) {
-    ComputationBuilder builder(client_, TestName());
-    builder.ConstantR2<float>(values);
-    auto computation = builder.Build().ConsumeValueOrDie();
-    CHECK_EQ(2, minor_to_major.size());
-
-    ExecutionOptions execution_options;
-    *execution_options.mutable_shape_with_output_layout() =
-        ShapeUtil::MakeShapeWithLayout(
-            F32,
-            /*dimensions=*/{static_cast<int64>(values.size()),
-                            static_cast<int64>(values.begin()->size())},
-            minor_to_major);
-    return client_->Execute(computation, {}, &execution_options)
-        .ConsumeValueOrDie();
-  }
-
-  ErrorSpec error_spec_{0.0001};
-};
-
-XLA_TEST_F(InProcessServiceTest, TransferFromServer) {
-  ComputationBuilder builder(client_, TestName());
-  builder.ConstantR1<int32>({1, 42, 5});
-  auto computation = builder.Build().ConsumeValueOrDie();
-
-  auto handle = client_->Execute(computation, {}).ConsumeValueOrDie();
-
-  std::vector<int32> result(3, 0);
-  ASSERT_IS_OK(client_->TransferInProcess(*handle, result.data()));
-  EXPECT_MATCH(result, testing::VectorMatcher<int32>({1, 42, 5}));
-}
-
-XLA_TEST_F(InProcessServiceTest, TransferToServer) {
-  std::vector<float> input{1.0f, 2.0f, -42.0f};
-  Shape shape = ShapeUtil::MakeShape(F32, {3});
-  auto data_handle = client_->TransferToServerInProcess(shape, input.data())
-                         .ConsumeValueOrDie();
-
-  ComputationBuilder builder(client_, TestName());
-  auto param = builder.Parameter(0, ShapeUtil::MakeShape(F32, {3}), "param");
-  builder.Add(param, param);
-
-  ComputeAndCompareR1<float>(&builder, {2.0f, 4.0f, -84.0f},
-                             {data_handle.get()}, error_spec_);
-}
-
-// TODO(b/28506710): This test case seems not to test inprocess
-// methods.
-TEST_F(InProcessServiceTest, GetShape) {
-  ComputationBuilder builder(client_, TestName());
-  builder.ConstantR1<int32>({1, 42, 5});
-  auto computation = builder.Build().ConsumeValueOrDie();
-
-  auto handle = client_->Execute(computation, {}).ConsumeValueOrDie();
-
-  Shape shape = client_->GetShape(*handle).ConsumeValueOrDie();
-  ASSERT_EQ(S32, shape.element_type());
-  ASSERT_EQ(1, ShapeUtil::Rank(shape));
-  ASSERT_EQ(3, shape.dimensions(0));
-}
-
-XLA_TEST_F(InProcessServiceTest, GetShapeOfClientSuppliedArrayRowMajor) {
-  std::vector<float> input{1.0f, 2.0f, 3.0f, 4.0f};
-  Shape shape = ShapeUtil::MakeShape(F32, {2, 2});
-  shape.clear_layout();
-  *shape.mutable_layout() = LayoutUtil::MakeLayout({1, 0});
-  auto handle = client_->TransferToServerInProcess(shape, input.data())
-                    .ConsumeValueOrDie();
-
-  Shape shape_returned = client_->GetShape(*handle).ConsumeValueOrDie();
-  ASSERT_TRUE(ShapeUtil::Equal(shape, shape_returned));
-}
-
-XLA_TEST_F(InProcessServiceTest, GetShapeOfClientSuppliedArrayColMajor) {
-  std::vector<float> input{1.0f, 2.0f, 3.0f, 4.0f};
-  Shape shape = ShapeUtil::MakeShape(F32, {2, 2});
-  shape.clear_layout();
-  *shape.mutable_layout() = LayoutUtil::MakeLayout({0, 1});
-  auto handle = client_->TransferToServerInProcess(shape, input.data())
-                    .ConsumeValueOrDie();
-
-  Shape shape_returned = client_->GetShape(*handle).ConsumeValueOrDie();
-  ASSERT_TRUE(ShapeUtil::Equal(shape, shape_returned));
-}
-
-TEST_F(InProcessServiceTest, TransferToServerNoLayout) {
-  std::vector<float> input{1.0f, 2.0f, -42.0f};
-  Shape shape = ShapeUtil::MakeShape(F32, {3});
-  shape.clear_layout();
-  auto transfer_status =
-      client_->TransferToServerInProcess(shape, input.data());
-  ASSERT_EQ(transfer_status.status().code(),
-            tensorflow::error::INVALID_ARGUMENT);
-}
-
-XLA_TEST_F(InProcessServiceTest, ExecuteRowMajor) {
-  auto handle =
-      ExecuteR2F32Constant({{1.0, 2.0}, {3.0, 4.0}}, /*minor_to_major=*/{1, 0});
-
-  std::vector<float> result(4, 0.0);
-  Shape shape;
-  ASSERT_IS_OK(client_->TransferInProcess(*handle, result.data()));
-
-  EXPECT_MATCH(result, testing::VectorMatcher<float>({1.0, 2.0, 3.0, 4.0}));
-}
-
-XLA_TEST_F(InProcessServiceTest, ExecuteColumnMajor) {
-  auto handle =
-      ExecuteR2F32Constant({{1.0, 2.0}, {3.0, 4.0}}, /*minor_to_major=*/{0, 1});
-
-  std::vector<float> result(4, 0);
-  Shape shape;
-  ASSERT_IS_OK(client_->TransferInProcess(*handle, result.data()));
-
-  EXPECT_MATCH(result, testing::VectorMatcher<float>({1.0, 3.0, 2.0, 4.0}));
-}
-
-XLA_TEST_F(InProcessServiceTest, ExecuteAndReuseDifferentLayouts) {
-  // Create arrays on the server which have different layouts. Verify the
-  // computation still produces the correct results.
-  auto handle_rowmaj =
-      ExecuteR2F32Constant({{1.0, 2.0}, {3.0, 4.0}}, /*minor_to_major=*/{1, 0});
-
-  auto handle_colmaj = ExecuteR2F32Constant({{10.0, 20.0}, {30.0, 40.0}},
-                                            /*minor_to_major=*/{0, 1});
-
-  ComputationBuilder builder(client_, TestName());
-  auto param0 =
-      builder.Parameter(0, ShapeUtil::MakeShape(F32, {2, 2}), "param0");
-  auto param1 =
-      builder.Parameter(1, ShapeUtil::MakeShape(F32, {2, 2}), "param1");
-  builder.Add(param0, param1);
-
-  Array2D<float> expected({{11.0, 22.0}, {33.0, 44.0}});
-  ComputeAndCompareR2<float>(&builder, expected,
-                             {handle_rowmaj.get(), handle_colmaj.get()},
-                             error_spec_);
-}
-
-}  // namespace
-}  // namespace xla
-
-int main(int argc, char** argv) {
-  std::vector<tensorflow::Flag> flag_list;
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
-  xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
-  const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
-  if (!parse_result) {
-    LOG(ERROR) << "\n" << usage;
-    return 2;
-  }
-  testing::InitGoogleTest(&argc, argv);
-  if (argc > 1) {
-    LOG(ERROR) << "Unknown argument " << argv[1] << "\n" << usage;
-    return 2;
-  }
-  return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/compiler/xla/tests/literal_test_util.cc b/tensorflow/compiler/xla/tests/literal_test_util.cc
index f7bbc0f38bb..eb979ad189d 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util.cc
+++ b/tensorflow/compiler/xla/tests/literal_test_util.cc
@@ -24,7 +24,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/io/path.h"
@@ -76,11 +76,11 @@ string Hostname() {
 // between the left-hand-side and right-hand-side, by bit-casting to UnsignedT
 // -- on miscompare, a nice error message is given in the AssertionFailure.
 template <typename FloatT, typename UnsignedT>
-testing::AssertionResult CompareFloatsBitwiseEqual(FloatT lhs, FloatT rhs) {
+::testing::AssertionResult CompareFloatsBitwiseEqual(FloatT lhs, FloatT rhs) {
   auto ulhs = tensorflow::bit_cast<UnsignedT>(lhs);
   auto urhs = tensorflow::bit_cast<UnsignedT>(rhs);
   if (ulhs != urhs) {
-    return testing::AssertionFailure() << tensorflow::strings::Printf(
+    return ::testing::AssertionFailure() << tensorflow::strings::Printf(
               "floating values are not bitwise-equal; and equality testing "
               "was requested: %s=%g=%a vs %s=%g=%a",
               tensorflow::strings::StrCat(tensorflow::strings::Hex(ulhs))
@@ -90,33 +90,33 @@ testing::AssertionResult CompareFloatsBitwiseEqual(FloatT lhs, FloatT rhs) {
                   .c_str(),
               rhs, rhs);
   }
-  return testing::AssertionSuccess();
+  return ::testing::AssertionSuccess();
 }
 
 // Templated comparator that specializes for float equality comparison with the
 // bitwise helper above (this is the un-specialized fallback, to just use the
 // default gunit implementation).
 template <typename NativeT>
-testing::AssertionResult CompareEqual(NativeT lhs, NativeT rhs) {
+::testing::AssertionResult CompareEqual(NativeT lhs, NativeT rhs) {
   if (lhs == rhs) {
-    return testing::AssertionSuccess();
+    return ::testing::AssertionSuccess();
   }
   ::testing::Message msg;
   msg << "Expected equality of these values:";
   msg << "\n  " << lhs;
   msg << "\n  " << rhs;
-  return testing::AssertionFailure() << msg;
+  return ::testing::AssertionFailure() << msg;
 }
 
 // Specializations for floating types that do bitwise comparisons when equality
 // comparison is requested.
 template <>
-testing::AssertionResult CompareEqual<float>(float lhs, float rhs) {
+::testing::AssertionResult CompareEqual<float>(float lhs, float rhs) {
   return CompareFloatsBitwiseEqual(lhs, rhs);
 }
 
 template <>
-testing::AssertionResult CompareEqual<double>(double lhs, double rhs) {
+::testing::AssertionResult CompareEqual<double>(double lhs, double rhs) {
   return CompareFloatsBitwiseEqual(lhs, rhs);
 }
 
@@ -130,7 +130,7 @@ bool ExpectLiteralsEqual(const Literal& expected, const Literal& actual,
   if (dimension == expected.shape().dimensions_size()) {
     NativeT expected_value = LiteralUtil::Get<NativeT>(expected, multi_index);
     NativeT actual_value = LiteralUtil::Get<NativeT>(actual, multi_index);
-    testing::AssertionResult result =
+    ::testing::AssertionResult result =
         CompareEqual(expected_value, actual_value);
     return result;  // Defines implicit coersion to bool.
   }
@@ -159,7 +159,7 @@ bool ExpectLiteralsEqual(const Literal& expected, const Literal& actual,
   EXPECT_FALSE(Equal(expected, actual));
 }
 
-/* static */ testing::AssertionResult LiteralTestUtil::Equal(
+/* static */ ::testing::AssertionResult LiteralTestUtil::Equal(
     const Literal& expected, const Literal& actual) {
   VLOG(1) << "expected: " << LiteralUtil::ToString(expected);
   VLOG(1) << "actual:   " << LiteralUtil::ToString(actual);
@@ -207,9 +207,9 @@ bool ExpectLiteralsEqual(const Literal& expected, const Literal& actual,
         << "Unsupported primitive type in LiteralTestUtil::ExpectEqual: "
         << PrimitiveType_Name(expected.shape().element_type());
   }
-  testing::AssertionResult result = testing::AssertionSuccess();
+  ::testing::AssertionResult result = ::testing::AssertionSuccess();
   if (!match) {
-    result = testing::AssertionFailure()
+    result = ::testing::AssertionFailure()
              << "expected: " << LiteralUtil::ToString(expected)
              << "\nactual:   " << LiteralUtil::ToString(actual);
     VLOG(1) << result.message();
@@ -262,7 +262,7 @@ class NearComparator {
     max_abs_err_ = 0.0;
     *miscompares_.mutable_shape() =
         ShapeUtil::ChangeElementType(actual.shape(), PRED);
-    miscompares_.mutable_preds()->Resize(
+    miscompares_.mutable_preds()->resize(
         ShapeUtil::ElementsIn(miscompares_.shape()), false);
     multi_index_.resize(expected.shape().dimensions_size(), 0);
 
@@ -314,7 +314,7 @@ class NearComparator {
 
  private:
   // EXPECTs that the two given scalar values are within the error bound. Keeps
-  // track of how many mismatches have occured to keep the size of the output
+  // track of how many mismatches have occurred to keep the size of the output
   // manageable.
   template <typename NativeT>
   bool ExpectValuesNear(NativeT expected, NativeT actual) {
@@ -389,7 +389,7 @@ class NearComparator {
         tensorflow::strings::Printf("tempfile-%s-%llx-%s", Hostname().c_str(),
                                     now_usec, name.c_str()));
     TF_CHECK_OK(tensorflow::WriteBinaryProto(tensorflow::Env::Default(),
-                                             filename, literal));
+                                             filename, literal.ToProto()));
     LOG(ERROR) << "wrote to " << name << " file: " << filename;
   }
 
@@ -421,12 +421,12 @@ class NearComparator {
 
 }  // namespace
 
-/* static */ testing::AssertionResult LiteralTestUtil::Near(
+/* static */ ::testing::AssertionResult LiteralTestUtil::Near(
     const Literal& expected, const Literal& actual, const ErrorSpec& error) {
   NearComparator comparator(error);
   return comparator.ExpectNear(expected, actual)
-             ? testing::AssertionSuccess()
-             : testing::AssertionFailure() << "values were not near";
+             ? ::testing::AssertionSuccess()
+             : ::testing::AssertionFailure() << "values were not near";
 }
 
 /* static */ void LiteralTestUtil::ExpectNear(const Literal& expected,
@@ -435,14 +435,14 @@ class NearComparator {
   EXPECT_TRUE(Near(expected, actual, error));
 }
 
-/* static */ testing::AssertionResult LiteralTestUtil::NearTuple(
+/* static */ ::testing::AssertionResult LiteralTestUtil::NearTuple(
    const Literal& expected, const Literal& actual, const ErrorSpec& error) {
   VLOG(1) << "expected: " << LiteralUtil::ToString(expected);
   VLOG(1) << "actual:   " << LiteralUtil::ToString(actual);
 
   if (!ShapeUtil::IsTuple(expected.shape()) ||
       !ShapeUtil::IsTuple(actual.shape())) {
-    return testing::AssertionFailure()
+    return ::testing::AssertionFailure()
            << "tuples expected expected shape = "
           << expected.shape().ShortDebugString()
            << " actual shape = " << actual.shape().ShortDebugString();
@@ -469,7 +469,7 @@ class NearComparator {
     }
   }
 
-  return testing::AssertionSuccess();
+  return ::testing::AssertionSuccess();
 }
 
 /* static */ void LiteralTestUtil::ExpectNearTuple(const Literal& expected,
diff --git a/tensorflow/compiler/xla/tests/literal_test_util.h b/tensorflow/compiler/xla/tests/literal_test_util.h
index 85656a53e44..a8b07a2c5d1 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util.h
+++ b/tensorflow/compiler/xla/tests/literal_test_util.h
@@ -18,15 +18,18 @@ limitations under the License.
 
 #include <initializer_list>
 #include <memory>
+#include <random>
 #include <string>
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array3d.h"
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/test.h"
@@ -57,7 +60,7 @@ class LiteralTestUtil {
   // Asserts that the expected and actual literals are (bitwise) equal for all
   // elements in the literal. Also, asserts that the rank, dimensions sizes, and
   // primitive type are equal.
-  static testing::AssertionResult Equal(
+  static ::testing::AssertionResult Equal(
      const Literal& expected, const Literal& actual) TF_MUST_USE_RESULT;
 
   // Expects that expected and actual are Equal.
@@ -101,7 +104,7 @@ class LiteralTestUtil {
   // Asserts that the expected and actual literals are within the given error
   // bound for all elements. Also, asserts that the rank, dimensions sizes, and
   // bounds are equivalent. Only supported for floating point values.
-  static testing::AssertionResult Near(
+  static ::testing::AssertionResult Near(
       const Literal& expected, const Literal& actual,
       const ErrorSpec& error) TF_MUST_USE_RESULT;
 
@@ -147,7 +150,7 @@ class LiteralTestUtil {
   // tuples are within the given error bound. Tuples are matched recursively.
   // If the elements of the tuple are not floating-point types, the error spec
   // is ignored and exact equality is checked.
-  static testing::AssertionResult NearTuple(
+  static ::testing::AssertionResult NearTuple(
      const Literal& expected, const Literal& actual,
      const ErrorSpec& error) TF_MUST_USE_RESULT;
 
@@ -170,6 +173,36 @@ class LiteralTestUtil {
       tensorflow::gtl::ArraySlice<int64> minor_to_major,
       const Literal& literal);
 
+  // Creates a literal with the supplied shape, and uses the provided value
+  // generator to populate the literal's values.
+  // Returns the new literal object, or an error Status if failed.
+  template <
+      PrimitiveType type,
+      typename T = typename primitive_util::PrimitiveTypeToNative<type>::type>
+  static StatusOr<std::unique_ptr<Literal>> CreateRandomLiteral(
+      const Shape& shape,
+      const std::function<T(tensorflow::gtl::ArraySlice<int64>)>& generator);
+
+  // Creates a literal with the supplied shape, and initializes the literal
+  // values using a normal distribution with given mean and stddev standard
+  // deviation, and using the engine as entropy generator.
+  // Returns the new literal object, or an error Status if failed.
+  template <
+      PrimitiveType type, typename E,
+      typename T = typename primitive_util::PrimitiveTypeToNative<type>::type>
+  static StatusOr<std::unique_ptr<Literal>> CreateRandomLiteral(
+      const Shape& shape, E* engine, T mean, T stddev);
+
+  // Creates a literal with the supplied shape, and initializes the literal
+  // values using a normal distribution with given mean and stddev standard
+  // deviation.
+  // Returns the new literal object, or an error Status if failed.
+  template <
+      PrimitiveType type,
+      typename T = typename primitive_util::PrimitiveTypeToNative<type>::type>
+  static StatusOr<std::unique_ptr<Literal>> CreateRandomLiteral(
+      const Shape& shape, T mean, T stddev);
+
  private:
   TF_DISALLOW_COPY_AND_ASSIGN(LiteralTestUtil);
 };
 
@@ -269,6 +302,40 @@ template <typename NativeT>
   ExpectNear(*LiteralUtil::CreateR4FromArray4D<NativeT>(expected), actual,
              error);
 }
 
+template <PrimitiveType type, typename T>
+/* static */ StatusOr<std::unique_ptr<Literal>>
+LiteralTestUtil::CreateRandomLiteral(
+    const Shape& shape,
+    const std::function<T(tensorflow::gtl::ArraySlice<int64>)>& generator) {
+  using NativeT = typename primitive_util::PrimitiveTypeToNative<type>::type;
+  TF_RET_CHECK(shape.element_type() == type);
+  std::unique_ptr<Literal> literal = LiteralUtil::CreateFromShape(shape);
+  TF_RETURN_IF_ERROR(LiteralUtil::Populate<NativeT>(
+      literal.get(), [&](tensorflow::gtl::ArraySlice<int64> indexes) {
+        return generator(indexes);
+      }));
+  return std::move(literal);
+}
+
+template <PrimitiveType type, typename E, typename T>
+/* static */ StatusOr<std::unique_ptr<Literal>>
+LiteralTestUtil::CreateRandomLiteral(const Shape& shape, E* engine, T mean,
+                                     T stddev) {
+  using NativeT = typename primitive_util::PrimitiveTypeToNative<type>::type;
+  std::normal_distribution<NativeT> generator(mean, stddev);
+  return CreateRandomLiteral<type>(
+      shape, [&](tensorflow::gtl::ArraySlice<int64> /*indexes*/) {
+        return generator(*engine);
+      });
+}
+
+template <PrimitiveType type, typename T>
+/* static */ StatusOr<std::unique_ptr<Literal>>
+LiteralTestUtil::CreateRandomLiteral(const Shape& shape, T mean, T stddev) {
+  std::minstd_rand0 engine;
+  return CreateRandomLiteral<type>(shape, &engine, mean, stddev);
+}
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_TESTS_LITERAL_TEST_UTIL_H_
diff --git a/tensorflow/compiler/xla/tests/literal_test_util_test.cc b/tensorflow/compiler/xla/tests/literal_test_util_test.cc
index fdec11c0e98..a94f45f73b7 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util_test.cc
+++ b/tensorflow/compiler/xla/tests/literal_test_util_test.cc
@@ -83,9 +83,10 @@ TEST(LiteralTestUtilTest, ExpectNearFailurePlacesResultsInTemporaryDirectory) {
   LOG(INFO) << "results: [" << tensorflow::str_util::Join(results, ", ") << "]";
   EXPECT_EQ(3, results.size());
   for (const string& result : results) {
-    Literal literal;
+    LiteralProto literal_proto;
     TF_CHECK_OK(tensorflow::ReadBinaryProto(tensorflow::Env::Default(), result,
-                                            &literal));
+                                            &literal_proto));
+    Literal literal(literal_proto);
     if (result.find("expected") != string::npos) {
       EXPECT_EQ("2", LiteralUtil::ToString(literal));
     } else if (result.find("actual") != string::npos) {
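Typical usage of the new `CreateRandomLiteral` overloads in a test would look like the sketch below; the shape and distribution parameters are illustrative:

```c++
// Draw a 2x3 F32 literal from N(0, 1) using the default-engine overload.
TF_ASSIGN_OR_ASSERT_OK(
    std::unique_ptr<Literal> literal,
    LiteralTestUtil::CreateRandomLiteral<F32>(
        ShapeUtil::MakeShape(F32, {2, 3}), /*mean=*/0.0f, /*stddev=*/1.0f));
```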
a/tensorflow/compiler/xla/tests/local_client_aot_test.cc +++ b/tensorflow/compiler/xla/tests/local_client_aot_test.cc @@ -44,9 +44,8 @@ TEST_F(LocalClientAotTest, Constant) { OpaqueData opaque_data{100, 20, 3}; void* parameters[] = {&opaque_data}; float out = 0; - float tmp1 = 0; - float tmp2 = 0; - void* temporary_buffers[] = {&out, &tmp1, &tmp2, nullptr}; + char tmp[20] = {0}; + void* temporary_buffers[] = {&out, nullptr, &tmp}; SumAndDouble(&out, &run_options, parameters, temporary_buffers); EXPECT_EQ(out, 246.0f); diff --git a/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc b/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc index eed51bd6ad4..52816dc72cc 100644 --- a/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc +++ b/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc @@ -42,7 +42,7 @@ xla::Computation Doubler(xla::Client* client) { int main(int argc, char** argv) { tensorflow::port::InitMain(argv[0], &argc, &argv); - auto client = xla::ClientLibrary::LocalClientOrDie(); + auto client = xla::ClientLibrary::GetOrCreateCompileOnlyClient().ValueOrDie(); xla::ComputationBuilder builder(client, "aot_test_helper"); auto opaque_shape = xla::ShapeUtil::MakeOpaqueShape(); @@ -74,7 +74,7 @@ int main(int argc, char** argv) { llvm::Triple triple(xla::llvm_ir::AsStringRef(triple_string)); xla::Computation computation = builder.Build().ConsumeValueOrDie(); - xla::LocalClient::AheadOfTimeComputationInstance instance{ + xla::CompileOnlyClient::AotComputationInstance instance{ &computation, /*argument_layouts=*/{&opaque_shape}, &r0f32}; xla::cpu::CpuAotCompilationOptions options( @@ -89,11 +89,10 @@ int main(int argc, char** argv) { // It's lame to hard-code the buffer assignments, but we need // local_client_aot_test.cc to be able to easily invoke the function. CHECK_EQ(result->result_buffer_index(), 0); - CHECK_EQ(result->buffer_sizes().size(), 4); + CHECK_EQ(result->buffer_sizes().size(), 3); CHECK_EQ(result->buffer_sizes()[0], sizeof(float)); // result buffer - CHECK_EQ(result->buffer_sizes()[1], sizeof(float)); // temp buffer - CHECK_EQ(result->buffer_sizes()[2], sizeof(float)); // temp buffer - CHECK_EQ(result->buffer_sizes()[3], -1); // param buffer + CHECK_EQ(result->buffer_sizes()[1], -1); // param buffer + CHECK_EQ(result->buffer_sizes()[2], 20); // temp buffer if (triple.isOSBinFormatELF()) { // Check the ELF magic. CHECK_EQ(result->object_file_data()[0], 0x7F); diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.cc b/tensorflow/compiler/xla/tests/local_client_test_base.cc index 5c32ed88955..49207356e30 100644 --- a/tensorflow/compiler/xla/tests/local_client_test_base.cc +++ b/tensorflow/compiler/xla/tests/local_client_test_base.cc @@ -17,12 +17,19 @@ limitations under the License. 
diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.cc b/tensorflow/compiler/xla/tests/local_client_test_base.cc
index 5c32ed88955..49207356e30 100644
--- a/tensorflow/compiler/xla/tests/local_client_test_base.cc
+++ b/tensorflow/compiler/xla/tests/local_client_test_base.cc
@@ -17,12 +17,19 @@ limitations under the License.
 
 #include <vector>
 
+#define EIGEN_USE_THREADS
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/core/common_runtime/eigen_thread_pool.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
@@ -91,16 +98,34 @@ int64 TestAllocator::deallocation_count(int device_ordinal) const {
   return allocator_;
 }
 
+// Define this in .cc file to avoid having to include eigen or forward declare
+// these types in the header.
+struct LocalClientTestBase::EigenThreadPoolWrapper {
+  explicit EigenThreadPoolWrapper()
+      : pool(new tensorflow::thread::ThreadPool(
+            tensorflow::Env::Default(), "XLAEigenTest", /*num_threads=*/2)),
+        wrapper(new tensorflow::EigenThreadPoolWrapper(pool.get())),
+        device(new Eigen::ThreadPoolDevice(wrapper.get(),
+                                           wrapper->NumThreads())) {}
+
+  std::unique_ptr<tensorflow::thread::ThreadPool> pool;
+  std::unique_ptr<tensorflow::EigenThreadPoolWrapper> wrapper;
+  std::unique_ptr<Eigen::ThreadPoolDevice> device;
+};
+
 LocalClientTestBase::LocalClientTestBase(
     perftools::gputools::Platform* platform)
     : local_client_(
-          ClientLibrary::GetOrCreateLocalClient(platform).ValueOrDie()) {
+          ClientLibrary::GetOrCreateLocalClient(platform).ValueOrDie()),
+      thread_pool_wrapper_(new EigenThreadPoolWrapper()) {
   stream_executor_ = PlatformUtil::GetStreamExecutors(local_client_->platform())
                          .ValueOrDie()[local_client_->default_device_ordinal()];
   transfer_manager_ =
       TransferManager::GetForPlatform(local_client_->platform()).ValueOrDie();
 }
 
+LocalClientTestBase::~LocalClientTestBase() {}
+
 std::unique_ptr<ScopedShapedBuffer>
 LocalClientTestBase::LiteralToScopedShapedBuffer(const Literal& literal) {
   return LiteralToScopedShapedBuffer(literal,
@@ -166,55 +191,72 @@ LocalClientTestBase::ShapedBufferToScopedShapedBuffer(
   }
   *scoped_buffer->mutable_buffers() = shaped_buffer->buffers();
-  TF_CHECK_OK(
-      scoped_buffer->mutable_shape_index_to_buffer_entry()
-          ->ForEachMutableElement(
-              [&shaped_buffer](const ShapeIndex& index, bool is_leaf,
-                               size_t* buffer_entry) -> ::tensorflow::Status {
-                if (is_leaf) {
-                  *buffer_entry =
-                      shaped_buffer->shape_index_to_buffer_entry().element(
-                          index);
-                }
-                return tensorflow::Status::OK();
-              }));
+  scoped_buffer->mutable_shape_index_to_buffer_entry()->ForEachMutableElement(
+      [&shaped_buffer](const ShapeIndex& index, size_t* buffer_entry) {
+        if (ShapeUtil::IsLeafIndex(shaped_buffer->shape(), index)) {
+          *buffer_entry =
+              shaped_buffer->shape_index_to_buffer_entry().element(index);
+        }
+      });
   return scoped_buffer;
 }
 
-LocalExecuteOptions LocalClientTestBase::DefaultLocalExecuteOptions() const {
-  return LocalExecuteOptions().set_allocator(
-      GetOrCreateAllocator(local_client_->platform()));
+ExecutableBuildOptions LocalClientTestBase::DefaultExecutableBuildOptions()
+    const {
+  return ExecutableBuildOptions();
 }
 
-std::unique_ptr<ScopedShapedBuffer> LocalClientTestBase::ExecuteLocally(
+ExecutableRunOptions LocalClientTestBase::DefaultExecutableRunOptions() const {
+  ExecutableRunOptions run_options;
+  run_options.set_inter_op_thread_pool(
+      local_client_->backend().inter_op_thread_pool());
+  run_options.set_intra_op_thread_pool(thread_pool_wrapper_->device.get());
+  run_options.set_allocator(GetOrCreateAllocator(local_client_->platform()));
+
+  return run_options;
+}
+
+std::unique_ptr<ScopedShapedBuffer> LocalClientTestBase::ExecuteLocallyOrDie(
     const Computation& computation,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
-  return ExecuteLocally(computation, arguments, DefaultLocalExecuteOptions());
+  return ExecuteLocally(computation, arguments, DefaultExecutableBuildOptions(),
+                        DefaultExecutableRunOptions())
+      .ConsumeValueOrDie();
 }
 
-std::unique_ptr<ScopedShapedBuffer> LocalClientTestBase::ExecuteLocally(
+std::unique_ptr<ScopedShapedBuffer> LocalClientTestBase::ExecuteLocallyOrDie(
     const Computation& computation,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-    const LocalExecuteOptions& options) {
-  return ShapedBufferToScopedShapedBuffer(
-      local_client_->ExecuteLocally(computation, arguments, options)
-          .ConsumeValueOrDie(),
-      options.allocator());
+    const ExecutableBuildOptions& build_options,
+    const ExecutableRunOptions& run_options) {
+  return ExecuteLocally(computation, arguments, build_options, run_options)
+      .ConsumeValueOrDie();
 }
 
-void LocalClientTestBase::ExecuteLocally(
+StatusOr<std::unique_ptr<ScopedShapedBuffer>>
+LocalClientTestBase::ExecuteLocally(
     const Computation& computation,
-    tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-    ShapedBuffer* result) {
-  ExecuteLocally(computation, arguments, DefaultLocalExecuteOptions(), result);
+    tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
+  return ExecuteLocally(computation, arguments, DefaultExecutableBuildOptions(),
+                        DefaultExecutableRunOptions());
 }
 
-void LocalClientTestBase::ExecuteLocally(
+StatusOr<std::unique_ptr<ScopedShapedBuffer>>
+LocalClientTestBase::ExecuteLocally(
     const Computation& computation,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-    const LocalExecuteOptions& options, ShapedBuffer* result) {
-  ASSERT_IS_OK(
-      local_client_->ExecuteLocally(computation, arguments, options, result));
+    const ExecutableBuildOptions& build_options,
+    const ExecutableRunOptions& run_options) {
+  std::vector<const Shape*> argument_layouts(arguments.size());
+  for (int i = 0; i < arguments.size(); ++i) {
+    argument_layouts[i] = &arguments[i]->shape();
+  }
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<LocalExecutable> executable,
+      local_client_->Compile(computation, argument_layouts, build_options));
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<ShapedBuffer> buffer,
+                      executable->Run(arguments, run_options));
+  return ShapedBufferToScopedShapedBuffer(std::move(buffer),
+                                          run_options.allocator());
 }
 
 }  // namespace xla
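// A sketch of the compile-then-run flow these helpers wrap, assuming a test
// fixture deriving from LocalClientTestBase (names as declared in the header
// below): compilation happens once inside ExecuteLocally, and the run options
// carry the allocator and the Eigen intra-op thread pool set up above.
ComputationBuilder builder(local_client_, "add_one");
builder.Add(builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x"),
            builder.ConstantR0<float>(1.0f));
Computation computation = builder.Build().ConsumeValueOrDie();

std::unique_ptr<ScopedShapedBuffer> arg =
    LiteralToScopedShapedBuffer(*LiteralUtil::CreateR0<float>(41.0f));
std::unique_ptr<ScopedShapedBuffer> result =
    ExecuteLocallyOrDie(computation, {arg.get()});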
diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.h b/tensorflow/compiler/xla/tests/local_client_test_base.h
index 62916d50e3c..e3c3bb46cf2 100644
--- a/tensorflow/compiler/xla/tests/local_client_test_base.h
+++ b/tensorflow/compiler/xla/tests/local_client_test_base.h
@@ -74,8 +74,10 @@ class TestAllocator : public StreamExecutorMemoryAllocator {
 // A base class for tests which exercise the LocalClient interface.
 class LocalClientTestBase : public ::testing::Test {
  protected:
+  struct EigenThreadPoolWrapper;
   explicit LocalClientTestBase(
       perftools::gputools::Platform* platform = nullptr);
+  virtual ~LocalClientTestBase();
 
   static TestAllocator* GetOrCreateAllocator(
       perftools::gputools::Platform* platform);
@@ -99,27 +101,30 @@ class LocalClientTestBase : public ::testing::Test {
   // Execute the given computation on the local client. With and without
   // options.
-  std::unique_ptr<ScopedShapedBuffer> ExecuteLocally(
+  StatusOr<std::unique_ptr<ScopedShapedBuffer>> ExecuteLocally(
       const Computation& computation,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments);
-  std::unique_ptr<ScopedShapedBuffer> ExecuteLocally(
+  StatusOr<std::unique_ptr<ScopedShapedBuffer>> ExecuteLocally(
       const Computation& computation,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-      const LocalExecuteOptions& options);
+      const ExecutableBuildOptions& build_options,
+      const ExecutableRunOptions& run_options);
+
+  std::unique_ptr<ScopedShapedBuffer> ExecuteLocallyOrDie(
+      const Computation& computation,
+      tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments);
+  std::unique_ptr<ScopedShapedBuffer> ExecuteLocallyOrDie(
+      const Computation& computation,
+      tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
+      const ExecutableBuildOptions& build_options,
+      const ExecutableRunOptions& run_options);
+
+  // Returns a default set of execute options.
+  ExecutableBuildOptions DefaultExecutableBuildOptions() const;
 
   // Returns a default set of execute options, configured to use allocator_
   // as the allocator.
-  LocalExecuteOptions DefaultLocalExecuteOptions() const;
-
-  // Overloads which write result into the given buffer.
-  void ExecuteLocally(
-      const Computation& computation,
-      tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-      ShapedBuffer* result);
-  void ExecuteLocally(
-      const Computation& computation,
-      tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-      const LocalExecuteOptions& options, ShapedBuffer* result);
+  ExecutableRunOptions DefaultExecutableRunOptions() const;
 
   // Convert a ShapedBuffer into a ScopedShaped buffer so that all buffers are
   // deallocated when the object is destructed.
@@ -139,6 +144,8 @@ class LocalClientTestBase : public ::testing::Test {
 
   TransferManager* transfer_manager_;
   LocalClient* local_client_;
+
+  std::unique_ptr<EigenThreadPoolWrapper> thread_pool_wrapper_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/log_test.cc b/tensorflow/compiler/xla/tests/log_test.cc
index b520d89de3c..796f43ea4ed 100644
--- a/tensorflow/compiler/xla/tests/log_test.cc
+++ b/tensorflow/compiler/xla/tests/log_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
+#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
@@ -47,6 +48,7 @@ TEST_F(LogTest, LogTenValues) {
   builder.Log(x);
 
   std::vector<float> expected;
+  expected.reserve(input.size());
   for (float f : input) {
     expected.push_back(std::log(f));
   }
@@ -59,6 +61,7 @@ TEST_F(LogTest, LogTenValues) {
 
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
+  xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
   xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
diff --git a/tensorflow/compiler/xla/tests/map_test.cc b/tensorflow/compiler/xla/tests/map_test.cc
index 014417a2057..e4dbd6864a3 100644
--- a/tensorflow/compiler/xla/tests/map_test.cc
+++ b/tensorflow/compiler/xla/tests/map_test.cc
@@ -22,18 +22,18 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" #include "tensorflow/compiler/xla/tests/test_utils.h" -#include "tensorflow/compiler/xla/xla.pb.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" -#include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" namespace xla { @@ -42,8 +42,10 @@ namespace { class MapTest : public ClientLibraryTestBase { public: explicit MapTest(perftools::gputools::Platform* platform = nullptr) - : ClientLibraryTestBase(platform, - /*disabled_pass_names=*/{"algsimp", "inline"}) {} + : ClientLibraryTestBase(platform) { + mutable_debug_options()->add_xla_disable_hlo_passes("algsimp"); + mutable_debug_options()->add_xla_disable_hlo_passes("inline"); + } // Creates a function that adds its scalar argument with the constant 1.0. // @@ -100,8 +102,8 @@ class MapTest : public ClientLibraryTestBase { // Creates a function that adds its scalar argument with the constant 1.0 and // then multiplies by the original element. // - // /---------------\ - // / \ + // /------------------| + // / | // x {R0F32} ----> (add) ----> (mul) // / // 1.0f ---------/ @@ -147,8 +149,8 @@ class MapTest : public ClientLibraryTestBase { // Creates a function that adds three scalar arguments // - // x {R0F32} ----\ - // \ + // x {R0F32} -------| + // | // y {R0F32} ----> (add) ---> (add) // / // z {R0F32} ---------------/ @@ -529,9 +531,9 @@ TEST_F(MapTest, MapOperantionWithBuildError) { StatusOr computation_status = builder.Build(); ASSERT_TRUE(!computation_status.ok()); - EXPECT_MATCH(computation_status.status().ToString(), - testing::HasSubstr("error from: ErrorAdd: binary op with " - "different element types: f32[] and u16[]")); + EXPECT_THAT(computation_status.status().ToString(), + ::testing::HasSubstr("error from: ErrorAdd: binary op with " + "different element types: f32[] and u16[]")); } // MapTest disables inline and algsimp. MapTestWithFullOpt runs all @@ -568,12 +570,60 @@ TEST_F(MapTestWithFullOpt, MapScalarPower) { ErrorSpec(0.01f)); } +// Regression test for b/35786417, where the inliner would not notice the change +// of parameter order inside the map. 
+TEST_F(MapTestWithFullOpt, MapSubtractOppositeOrder) {
+  ComputationBuilder builder(client_, TestName());
+
+  auto sub_builder = builder.CreateSubBuilder("power");
+  auto x = sub_builder->Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
+  auto y = sub_builder->Parameter(1, ShapeUtil::MakeShape(F32, {}), "y");
+  sub_builder->Sub(y, x);  // note that this is y - x, not x - y
+  auto sub_opposite = sub_builder->BuildAndNoteError();
+
+  std::unique_ptr<Literal> param0_literal = LiteralUtil::CreateR0<float>(2.0f);
+  std::unique_ptr<Literal> param1_literal = LiteralUtil::CreateR0<float>(5.0f);
+  std::unique_ptr<GlobalData> param0_data =
+      client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
+  std::unique_ptr<GlobalData> param1_data =
+      client_->TransferToServer(*param1_literal).ConsumeValueOrDie();
+
+  auto param0 = builder.Parameter(0, param0_literal->shape(), "param0");
+  auto param1 = builder.Parameter(1, param1_literal->shape(), "param1");
+  builder.Map({param0, param1}, sub_opposite);
+
+  ComputeAndCompareR0<float>(
+      &builder, 3.0f, {param0_data.get(), param1_data.get()}, ErrorSpec(0.01f));
+}
+
+// Regression test for b/35786417, where the inliner would CHECK-fail due to
+// the mul inside the map having more parameters than the map does.
+TEST_F(MapTestWithFullOpt, MapSquare) {
+  ComputationBuilder builder(client_, TestName());
+
+  auto sub_builder = builder.CreateSubBuilder("power");
+  auto x = sub_builder->Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
+  sub_builder->Mul(x, x);
+  auto square = sub_builder->BuildAndNoteError();
+
+  std::unique_ptr<Literal> param0_literal = LiteralUtil::CreateR0<float>(10.0f);
+  std::unique_ptr<GlobalData> param0_data =
+      client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
+
+  auto param0 = builder.Parameter(0, param0_literal->shape(), "param0");
+  builder.Map({param0}, square);
+
+  ComputeAndCompareR0<float>(&builder, 100.0f, {param0_data.get()},
+                             ErrorSpec(0.01f));
+}
+
 }  // namespace
 }  // namespace xla
 
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
+  xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
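// What the two regression tests above pin down, in one line each: after
// inlining, Map({param0, param1}, f) with f(x, y) = y - x must still compute
// param1 - param0 (5 - 2 == 3, not 2 - 5 == -3), and a subcomputation that
// uses its single parameter twice (x * x) must not trip the inliner's
// parameter-count check.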
diff --git a/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc b/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc
index 8aa40294406..51261f0ac1c 100644
--- a/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc
+++ b/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
+#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
@@ -158,11 +159,65 @@ TEST_F(MatOpsSimpleTest, Max32x8Linspace) { TestLinspaceMax(32, 8); }
 
 TEST_F(MatOpsSimpleTest, Max64x8Linspace) { TestLinspaceMax(64, 8); }
 
+class MatOpsDotAddTest
+    : public ClientLibraryTestBase,
+      public ::testing::WithParamInterface<std::tuple<bool, bool>> {};
+
+TEST_P(MatOpsDotAddTest, Dot_Add_2x2_2x2) {
+  bool row_major = std::get<0>(GetParam());
+  bool add_lhs = std::get<1>(GetParam());
+  Array2D<float> lhs({{1.0, 2.0}, {3.0, 4.0}});
+  Array2D<float> rhs({{10.0, 11.0}, {12.0, 13.0}});
+
+  auto minor_to_major = [](bool row_major) -> std::vector<int64> {
+    return {row_major ? 1 : 0, row_major ? 0 : 1};
+  };
+
+  auto prim_type = primitive_util::NativeToPrimitiveType<float>();
+  Shape lhs_shape =
+      ShapeUtil::MakeShape(prim_type, {lhs.height(), lhs.width()});
+  Shape rhs_shape =
+      ShapeUtil::MakeShape(prim_type, {rhs.height(), rhs.width()});
+
+  TF_ASSIGN_OR_ASSERT_OK(
+      auto lhs_handle,
+      client_->TransferToServer(
+          *LiteralUtil::CreateR2FromArray2DWithLayout(
+              lhs, LayoutUtil::MakeLayout(minor_to_major(row_major)))));
+  TF_ASSIGN_OR_ASSERT_OK(
+      auto rhs_handle,
+      client_->TransferToServer(
+          *LiteralUtil::CreateR2FromArray2DWithLayout(
+              rhs, LayoutUtil::MakeLayout(minor_to_major(row_major)))));
+
+  ComputationBuilder builder(client_, TestName());
+  auto lhs_arg = builder.Parameter(0, lhs_shape, "lhs");
+  auto rhs_arg = builder.Parameter(1, rhs_shape, "rhs");
+  auto result = builder.Dot(lhs_arg, rhs_arg);
+  Array2D<float> expected;
+  if (add_lhs) {
+    result = builder.Add(result, lhs_arg);
+    expected = Array2D<float>({{35, 39}, {81, 89}});
+  } else {
+    result = builder.Add(result, rhs_arg);
+    expected = Array2D<float>({{44, 48}, {90, 98}});
+  }
+
+  ComputeAndCompareR2<float>(&builder, expected,
+                             {lhs_handle.get(), rhs_handle.get()},
+                             ErrorSpec(1e-6));
+}
+
+INSTANTIATE_TEST_CASE_P(MatOpsDotAddTestInstances, MatOpsDotAddTest,
+                        ::testing::Combine(::testing::Bool(),
+                                           ::testing::Bool()));
+
 }  // namespace
 }  // namespace xla
 
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
+  xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
   xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
diff --git a/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc b/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc
index 2cd680399b3..4929e25c580 100644
--- a/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc
+++ b/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" @@ -36,7 +37,7 @@ XLA_TEST_F(SliceTest, Slice2D) { ComputationBuilder builder(client_, "slice_2d"); auto original = builder.ConstantR2( {{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}, {7.0, 8.0, 9.0}, {10.0, 11.0, 12.0}}); - builder.Slice(original, {2, 1}, {4, 3}); + builder.Slice(original, {2, 1}, {4, 3}, {1, 1}); Array2D expected({{8.0f, 9.0f}, {11.0f, 12.0f}}); ComputeAndCompareR2(&builder, expected, {}, ErrorSpec(0.000001)); @@ -47,7 +48,7 @@ XLA_TEST_F(SliceTest, Slice3D) { Array3D array_3d( {{{1.0f, 2.0f}, {3.0f, 4.0f}}, {{5.0f, 6.0f}, {7.0f, 8.0f}}}); auto original = builder.ConstantR3FromArray3D(array_3d); - builder.Slice(original, {0, 0, 1}, {2, 1, 2}); + builder.Slice(original, {0, 0, 1}, {2, 1, 2}, {1, 1, 1}); Array3D expected_3d({{{2.0f}}, {{6.0f}}}); ComputeAndCompareR3(&builder, expected_3d, {}, ErrorSpec(0.000001)); @@ -58,6 +59,7 @@ XLA_TEST_F(SliceTest, Slice3D) { int main(int argc, char** argv) { std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/xla/tests/pad_test.cc b/tensorflow/compiler/xla/tests/pad_test.cc index f044c94b8d0..4922bbf21c4 100644 --- a/tensorflow/compiler/xla/tests/pad_test.cc +++ b/tensorflow/compiler/xla/tests/pad_test.cc @@ -22,6 +22,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/reference_util.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" @@ -468,6 +469,7 @@ XLA_TEST_F(PadTest, ReducePad) { int main(int argc, char** argv) { std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/xla/tests/params_test.cc b/tensorflow/compiler/xla/tests/params_test.cc index 2f05576ceeb..3e1bfcd3090 100644 --- a/tensorflow/compiler/xla/tests/params_test.cc +++ b/tensorflow/compiler/xla/tests/params_test.cc @@ -25,6 +25,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" @@ -163,7 +164,7 @@ XLA_TEST_F(ParamsTest, MissingParameter) { auto computation = builder.Build().ConsumeValueOrDie(); auto execute_status = client_->Execute(computation, {data.get(), data.get()}, - /*output_layout=*/nullptr, + /*execution_options=*/nullptr, /*execution_profile=*/nullptr); ASSERT_EQ(execute_status.status().code(), tensorflow::error::FAILED_PRECONDITION); @@ -246,6 +247,7 @@ XLA_TEST_F(ParamsTest, HundredLargeR1Parameters) { } std::vector param_data; + param_data.reserve(param_data_owner.size()); for (const std::unique_ptr& data : param_data_owner) { param_data.push_back(data.get()); } @@ -326,7 +328,7 @@ XLA_TEST_F(ParamsTest, R2_2x2_TryToPassReverseLayoutToParameter) { ComputationBuilder builder(client_, TestName()); auto input = builder.Parameter(0, original, "input"); // Use the slice operator to get an off-diagonal element. - builder.Slice(input, {0, 1}, {1, 2}); + builder.Slice(input, {0, 1}, {1, 2}, {1, 1}); std::unique_ptr data = client_->TransferToServer(*literal).ConsumeValueOrDie(); @@ -341,6 +343,7 @@ XLA_TEST_F(ParamsTest, R2_2x2_TryToPassReverseLayoutToParameter) { int main(int argc, char** argv) { std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/xla/tests/pred_test.cc b/tensorflow/compiler/xla/tests/pred_test.cc index 96393c41e80..b031725d8ab 100644 --- a/tensorflow/compiler/xla/tests/pred_test.cc +++ b/tensorflow/compiler/xla/tests/pred_test.cc @@ -18,9 +18,12 @@ limitations under the License. 
#include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" +#include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" namespace xla { @@ -94,11 +97,51 @@ TEST_F(PredTest, ConstantR2Pred) { EXPECT_EQ(expected, ExecuteToString(&builder, {})); } +TEST_F(PredTest, AnyR1True) { + ComputationBuilder builder(client_, TestName()); + auto a = builder.ConstantR1({true, false}); + TF_ASSERT_OK(Any(a, &builder).status()); + ComputeAndCompareR0(&builder, true, {}); +} + +TEST_F(PredTest, AnyR1False) { + ComputationBuilder builder(client_, TestName()); + auto a = builder.ConstantR1({false, false}); + TF_ASSERT_OK(Any(a, &builder).status()); + ComputeAndCompareR0(&builder, false, {}); +} + +TEST_F(PredTest, AnyR1VacuouslyFalse) { + ComputationBuilder builder(client_, TestName()); + auto a = builder.ConstantR1({}); + TF_ASSERT_OK(Any(a, &builder).status()); + ComputeAndCompareR0(&builder, false, {}); +} + +TEST_F(PredTest, AnyR2True) { + ComputationBuilder builder(client_, TestName()); + auto a = builder.ConstantR2({ + {false, false, false}, {false, false, false}, {false, false, true}, + }); + TF_ASSERT_OK(Any(a, &builder).status()); + ComputeAndCompareR0(&builder, true, {}); +} + +TEST_F(PredTest, AnyR2False) { + ComputationBuilder builder(client_, TestName()); + auto a = builder.ConstantR2({ + {false, false, false}, {false, false, false}, {false, false, false}, + }); + TF_ASSERT_OK(Any(a, &builder).status()); + ComputeAndCompareR0(&builder, false, {}); +} + } // namespace } // namespace xla int main(int argc, char** argv) { std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/xla/tests/prng_test.cc b/tensorflow/compiler/xla/tests/prng_test.cc index 1b4b170dfd4..5117478bfd5 100644 --- a/tensorflow/compiler/xla/tests/prng_test.cc +++ b/tensorflow/compiler/xla/tests/prng_test.cc @@ -18,9 +18,11 @@ limitations under the License. #include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/primitive_util.h" #include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/test_macros.h" #include "tensorflow/compiler/xla/util.h" @@ -38,6 +40,12 @@ class PrngTest : public ClientLibraryTestBase { template void UniformTest(T a, T b, tensorflow::gtl::ArraySlice dims); void BernoulliTest(float p, tensorflow::gtl::ArraySlice dims); + + // Computes the χ² statistic of a sample of the discrete uniform distribution + // of the given range size. `expected_count` is the number of times each + // possible value is expected to be generated. 
+  // Thus, the sample size is `range_size * expected_count`.
+  double UniformChiSquared(int32 range_size, int32 expected_count);
 };
 
 template <typename T>
@@ -47,8 +55,9 @@ void PrngTest::UniformTest(T a, T b, tensorflow::gtl::ArraySlice<int64> dims) {
       builder.ConstantR0<T>(a), builder.ConstantR0<T>(b),
       ShapeUtil::MakeShape(primitive_util::NativeToPrimitiveType<T>(), dims));
 
+  SetSeed(42);
   auto actual = ExecuteAndTransferOrDie(&builder, /*arguments=*/{});
-  EXPECT_TRUE(ContainersEqual(dims, actual->shape().dimensions()));
+  EXPECT_THAT(dims, ::testing::ElementsAreArray(actual->shape().dimensions()));
   LiteralUtil::EachCell<T>(
       *actual, [=](tensorflow::gtl::ArraySlice<int64>, T value) {
         EXPECT_LE(a, value);
@@ -68,7 +77,7 @@ void PrngTest::BernoulliTest(float p, tensorflow::gtl::ArraySlice<int64> dims) {
       auto actual,
       client_->ExecuteAndTransfer(computation, /*arguments=*/{},
                                   &execution_options));
-  EXPECT_TRUE(ContainersEqual(dims, actual->shape().dimensions()));
+  EXPECT_THAT(dims, ::testing::ElementsAreArray(actual->shape().dimensions()));
   int32 sum = 0;
   LiteralUtil::EachCell<uint32>(
      *actual, [&sum](tensorflow::gtl::ArraySlice<int64>, uint32 value) {
@@ -97,6 +106,57 @@ XLA_TEST_F(PrngTest, ZeroValuesR2) { UniformTest<float>(0, 1, {0, 20}); }
 XLA_TEST_F(PrngTest, LargeU01) { UniformTest<float>(0, 1, {0x100, 0x100}); }
 XLA_TEST_F(PrngTest, TwelveValuesU524) { UniformTest<int32>(5, 24, {12}); }
 
+namespace {
+template <typename T>
+T Square(T x) {
+  return x * x;
+}
+}  // namespace
+
+double PrngTest::UniformChiSquared(int32 range_size, int32 expected_count) {
+  int32 sample_size = range_size * expected_count;
+
+  ComputationBuilder builder(client_, TestName());
+  builder.RngUniform(builder.ConstantR0<int32>(0),
+                     builder.ConstantR0<int32>(range_size),
+                     ShapeUtil::MakeShape(S32, {sample_size}));
+
+  SetSeed(42);
+  auto actual = ExecuteAndTransferOrDie(&builder, /*arguments=*/{});
+  std::vector<int32> counts(range_size, 0);
+  LiteralUtil::EachCell<int32>(
+      *actual, [&counts](tensorflow::gtl::ArraySlice<int64>, int32 value) {
+        ++counts[value];
+      });
+  int64 sum = 0;
+  for (int32 i = 0; i < range_size; ++i) {
+    sum += Square(static_cast<int64>(counts[i] - expected_count));
+  }
+  return static_cast<double>(sum) / expected_count;
+}
+
+// We only test distribution of uniform discrete PRNG as other types are based
+// on it.
+// These range sizes are arbitrary but include prime numbers, powers of 2, and
+// other composite numbers.
+// The level of significance in all these cases is 1/20.
+// TODO(b/35723038): Use parametrized tests where possible.
+XLA_TEST_F(PrngTest, Uniformity7) {
+  EXPECT_LT(UniformChiSquared(7, 256), 12.5916);
+}
+XLA_TEST_F(PrngTest, Uniformity61) {
+  EXPECT_LT(UniformChiSquared(61, 256), 79.0819);
+}
+XLA_TEST_F(PrngTest, Uniformity64) {
+  EXPECT_LT(UniformChiSquared(64, 256), 82.5287);
+}
+XLA_TEST_F(PrngTest, Uniformity108) {
+  EXPECT_LT(UniformChiSquared(108, 256), 132.144);
+}
+XLA_TEST_F(PrngTest, Uniformity256) {
+  EXPECT_LT(UniformChiSquared(256, 256), 293.248);
+}
+
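// Where the thresholds above come from: under the null hypothesis the
// statistic is approximately χ²-distributed with range_size - 1 degrees of
// freedom, and each bound is the 95th percentile of that distribution
// (df = 6 gives 12.5916 for Uniformity7, df = 255 gives 293.248 for
// Uniformity256), so each test rejects a fair generator with probability
// about 1/20, matching the stated level of significance.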
 XLA_TEST_F(PrngTest, MapUsingRng) {
   // Build a x -> (x + U[0,1)) computation.
   auto build_sum_rng = [this](ComputationBuilder& builder) {
@@ -135,7 +195,7 @@ XLA_TEST_F(PrngTest, MapUsingRng) {
   }
 }
 
-// This tests demonstrates the global seeding behaviour.
+// This test demonstrates the global seeding behavior.
 // * If a seed is passed in via Execute (ExecuteAndTransfer) then the output is
 //   fixed (i.e., there is a single output for a given seed);
 // * If no seed is passed in then the output of every call can be different;
 // *
@@ -208,6 +268,7 @@ XLA_TEST_F(PrngTest, TenValuesN01) {
   builder.RngNormal(builder.ConstantR0<float>(0), builder.ConstantR0<float>(1),
                     ShapeUtil::MakeShape(F32, {10}));
+  SetSeed(42);
   ExecuteAndTransferOrDie(&builder, /*arguments=*/{});
   // TODO(b/25995601): Test that resultant values are reasonable
 }
@@ -217,6 +278,7 @@
 
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
+  xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
   xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
diff --git a/tensorflow/compiler/xla/tests/query_inferred_shape_test.cc b/tensorflow/compiler/xla/tests/query_inferred_shape_test.cc
index eb7e63705b2..4a02567a1a2 100644
--- a/tensorflow/compiler/xla/tests/query_inferred_shape_test.cc
+++ b/tensorflow/compiler/xla/tests/query_inferred_shape_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
+#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
@@ -46,6 +47,7 @@ TEST_F(QueryInferredShapeTest, OnePlusOneShape) {
 
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
+  xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/reduce_test.cc b/tensorflow/compiler/xla/tests/reduce_test.cc
index f3d8da5c8c8..ff24177520e 100644
--- a/tensorflow/compiler/xla/tests/reduce_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_test.cc
@@ -41,6 +41,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
+#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -61,7 +62,7 @@ namespace {
 class ReduceTest : public ClientLibraryTestBase {
  protected:
   ReduceTest() {
-    // Implementation note: layed out z >> y >> x by default.
+    // Implementation note: laid out z >> y >> x by default.
     // clang-format off
     literal_2d_ = LiteralUtil::CreateR2<float>({
       // x0   x1   x2
@@ -109,6 +110,41 @@ class ReduceTest : public ClientLibraryTestBase {
                                ErrorSpec(0.001));
   }
 
+  void RunR1ToR0PredTest(bool and_reduce,
+                         tensorflow::gtl::ArraySlice<int32> input_data) {
+    const int element_count = input_data.size();
+    ComputationBuilder builder(client_, TestName());
+    const Shape input_shape = ShapeUtil::MakeShape(S32, {element_count});
+    auto input_par = builder.Parameter(0, input_shape, "input");
+    auto pred_values =
+        builder.Eq(input_par, builder.ConstantR1<int32>(element_count, 1));
+    ComputationDataHandle init_value;
+    Computation reduce;
+    if (and_reduce) {
+      init_value = builder.ConstantR0<bool>(true);
+      reduce = CreateScalarLogicalAndComputation(&builder);
+    } else {
+      init_value = builder.ConstantR0<bool>(false);
+      reduce = CreateScalarLogicalOrComputation(&builder);
+    }
+    builder.Reduce(pred_values, init_value, reduce,
+                   /*dimensions_to_reduce=*/{0});
+
+    std::unique_ptr<Literal> input_literal = LiteralUtil::CreateR1(input_data);
+    std::unique_ptr<GlobalData> input_global_data =
+        client_->TransferToServer(*input_literal).ConsumeValueOrDie();
+
+    bool expected = and_reduce;
+    for (bool item : input_data) {
+      if (and_reduce) {
+        expected = expected && item;
+      } else {
+        expected = expected || item;
+      }
+    }
+    ComputeAndCompareR0<bool>(&builder, expected, {input_global_data.get()});
+  }
+
   // Runs an R2 => R0 reduction test with the given number of (rows, cols).
   void RunR2ToR0Test(int64 rows, int64 cols, int64 minor = 1, int64 major = 0) {
     ComputationBuilder builder(client_, TestName());
@@ -176,9 +212,9 @@ XLA_TEST_F(ReduceTest, ReduceR1_0_F32_To_R0) { RunR1ToR0Test(0); }
 XLA_TEST_F(ReduceTest, ReduceR1_1_F32_To_R0) { RunR1ToR0Test(1); }
 XLA_TEST_F(ReduceTest, ReduceR1_2_F32_To_R0) { RunR1ToR0Test(2); }
 XLA_TEST_F(ReduceTest, ReduceR1_16_F32_To_R0) { RunR1ToR0Test(16); }
-XLA_TEST_F(ReduceTest, ReduceR1_240_F32_To_R0) { RunR1ToR0Test(240); }
 XLA_TEST_F(ReduceTest, ReduceR1_128_F32_To_R0) { RunR1ToR0Test(128); }
 XLA_TEST_F(ReduceTest, ReduceR1_129_F32_To_R0) { RunR1ToR0Test(129); }
+XLA_TEST_F(ReduceTest, ReduceR1_240_F32_To_R0) { RunR1ToR0Test(240); }
 XLA_TEST_F(ReduceTest, ReduceR1_256_F32_To_R0) { RunR1ToR0Test(256); }
 XLA_TEST_F(ReduceTest, ReduceR1_1024_F32_To_R0) { RunR1ToR0Test(1024); }
 XLA_TEST_F(ReduceTest, ReduceR1_2048_F32_To_R0) { RunR1ToR0Test(2048); }
@@ -186,6 +222,9 @@ XLA_TEST_F(ReduceTest, ReduceR1_16K_F32_To_R0) { RunR1ToR0Test(16 * 1024); }
 XLA_TEST_F(ReduceTest, ReduceR1_16KP1_F32_To_R0) {
   RunR1ToR0Test(16 * 1024 + 1);
 }
+XLA_TEST_F(ReduceTest, ReduceR1_64K_F32_To_R0) { RunR1ToR0Test(64 * 1024); }
+XLA_TEST_F(ReduceTest, ReduceR1_1M_F32_To_R0) { RunR1ToR0Test(1024 * 1024); }
+XLA_TEST_F(ReduceTest, ReduceR1_16M_F32_To_R0) { RunR1ToR0Test(4096 * 4096); }
 
 XLA_TEST_F(ReduceTest, ReduceR2_0x0_To_R0) { RunR2ToR0Test(0, 0); }
 XLA_TEST_F(ReduceTest, ReduceR2_0x2_To_R0) { RunR2ToR0Test(0, 2); }
@@ -219,6 +258,40 @@ XLA_TEST_F(ReduceTest, ReduceR2_111x50_01_To_R1) {
 XLA_TEST_F(ReduceTest, ReduceR2_1024x1024_To_R1) { RunR2ToR1Test(1024, 1024); }
 XLA_TEST_F(ReduceTest, ReduceR2_1000x1500_To_R1) { RunR2ToR1Test(1000, 1500); }
 
+// TODO(b/34969189): Invalid CAS generated on GPU.
+XLA_TEST_F(ReduceTest, DISABLED_ON_GPU(AndReduceAllOnesR1_10_Pred)) {
+  constexpr int element_count = 10;
+  std::vector<int32> input(element_count, 1);
+  RunR1ToR0PredTest(/*and_reduce=*/true, input);
+}
+
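// A worked view of RunR1ToR0PredTest: the int32 inputs are compared against 1
// to form a PRED vector, which the scalar logical computation then folds.
// For the all-ones input above, and_reduce gives
//   true && (1 == 1) && ... && (1 == 1) == true,
// while for a mixed {1, 0, 1, ...} input it turns false at the first zero;
// or_reduce is the dual, starting from false.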
+// TODO(b/34969189): Invalid CAS generated on GPU.
+XLA_TEST_F(ReduceTest, DISABLED_ON_GPU(AndReduceOnesAndZerosR1_10_Pred)) {
+  constexpr int element_count = 10;
+  std::vector<int32> input(element_count);
+  for (int i = 0; i < element_count; ++i) {
+    input[i] = i % 2;
+  }
+  RunR1ToR0PredTest(/*and_reduce=*/true, input);
+}
+
+// TODO(b/34969189): Invalid CAS generated on GPU.
+XLA_TEST_F(ReduceTest, DISABLED_ON_GPU(OrReduceAllOnesR1_10_Pred)) {
+  constexpr int element_count = 10;
+  std::vector<int32> input(element_count, 1);
+  RunR1ToR0PredTest(/*and_reduce=*/false, input);
+}
+
+// TODO(b/34969189): Invalid CAS generated on GPU.
+XLA_TEST_F(ReduceTest, DISABLED_ON_GPU(OrReduceOnesAndZerosR1_10_Pred)) {
+  constexpr int element_count = 10;
+  std::vector<int32> input(element_count);
+  for (int i = 0; i < element_count; ++i) {
+    input[i] = i % 2;
+  }
+  RunR1ToR0PredTest(/*and_reduce=*/false, input);
+}
+
 XLA_TEST_F(ReduceTest, ReduceElementwiseR2_111x50_To_R1) {
   const int64 rows = 111, cols = 50;
 
@@ -251,6 +324,72 @@ XLA_TEST_F(ReduceTest, ReduceElementwiseR2_111x50_To_R1) {
                              ErrorSpec(0.01, 1e-4));
 }
 
+XLA_TEST_F(ReduceTest, TransposeAndReduceElementwiseR2_111x50_To_R1) {
+  const int64 rows = 111, cols = 50;
+
+  ComputationBuilder builder(client_, TestName());
+  Computation add_f32 = CreateScalarAddComputation(F32, &builder);
+  const Shape input_shape = ShapeUtil::MakeShape(F32, {rows, cols});
+  auto input = builder.Parameter(0, input_shape, "input");
+  auto zero = builder.ConstantR0<float>(0.0);
+  auto log_ = builder.Log(input);
+  auto transpose = builder.Transpose(log_, {1, 0});
+  builder.Reduce(transpose, zero, add_f32, /*dimensions_to_reduce=*/{1});
+
+  Array2D<float> input_data(rows, cols);
+  input_data.FillRandom(3.14f, 0.04);
+  std::unique_ptr<Literal> input_literal =
+      LiteralUtil::CreateR2FromArray2D(input_data);
+  input_literal =
+      LiteralUtil::Relayout(*input_literal, LayoutUtil::MakeLayout({0, 1}));
+  std::unique_ptr<GlobalData> input_global_data =
+      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
+
+  std::vector<float> expected;
+  for (int64 colno = 0; colno < cols; ++colno) {
+    float column_sum = 0;
+    for (int64 rowno = 0; rowno < rows; ++rowno) {
+      column_sum += log(input_data(rowno, colno));
+    }
+    expected.push_back(column_sum);
+  }
+  ComputeAndCompareR1<float>(&builder, expected, {input_global_data.get()},
+                             ErrorSpec(0.01, 1e-4));
+}
+
+XLA_TEST_F(ReduceTest, Reshape_111x2x25Reduce_111x50_To_R1) {
+  const int64 rows = 111, cols = 50;
+
+  ComputationBuilder builder(client_, TestName());
+  Computation add_f32 = CreateScalarAddComputation(F32, &builder);
+  const Shape input_shape = ShapeUtil::MakeShape(F32, {rows, 2, cols / 2});
+  auto input = builder.Parameter(0, input_shape, "input");
+  auto zero = builder.ConstantR0<float>(0.0);
+  auto log_ = builder.Log(input);
+  auto reshape = builder.Reshape(log_, {rows, cols});
+  builder.Reduce(reshape, zero, add_f32, /*dimensions_to_reduce=*/{0});
+
+  Array3D<float> input_data(rows, 2, cols / 2);
+  input_data.FillRandom(3.14f, 0.04);
+  std::unique_ptr<Literal> input_literal =
+      LiteralUtil::CreateR3FromArray3D(input_data);
+  std::unique_ptr<GlobalData> input_global_data =
+      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
+
+  std::vector<float> expected;
+  for (int64 major = 0; major < 2; ++major) {
+    for (int64 colno = 0; colno < cols / 2; ++colno) {
+      float column_sum = 0;
+      for (int64 rowno = 0; rowno < rows; ++rowno) {
+        column_sum += log(input_data(rowno, major, colno));
+      }
+      expected.push_back(column_sum);
+    }
+  }
+  ComputeAndCompareR1<float>(&builder, expected, {input_global_data.get()},
+                             ErrorSpec(0.01, 1e-4));
+}
+
 struct BoundsLayout {
   std::vector<int64> bounds;
std::vector layout; @@ -490,6 +629,7 @@ INSTANTIATE_TEST_CASE_P( int main(int argc, char** argv) { std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc index 149a75c8e10..ec7b47bc283 100644 --- a/tensorflow/compiler/xla/tests/reduce_window_test.cc +++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc @@ -26,6 +26,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/padding.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/reference_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" @@ -43,7 +44,7 @@ class ReduceWindowTest : public ClientLibraryTestBase { public: ReduceWindowTest() : builder_(client_, TestName()) {} - void ReduceWindowAdd(ComputationDataHandle input, + void ReduceWindowAdd(const ComputationDataHandle& input, tensorflow::gtl::ArraySlice window_dimensions, tensorflow::gtl::ArraySlice window_strides, Padding padding) { @@ -52,7 +53,7 @@ class ReduceWindowTest : public ClientLibraryTestBase { window_dimensions, window_strides, padding); } - void ReduceWindowMax(ComputationDataHandle input, + void ReduceWindowMax(const ComputationDataHandle& input, tensorflow::gtl::ArraySlice window_dimensions, tensorflow::gtl::ArraySlice window_strides, Padding padding) { @@ -61,7 +62,7 @@ class ReduceWindowTest : public ClientLibraryTestBase { CreateScalarMax(), window_dimensions, window_strides, padding); } - void ReduceWindowMin(ComputationDataHandle input, + void ReduceWindowMin(const ComputationDataHandle& input, tensorflow::gtl::ArraySlice window_dimensions, tensorflow::gtl::ArraySlice window_strides, Padding padding) { @@ -182,6 +183,7 @@ TEST_F(ReduceWindowTest, DISABLED_AmongMajor2DimsMediumSizeLargePadding) { ComputeAndCompareR4(&builder_, *result, {}, ErrorSpec(1e-3, 1e-3)); } + // TODO(b/31809540): Implement minor dim reduction to reduce num of reshapes. 
TEST_F(ReduceWindowTest, ReduceR4AmongXYMinorSmall) { Array4D input_array(2, 2, 4, 16); @@ -368,6 +370,16 @@ TEST_F(ReduceWindowTest, Add2x2In2x2Disjoint) { ComputeAndCompareR2(&builder_, expected, {}, ErrorSpec(0.0001)); } +TEST_F(ReduceWindowTest, Add1x2In2x2Same) { + Array2D input_array({{1.0f, 2.0f}, {3.0f, 4.0f}}); + auto input = builder_.ConstantR2FromArray2D(input_array); + ReduceWindowAdd(input, {1, 2}, {1, 1}, Padding::kSame); + Array2D expected({ + {3.0f, 2.0f}, {7.0f, 4.0f}, + }); + ComputeAndCompareR2(&builder_, expected, {}, ErrorSpec(0.0001)); +} + XLA_TEST_F(ReduceWindowTest, Add1x1x2In2x1x2) { Array3D input_array(2, 1, 2); input_array(0, 0, 0) = 1000; @@ -446,10 +458,16 @@ XLA_TEST_F(ReduceWindowTest, NonstandardReduceFunction) { /*window_dimensions=*/{1, 1, 2, 1}, /*window_strides=*/{1, 1, 1, 1}, padding); - Array4D expected(1, 2, 1, 1); - expected(0, 0, 0, 0) = 6; - expected(0, 1, 0, 0) = 8; - ComputeAndCompareR4(&builder_, expected, {}, ErrorSpec(1e-3, 1e-3)); + const auto reduce_func = [](float arg1, float arg2) { + return std::min(arg1 + arg2, 8.0f); + }; + + auto expected = + ReferenceUtil::ReduceWindow4DGeneric(input_array, 3.0f, reduce_func, + /*window=*/{1, 1, 2, 1}, + /*stride=*/{1, 1, 1, 1}, padding); + + ComputeAndCompareR4(&builder_, *expected, {}, ErrorSpec(1e-3, 1e-3)); } } // namespace @@ -457,6 +475,7 @@ XLA_TEST_F(ReduceWindowTest, NonstandardReduceFunction) { int main(int argc, char** argv) { std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/xla/tests/replay_test.cc b/tensorflow/compiler/xla/tests/replay_test.cc index 802087b5086..7c6700feef8 100644 --- a/tensorflow/compiler/xla/tests/replay_test.cc +++ b/tensorflow/compiler/xla/tests/replay_test.cc @@ -20,6 +20,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/protobuf_util.h" #include "tensorflow/compiler/xla/service/session.pb.h" @@ -152,6 +153,7 @@ TEST_F(ReplayTest, MapPlusTwoOverR1) { int main(int argc, char** argv) { std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/xla/tests/reshape_motion_test.cc b/tensorflow/compiler/xla/tests/reshape_motion_test.cc index ce309eb7439..c9817bc23d8 100644 --- a/tensorflow/compiler/xla/tests/reshape_motion_test.cc +++ b/tensorflow/compiler/xla/tests/reshape_motion_test.cc @@ -26,6 +26,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/reference_util.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -61,6 +62,7 @@ TEST_F(ReshapeMotionTest, ElementwiseOfReshapesWithNonSameInputShapes) { int main(int argc, char** argv) { std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/xla/tests/reshape_test.cc b/tensorflow/compiler/xla/tests/reshape_test.cc index 18e6e2d3f1d..ae7d07727b1 100644 --- a/tensorflow/compiler/xla/tests/reshape_test.cc +++ b/tensorflow/compiler/xla/tests/reshape_test.cc @@ -26,18 +26,18 @@ limitations under the License. #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/reference_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/statusor.h" -#include "tensorflow/compiler/xla/test_helpers.h" +#include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/gtl/array_slice.h" -#include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" namespace xla { @@ -68,6 +68,22 @@ XLA_TEST_F(ReshapeTest, SingleElementArrayToScalar) { ComputeAndCompareR0(&builder, 1.0f, {}, zero_error_spec_); } +XLA_TEST_F(ReshapeTest, ScalarToSingleElementArray) { + ComputationBuilder builder(client_, TestName()); + + std::unique_ptr param0_literal = LiteralUtil::CreateR0(1.0f); + std::unique_ptr param0_data = + client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); + + auto a = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "param0"); + a = builder.Neg(a); + auto reshape = + builder.Reshape(/*operand=*/a, /*dimensions=*/{}, /*new_sizes=*/{1}); + + ComputeAndCompareR1(&builder, {-1.0f}, {param0_data.get()}, + zero_error_spec_); +} + XLA_TEST_F(ReshapeTest, Trivial0x3) { ComputationBuilder builder(client_, TestName()); auto a = builder.ConstantR2FromArray2D(Array2D(0, 3)); @@ -76,6 +92,24 @@ XLA_TEST_F(ReshapeTest, Trivial0x3) { ComputeAndCompareR1(&builder, {}, {}, zero_error_spec_); } +// TODO(b/29185393): Make this work with the GPU backend. The GPU backend +// does not handle zero-sized shapes correctly. Failed last on 2017-05-15 +// with an incorrect result rank. 
+XLA_TEST_F(ReshapeTest, DISABLED_ON_GPU(Trivial0x3WithParameter)) { + ComputationBuilder builder(client_, TestName()); + + std::unique_ptr param0_literal = + LiteralUtil::CreateR2FromArray2D(Array2D(0, 3)); + std::unique_ptr param0_data = + client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); + + auto a = builder.Parameter(0, ShapeUtil::MakeShape(F32, {0, 3}), "param0"); + auto result = builder.Collapse(/*operand=*/a, /*dimensions=*/{0, 1}); + + ComputeAndCompareR1(&builder, {}, {param0_data.get()}, + zero_error_spec_); +} + XLA_TEST_F(ReshapeTest, Trivial3x0) { ComputationBuilder builder(client_, TestName()); auto a = builder.ConstantR2FromArray2D(Array2D(3, 0)); @@ -383,15 +417,15 @@ XLA_TEST_F(ReshapeTest, ToScalar) { XLA_TEST_F(ReshapeTest, BadDimensions) { ComputationBuilder b(client_, TestName()); b.Reshape(b.ConstantR1({1}), {}, {}); - EXPECT_MATCH(ExecuteToString(&b, {}), - testing::HasSubstr("dimensions not a permutation")); + EXPECT_THAT(ExecuteToString(&b, {}), + ::testing::HasSubstr("dimensions not a permutation")); } XLA_TEST_F(ReshapeTest, BadNewSizes) { ComputationBuilder b(client_, TestName()); b.Reshape(b.ConstantR1({1, 2}), {1}, {}); - EXPECT_MATCH(ExecuteToString(&b, {}), - testing::HasSubstr("mismatched element counts")); + EXPECT_THAT(ExecuteToString(&b, {}), + ::testing::HasSubstr("mismatched element counts")); } XLA_TEST_F(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) { @@ -796,6 +830,7 @@ XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeTrivialR2) { int main(int argc, char** argv) { std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/xla/tests/reverse_test.cc b/tensorflow/compiler/xla/tests/reverse_test.cc index 63dd4421fad..5ca9702380f 100644 --- a/tensorflow/compiler/xla/tests/reverse_test.cc +++ b/tensorflow/compiler/xla/tests/reverse_test.cc @@ -20,6 +20,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" @@ -157,6 +158,7 @@ TEST_F(ReverseTest, Reverse4DFloatArrayOnDim01) { int main(int argc, char** argv) { std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/xla/tests/round_trip_packed_literal_test.cc b/tensorflow/compiler/xla/tests/round_trip_packed_literal_test.cc index 5b734c0f400..05ce22fc359 100644 --- a/tensorflow/compiler/xla/tests/round_trip_packed_literal_test.cc +++ b/tensorflow/compiler/xla/tests/round_trip_packed_literal_test.cc @@ -19,6 +19,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/packed_literal_reader.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -144,6 +145,7 @@ TEST_F(RoundTripPackedLiteralTest, RoundTripsR2F32Size2x2Dim1Minor) { int main(int argc, char** argv) { std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/xla/tests/round_trip_transfer_test.cc b/tensorflow/compiler/xla/tests/round_trip_transfer_test.cc index 04a8bab0eb8..f0760241cdb 100644 --- a/tensorflow/compiler/xla/tests/round_trip_transfer_test.cc +++ b/tensorflow/compiler/xla/tests/round_trip_transfer_test.cc @@ -24,12 +24,12 @@ limitations under the License. #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" -#include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" @@ -148,6 +148,7 @@ TEST_F(RoundTripTransferTest, R4F32_Large) { int main(int argc, char** argv) { std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/xla/tests/scalar_computations_test.cc b/tensorflow/compiler/xla/tests/scalar_computations_test.cc index 4d68ba46211..47a39ffbbc4 100644 --- a/tensorflow/compiler/xla/tests/scalar_computations_test.cc +++ b/tensorflow/compiler/xla/tests/scalar_computations_test.cc @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/statusor.h" @@ -30,6 +31,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/tests/test_macros.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" @@ -245,37 +247,183 @@ XLA_TEST_F(ScalarComputationsTest, RemTwoScalarsF32) { ComputeAndCompareR0(&builder, 2.5f, {}, error_spec_); } -XLA_TEST_F(ScalarComputationsTest, DivideTwoScalarsS32) { - ComputationBuilder builder(client_, TestName()); - builder.Div(builder.ConstantR0(-5), builder.ConstantR0(2)); +struct DivS32Params { + int32 dividend; + int32 divisor; + int32 quotient; + int32 remainder; +}; - ComputeAndCompareR0(&builder, -2, {}); +void PrintTo(const DivS32Params& p, std::ostream* os) { + *os << "{" << p.dividend << ", " << p.divisor << ", " << p.quotient << ", " + << p.remainder << "}"; } -TEST_F(ScalarComputationsTest, RemainderTwoScalarsNegativeResultS32) { - ComputationBuilder builder(client_, TestName()); - builder.Rem(builder.ConstantR0(-5), builder.ConstantR0(2)); +class DivS32Test : public ClientLibraryTestBase, + public ::testing::WithParamInterface {}; - ComputeAndCompareR0(&builder, -1, {}); +XLA_TEST_P(DivS32Test, DivideTwoScalarsS32) { + DivS32Params p = GetParam(); + ComputationBuilder builder(client_, TestName()); + builder.Div(builder.ConstantR0(p.dividend), + builder.ConstantR0(p.divisor)); + + ComputeAndCompareR0(&builder, p.quotient, {}); } -TEST_F(ScalarComputationsTest, RemainderTwoScalarsIntMinS32) { +XLA_TEST_P(DivS32Test, RemainderTwoScalarsS32) { + DivS32Params p = GetParam(); ComputationBuilder builder(client_, TestName()); - builder.Rem(builder.ConstantR0(INT_MIN), - builder.ConstantR0(7919)); + builder.Rem(builder.ConstantR0(p.dividend), + builder.ConstantR0(p.divisor)); - ComputeAndCompareR0(&builder, -1309, {}); + ComputeAndCompareR0(&builder, p.remainder, {}); } -TEST_F(ScalarComputationsTest, RemainderTwoScalarsIntMinVsIntMaxS32) { +XLA_TEST_P(DivS32Test, DivideTwoScalarsNonConstS32) { + DivS32Params p = GetParam(); ComputationBuilder builder(client_, TestName()); - builder.Rem(builder.ConstantR0(INT_MIN), - builder.ConstantR0(INT_MAX)); + ComputationDataHandle dividend; + ComputationDataHandle divisor; + auto dividendd = + CreateR0Parameter(p.dividend, 0, "dividend", &builder, ÷nd); + auto divisord = + CreateR0Parameter(p.divisor, 1, "divisor", &builder, &divisor); + builder.Div(dividend, divisor); - ComputeAndCompareR0(&builder, -1, {}); + ComputeAndCompareR0(&builder, p.quotient, + {dividendd.get(), divisord.get()}); } -TEST_F(ScalarComputationsTest, RemainderTwoScalarsPositiveResultS32) { +XLA_TEST_P(DivS32Test, RemainderTwoScalarsNonConstDivisorS32) { + DivS32Params p = GetParam(); + ComputationBuilder builder(client_, TestName()); + ComputationDataHandle dividend; + ComputationDataHandle divisor; + auto dividendd = + CreateR0Parameter(p.dividend, 0, "dividend", &builder, ÷nd); + auto divisord = + CreateR0Parameter(p.divisor, 1, "divisor", &builder, &divisor); + builder.Rem(dividend, divisor); + + ComputeAndCompareR0(&builder, p.remainder, + {dividendd.get(), divisord.get()}); +} + +INSTANTIATE_TEST_CASE_P( + DivS32Test_Instantiation, DivS32Test, + ::testing::Values( + // Positive divisors. + DivS32Params{5, 2, 2, 1}, // + DivS32Params{-5, 2, -2, -1}, // + DivS32Params{17, 3, 5, 2}, // + DivS32Params{-17, 3, -5, -2}, // + // Negative divisors. 
+ DivS32Params{5, -2, -2, 1}, // + DivS32Params{-5, -2, 2, -1}, // + DivS32Params{17, -3, -5, 2}, // + DivS32Params{-17, -3, 5, -2}, // + // Large positive divisors. + DivS32Params{INT32_MIN, 7919, -271181, -1309}, // + DivS32Params{INT32_MIN, INT32_MAX, -1, -1}, // + DivS32Params{INT32_MIN + 1, INT32_MAX, -1, 0}, // + DivS32Params{INT32_MIN + 2, INT32_MAX, 0, INT32_MIN + 2}, // + DivS32Params{INT32_MIN, 0x40000000, -2, 0}, // + DivS32Params{INT32_MIN + 1, 0x40000000, -1, -0x3fffffff}, // + // Large negative divisors. + DivS32Params{INT32_MIN, INT32_MIN, 1, 0}, // + DivS32Params{INT32_MIN, INT32_MIN + 1, 1, -1}, // + DivS32Params{INT32_MIN + 1, INT32_MIN, 0, INT32_MIN + 1}, // + DivS32Params{INT32_MAX, INT32_MIN, 0, INT32_MAX}, // + DivS32Params{INT32_MAX, INT32_MIN + 1, -1, 0}, // + DivS32Params{INT32_MIN, -0x40000000, 2, 0}, // + DivS32Params{INT32_MIN + 1, -0x40000000, 1, -0x3fffffff})); + +TEST_F(ScalarComputationsTest, DivU32s) { + // clang-format off + // Some interesting values to test. + std::vector vals = { + 0, 1, 2, 17, 101, 3333, 0x7FFFFFFF, 0x80000000, UINT32_MAX - 1, UINT32_MAX}; + // clang-format on + + Computation div_computation; + { + ComputationBuilder builder(client_, TestName()); + + ComputationDataHandle dividend = + builder.Parameter(0, ShapeUtil::MakeShape(U32, {}), "dividend"); + ComputationDataHandle divisor = + builder.Parameter(1, ShapeUtil::MakeShape(U32, {}), "divisor"); + builder.Div(dividend, divisor); + TF_ASSIGN_OR_ASSERT_OK(div_computation, builder.Build()); + } + + for (uint32 divisor : vals) { + if (divisor != 0) { + for (uint32 dividend : vals) { + auto dividend_literal = LiteralUtil::CreateR0(dividend); + auto divisor_literal = LiteralUtil::CreateR0(divisor); + TF_ASSIGN_OR_ASSERT_OK(auto dividend_data, + client_->TransferToServer(*dividend_literal)); + TF_ASSIGN_OR_ASSERT_OK(auto divisor_data, + client_->TransferToServer(*divisor_literal)); + auto actual_literal = + client_ + ->ExecuteAndTransfer(div_computation, + {dividend_data.get(), divisor_data.get()}, + &execution_options_) + .ConsumeValueOrDie(); + auto expected_literal = + LiteralUtil::CreateR0(dividend / divisor); + LiteralTestUtil::ExpectEqual(*expected_literal, *actual_literal); + } + } + } +} + +TEST_F(ScalarComputationsTest, RemU32s) { + // clang-format off + // Some interesting values to test. 
+ std::vector vals = { + 0, 1, 2, 17, 101, 3333, 0x7FFFFFFF, 0x80000000, UINT32_MAX - 1, UINT32_MAX}; + // clang-format on + + Computation rem_computation; + { + ComputationBuilder builder(client_, TestName()); + + ComputationDataHandle dividend = + builder.Parameter(0, ShapeUtil::MakeShape(U32, {}), "dividend"); + ComputationDataHandle divisor = + builder.Parameter(1, ShapeUtil::MakeShape(U32, {}), "divisor"); + builder.Rem(dividend, divisor); + TF_ASSIGN_OR_ASSERT_OK(rem_computation, builder.Build()); + } + + for (uint32 divisor : vals) { + if (divisor != 0) { + for (uint32 dividend : vals) { + auto dividend_literal = LiteralUtil::CreateR0(dividend); + auto divisor_literal = LiteralUtil::CreateR0(divisor); + TF_ASSIGN_OR_ASSERT_OK(auto dividend_data, + client_->TransferToServer(*dividend_literal)); + TF_ASSIGN_OR_ASSERT_OK(auto divisor_data, + client_->TransferToServer(*divisor_literal)); + auto actual_literal = + client_ + ->ExecuteAndTransfer(rem_computation, + {dividend_data.get(), divisor_data.get()}, + &execution_options_) + .ConsumeValueOrDie(); + auto expected_literal = + LiteralUtil::CreateR0(dividend % divisor); + LiteralTestUtil::ExpectEqual(*expected_literal, *actual_literal); + } + } + } +} + +TEST_F(ScalarComputationsTest, RemainderTwoScalarsNonConstDividendS32) { ComputationBuilder builder(client_, TestName()); auto x = builder.Parameter(0, ShapeUtil::MakeShape(S32, {}), "x"); builder.Rem(x, builder.ConstantR0(80000)); @@ -295,6 +443,13 @@ XLA_TEST_F(ScalarComputationsTest, DivideTwoScalarsU32) { ComputeAndCompareR0(&builder, 0x7FFFFFFF, {}); } +XLA_TEST_F(ScalarComputationsTest, RemTwoScalarsU32) { + ComputationBuilder builder(client_, TestName()); + builder.Rem(builder.ConstantR0(11), builder.ConstantR0(3)); + + ComputeAndCompareR0(&builder, 2, {}); +} + TEST_F(ScalarComputationsTest, LogicalAnd) { for (bool x : {false, true}) { for (bool y : {false, true}) { @@ -626,6 +781,7 @@ TEST_F(ScalarComputationsTest, SqrtF320) { int main(int argc, char** argv) { std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/xla/tests/select_and_scatter_test.cc b/tensorflow/compiler/xla/tests/select_and_scatter_test.cc index fb1effc8c46..36110da2478 100644 --- a/tensorflow/compiler/xla/tests/select_and_scatter_test.cc +++ b/tensorflow/compiler/xla/tests/select_and_scatter_test.cc @@ -26,6 +26,7 @@ limitations under the License. 
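The table's edge rows, and the exhaustive `DivU32s`/`RemU32s` loops above, are consistent with ordinary C++ integer arithmetic, which truncates toward zero and gives the remainder the dividend's sign. A standalone check of a few signed rows and of the unsigned division identity:

```c++
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  // Signed rows from the DivS32 table: quotient truncates toward zero and
  // the remainder carries the dividend's sign, so (a / b) * b + a % b == a.
  const int32_t int32_min = INT32_MIN;
  assert(int32_min / 7919 == -271181 && int32_min % 7919 == -1309);
  assert(int32_min / INT32_MAX == -1 && int32_min % INT32_MAX == -1);
  assert((int32_min + 1) / INT32_MAX == -1 && (int32_min + 1) % INT32_MAX == 0);

  // The unsigned loops in DivU32s/RemU32s pair every interesting dividend
  // with every nonzero divisor; the division identity must hold throughout.
  const std::vector<uint32_t> vals = {0, 1, 2, 17, 101, 3333, 0x7FFFFFFFu,
                                      0x80000000u, UINT32_MAX - 1, UINT32_MAX};
  for (uint32_t divisor : vals) {
    if (divisor == 0) continue;  // divide-by-zero is skipped, as in the test
    for (uint32_t dividend : vals) {
      assert((dividend / divisor) * divisor + dividend % divisor == dividend);
    }
  }
  return 0;
}
```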
#include "tensorflow/compiler/xla/client/padding.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/reference_util.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -379,6 +380,7 @@ XLA_TEST_F(SelectAndScatterTest, R1F32OverlappingWindowMinScatter) { int main(int argc, char** argv) { std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/xla/tests/select_test.cc b/tensorflow/compiler/xla/tests/select_test.cc index 5ec9ac95fae..5eb4fee8ed2 100644 --- a/tensorflow/compiler/xla/tests/select_test.cc +++ b/tensorflow/compiler/xla/tests/select_test.cc @@ -20,6 +20,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" @@ -260,6 +261,7 @@ TEST_F(SelectTest, SelectR1F32WithScalarPredicateFalse) { int main(int argc, char** argv) { std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/xla/tests/set_return_value_test.cc b/tensorflow/compiler/xla/tests/set_return_value_test.cc index e15d744d953..25bb915be56 100644 --- a/tensorflow/compiler/xla/tests/set_return_value_test.cc +++ b/tensorflow/compiler/xla/tests/set_return_value_test.cc @@ -18,6 +18,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/core/lib/core/status.h" @@ -100,6 +101,7 @@ TEST_F(SetReturnValueTest, SetValueMultipleTimesAndModify) { int main(int argc, char** argv) { std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/xla/tests/slice_test.cc b/tensorflow/compiler/xla/tests/slice_test.cc index d63582fb98a..70345c300cc 100644 --- a/tensorflow/compiler/xla/tests/slice_test.cc +++ b/tensorflow/compiler/xla/tests/slice_test.cc @@ -22,6 +22,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/reference_util.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" @@ -37,13 +38,14 @@ class SliceTest : public ClientLibraryTestBase { template void RunSliceTenToTwo() { std::vector constant; + constant.reserve(10); for (int i = 0; i < 10; ++i) { constant.push_back(static_cast(i)); } ComputationBuilder builder(client_, TestName()); auto original = builder.ConstantR1(constant); - builder.Slice(original, {2}, {4}); + builder.Slice(original, {2}, {4}, {1}); const std::vector expected = {static_cast(2), static_cast(3)}; @@ -54,7 +56,7 @@ class SliceTest : public ClientLibraryTestBase { XLA_TEST_F(SliceTest, SliceZeroToZeroF32) { ComputationBuilder builder(client_, TestName()); auto original = builder.ConstantR1({}); - builder.Slice(original, {0}, {0}); + builder.Slice(original, {0}, {0}, {1}); ComputeAndCompareR1(&builder, {}, {}); } @@ -63,7 +65,7 @@ XLA_TEST_F(SliceTest, SliceTenToZeroF32) { ComputationBuilder builder(client_, TestName()); std::vector constant(10, 0.3); auto original = builder.ConstantR1(constant); - builder.Slice(original, {7}, {7}); + builder.Slice(original, {7}, {7}, {1}); ComputeAndCompareR1(&builder, {}, {}); } @@ -86,7 +88,7 @@ TEST_F(SliceTest, SliceTenToTen) { ComputationBuilder builder(client_, TestName()); auto original = builder.ConstantR1(values); - builder.Slice(original, {0}, {10}); + builder.Slice(original, {0}, {10}, {1}); ComputeAndCompareR1(&builder, values, {}, ErrorSpec(0.000001)); } @@ -97,7 +99,7 @@ TEST_F(SliceTest, SliceLastFourOf1024) { ComputationBuilder builder(client_, TestName()); auto original = builder.ConstantR1(values); - builder.Slice(original, {1024 - 4}, {1024}); + builder.Slice(original, {1024 - 4}, {1024}, {1}); const std::vector expected = {1020, 1021, 1022, 1023}; ComputeAndCompareR1(&builder, expected, {}, ErrorSpec(0.000001)); @@ -111,7 +113,7 @@ TEST_F(SliceTest, DISABLED_SliceUnaligned1024In4096Values) { ComputationBuilder builder(client_, TestName()); auto original = builder.ConstantR1(values); - builder.Slice(original, {7}, {7 + 1024}); + builder.Slice(original, {7}, {7 + 1024}, {1}); std::vector expected(1024); std::iota(values.begin(), values.end(), 7.0); @@ -121,7 +123,7 @@ TEST_F(SliceTest, DISABLED_SliceUnaligned1024In4096Values) { XLA_TEST_F(SliceTest, Slice0x0to0x0F32) { ComputationBuilder builder(client_, TestName()); auto original = builder.ConstantR2FromArray2D(Array2D(0, 0)); - builder.Slice(original, {0, 0}, {0, 0}); + builder.Slice(original, {0, 0}, {0, 0}, {1, 1}); ComputeAndCompareR2(&builder, Array2D(0, 0), {}); } @@ -129,7 +131,7 @@ XLA_TEST_F(SliceTest, Slice0x0to0x0F32) { XLA_TEST_F(SliceTest, Slice0x20to0x5F32) { ComputationBuilder builder(client_, TestName()); auto original = builder.ConstantR2FromArray2D(Array2D(0, 20)); - builder.Slice(original, {0, 15}, {0, 20}); + builder.Slice(original, {0, 15}, {0, 20}, {1, 1}); ComputeAndCompareR2(&builder, Array2D(0, 5), {}); } @@ -137,7 +139,7 @@ XLA_TEST_F(SliceTest, Slice0x20to0x5F32) { XLA_TEST_F(SliceTest, Slice3x0to2x0F32) { ComputationBuilder builder(client_, TestName()); auto original = builder.ConstantR2FromArray2D(Array2D(3, 0)); - builder.Slice(original, {1, 0}, {3, 0}); + 
builder.Slice(original, {1, 0}, {3, 0}, {1, 1}); ComputeAndCompareR2(&builder, Array2D(2, 0), {}); } @@ -152,7 +154,7 @@ XLA_TEST_F(SliceTest, SliceQuadrantOf256x256) { ComputationBuilder builder(client_, TestName()); auto original = builder.ConstantR2FromArray2D(values); - builder.Slice(original, {128, 128}, {256, 256}); + builder.Slice(original, {128, 128}, {256, 256}, {1, 1}); Array2D expected(128, 128); for (int row = 0; row < 128; ++row) { @@ -170,7 +172,7 @@ TEST_F(SliceTest, Slice_1x4096_To_1x1024) { ComputationBuilder builder(client_, TestName()); auto original = builder.ConstantR2FromArray2D(values); - builder.Slice(original, {0, 3072}, {1, 4096}); + builder.Slice(original, {0, 3072}, {1, 4096}, {1, 1}); Array2D expected(1, 1024); std::iota(expected.data(), expected.data() + 1024, 3072.0); @@ -191,7 +193,7 @@ TEST_F(SliceTest, Slice_16x4_To_16x2) { } ComputationBuilder builder(client_, TestName()); auto original = builder.ConstantR2FromArray2D(values); - builder.Slice(original, {0, 0}, {16, 2}); + builder.Slice(original, {0, 0}, {16, 2}, {1, 1}); ComputeAndCompareR2(&builder, expected, {}, ErrorSpec(0.000001)); } @@ -203,7 +205,7 @@ TEST_F(SliceTest, SliceR4ThreeDimsMiddleMinor) { ReferenceUtil::Slice4D(values, {{1, 0, 8, 0}}, {{2, 2, 16, 128}}); ComputationBuilder builder(client_, TestName()); auto original = builder.ConstantR4FromArray4D(values); - builder.Slice(original, {1, 0, 8, 0}, {2, 2, 16, 128}); + builder.Slice(original, {1, 0, 8, 0}, {2, 2, 16, 128}, {1, 1, 1, 1}); ComputeAndCompareR4(&builder, *expected, {}, ErrorSpec(0.000001)); } @@ -212,6 +214,7 @@ struct R2Spec { int64 input_dim1; std::array slice_starts; std::array slice_limits; + std::array slice_strides; Layout layout; }; @@ -227,7 +230,7 @@ TEST_P(SliceR2Test, DoIt) { ComputationBuilder builder(client_, TestName()); auto a = builder.ConstantR2FromArray2D(input); - builder.Slice(a, spec.slice_starts, spec.slice_limits); + builder.Slice(a, spec.slice_starts, spec.slice_limits, spec.slice_strides); std::unique_ptr> expected = ReferenceUtil::Slice2D(input, spec.slice_starts, spec.slice_limits); @@ -238,19 +241,23 @@ TEST_P(SliceR2Test, DoIt) { INSTANTIATE_TEST_CASE_P( SliceR2TestInstantiation, SliceR2Test, ::testing::Values( - R2Spec {4, 12, {{0, 3}}, {{4, 6}}, LayoutUtil::MakeLayout({0, 1})}, - R2Spec {4, 12, {{0, 3}}, {{4, 6}}, LayoutUtil::MakeLayout({1, 0})}, - R2Spec {16, 4, {{0, 2}}, {{16, 4}}, LayoutUtil::MakeLayout({0, 1})}, - R2Spec {16, 4, {{0, 2}}, {{16, 4}}, LayoutUtil::MakeLayout({1, 0})}, - R2Spec {256, 400, {{0, 300}}, {{256, 400}}, + R2Spec {4, 12, {{0, 3}}, {{4, 6}}, {{1, 1}}, + LayoutUtil::MakeLayout({0, 1})}, + R2Spec {4, 12, {{0, 3}}, {{4, 6}}, {{1, 1}}, LayoutUtil::MakeLayout({1, 0})}, - R2Spec {500, 400, {{111, 123}}, {{300, 257}}, + R2Spec {16, 4, {{0, 2}}, {{16, 4}}, {{1, 1}}, + LayoutUtil::MakeLayout({0, 1})}, + R2Spec {16, 4, {{0, 2}}, {{16, 4}}, {{1, 1}}, LayoutUtil::MakeLayout({1, 0})}, - R2Spec {500, 400, {{111, 123}}, {{300, 400}}, + R2Spec {256, 400, {{0, 300}}, {{256, 400}}, {{1, 1}}, LayoutUtil::MakeLayout({1, 0})}, - R2Spec {384, 512, {{128, 256}}, {{256, 384}}, + R2Spec {500, 400, {{111, 123}}, {{300, 257}}, {{1, 1}}, LayoutUtil::MakeLayout({1, 0})}, - R2Spec {357, 512, {{111, 256}}, {{301, 384}}, + R2Spec {500, 400, {{111, 123}}, {{300, 400}}, {{1, 1}}, + LayoutUtil::MakeLayout({1, 0})}, + R2Spec {384, 512, {{128, 256}}, {{256, 384}}, {{1, 1}}, + LayoutUtil::MakeLayout({1, 0})}, + R2Spec {357, 512, {{111, 256}}, {{301, 384}}, {{1, 1}}, LayoutUtil::MakeLayout({1, 0})} ) ); @@ 
-261,6 +268,7 @@ INSTANTIATE_TEST_CASE_P( int main(int argc, char** argv) { std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/xla/tests/transpose_test.cc b/tensorflow/compiler/xla/tests/transpose_test.cc index 79f251bbc48..e4951c42010 100644 --- a/tensorflow/compiler/xla/tests/transpose_test.cc +++ b/tensorflow/compiler/xla/tests/transpose_test.cc @@ -19,6 +19,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/reference_util.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" @@ -187,6 +188,7 @@ TEST_F(TransposeTest, TransposeConstant021_MultipleTilesPerLayer) { int main(int argc, char** argv) { std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/xla/tests/tuple_test.cc b/tensorflow/compiler/xla/tests/tuple_test.cc index cea9316a6d6..6309e712973 100644 --- a/tensorflow/compiler/xla/tests/tuple_test.cc +++ b/tensorflow/compiler/xla/tests/tuple_test.cc @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" @@ -399,6 +400,7 @@ XLA_TEST_F(TupleTest, GetTupleElementOfNestedTuple) { int main(int argc, char** argv) { std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/xla/tests/unary_op_test.cc b/tensorflow/compiler/xla/tests/unary_op_test.cc index fdbaa0d1786..61110d5b4cd 100644 --- a/tensorflow/compiler/xla/tests/unary_op_test.cc +++ b/tensorflow/compiler/xla/tests/unary_op_test.cc @@ -20,6 +20,7 @@ limitations under the License. 
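The slice_test.cc hunks above thread a new strides argument through every `Slice` call; appending `{1}` (or `{1, 1}`, and so on, one entry per dimension) preserves the old dense behavior, which is why the change is mechanical. A hypothetical rank-1 reference implementation, under the assumption that output element i is `input[start + i * stride]` for `start + i * stride < limit`:

```c++
#include <cassert>
#include <vector>

// Reference semantics for a strided rank-1 slice (sketch, not the XLA API).
std::vector<float> SliceR1(const std::vector<float>& input, int start,
                           int limit, int stride) {
  std::vector<float> out;
  for (int i = start; i < limit; i += stride) out.push_back(input[i]);
  return out;
}

int main() {
  const std::vector<float> v = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
  // Stride 1 matches the old two-argument Slice: elements of [2, 4).
  assert((SliceR1(v, 2, 4, 1) == std::vector<float>{2, 3}));
  // Stride 2 keeps every other element of [0, 10).
  assert((SliceR1(v, 0, 10, 2) == std::vector<float>{0, 2, 4, 6, 8}));
  return 0;
}
```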
#include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" @@ -163,6 +164,7 @@ TEST_F(UnaryOpTest, SignAbsTestR2) { int main(int argc, char** argv) { std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/xla/tests/vector_ops_reduce_test.cc b/tensorflow/compiler/xla/tests/vector_ops_reduce_test.cc index 7f3d7d9cb4c..26a08953b15 100644 --- a/tensorflow/compiler/xla/tests/vector_ops_reduce_test.cc +++ b/tensorflow/compiler/xla/tests/vector_ops_reduce_test.cc @@ -23,6 +23,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" @@ -219,6 +220,7 @@ TEST_F(VecOpsReduceTest, AddReduceR3F32AllDims) { int main(int argc, char** argv) { std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc b/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc index d9fc1e1e8f5..efde45375fd 100644 --- a/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc +++ b/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc @@ -24,6 +24,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/test_helpers.h" @@ -41,8 +42,10 @@ namespace { class VecOpsSimpleTest : public ClientLibraryTestBase { public: explicit VecOpsSimpleTest(perftools::gputools::Platform* platform = nullptr) - : ClientLibraryTestBase(platform, - /*disabled_pass_names=*/{"algsimp", "inline"}) {} + : ClientLibraryTestBase(platform) { + mutable_debug_options()->add_xla_disable_hlo_passes("algsimp"); + mutable_debug_options()->add_xla_disable_hlo_passes("inline"); + } ErrorSpec error_spec_{0.0001}; }; @@ -64,6 +67,7 @@ TEST_F(VecOpsSimpleTest, ExpManyValues) { for (int count : {63, 64, 65, 127, 128, 129, 17 * 4096}) { ComputationBuilder builder(client_, TestName()); std::vector exponents; + exponents.reserve(count); for (int i = 0; i < count; ++i) { exponents.push_back(i / static_cast(count)); } @@ -71,6 +75,7 @@ TEST_F(VecOpsSimpleTest, ExpManyValues) { auto exp = builder.Exp(x); std::vector expected; + expected.reserve(exponents.size()); for (float exponent : exponents) { expected.push_back(std::exp(exponent)); } @@ -155,6 +160,35 @@ TEST_F(VecOpsSimpleTest, ReciprocalTenValues) { ComputeAndCompareR1(&builder, expected, {}, error_spec_); } +XLA_TEST_F(VecOpsSimpleTest, SqrtZeroes) { + ComputationBuilder builder(client_, TestName()); + auto x = builder.ConstantR1({0.0, -0.0}); + auto exp = builder.SqrtF32(x); + + ComputeAndCompareR1(&builder, {0, 0}, {}, error_spec_); +} + +XLA_TEST_F(VecOpsSimpleTest, SqrtSixValues) { + ComputationBuilder builder(client_, TestName()); + auto x = builder.ConstantR1({16.0, 1.0, 1024.0, 0.16, 0.2, 12345}); + auto exp = builder.SqrtF32(x); + + std::vector expected = {4, 1, 32, 0.4, 0.4472, 111.1080}; + ComputeAndCompareR1(&builder, expected, {}, error_spec_); +} + +XLA_TEST_F(VecOpsSimpleTest, InvSqrtSevenValues) { + ComputationBuilder builder(client_, TestName()); + auto x = + builder.ConstantR1({16.0, 1.0, 1024.0, 0.16, 0.2, 12345, 1.2345}); + auto exp = builder.Pow(x, builder.ConstantR0(-.5f)); + + std::vector expected = {.25, 1, .03125, 2.5, + 2.23607, .009000, .900025}; + + ComputeAndCompareR1(&builder, expected, {}, error_spec_); +} + TEST_F(VecOpsSimpleTest, AddTenValuesViaMap) { ComputationBuilder builder(client_, TestName()); auto add = CreateScalarAddComputation(F32, &builder); @@ -408,6 +442,7 @@ XLA_TEST_F(VecOpsSimpleTest, VectorPredicateNotEqual) { int main(int argc, char** argv) { std::vector flag_list; xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); if (!parse_result) { diff --git a/tensorflow/compiler/xla/tests/while_test.cc b/tensorflow/compiler/xla/tests/while_test.cc index e6bbed671ff..5f917797744 100644 --- a/tensorflow/compiler/xla/tests/while_test.cc +++ b/tensorflow/compiler/xla/tests/while_test.cc @@ -23,9 +23,11 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/platform_util.h" #include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" @@ -247,6 +249,291 @@ TEST_F(WhileTest, WhileWithTupleResult) { ComputeAndCompareTuple(&builder, *expected, {}, ErrorSpec(0.0001)); } +// Tests two while nodes when the result type T is a Tuple and the second +// while node uses the result of the first while node which is used in two +// nodes. +// tuple> w0(0, vector(10, 0.0f)); +// w0 = while (get<0>(w0) < c1) { +// get<0>(w0) = get<0>(w0) + 1; +// get<1>(w0) = get<1>(w0) + vector(10, 1.0f); +// } +// tuple> w1(get<0>(w0), get<1>(w0)); +// w1 = while (get<0>(w1) < c2) { +// get<0>(w1) = get<0>(w1) + 1; +// get<1>(w1) = get<1>(w1) + vector(10, 1.0f); +// } +// result = get<1>(w0) + get<1>(w1) +TEST_F(WhileTest, TwoWhileWithTupleResult) { + std::vector shape_elements = {ShapeUtil::MakeShape(S32, {}), + ShapeUtil::MakeShape(F32, {10})}; + Shape result_shape = ShapeUtil::MakeTupleShape(shape_elements); + + // Create a computation for the condition. + // Repeat for 5 iterations. + Computation condition; + const int c1 = 5; + { + ComputationBuilder builder(client_, "condition"); + auto prev = builder.Parameter(0, result_shape, "prev"); + auto iteration = builder.GetTupleElement(prev, 0); + builder.Lt(iteration, builder.ConstantR0(c1)); + TF_ASSIGN_OR_ASSERT_OK(condition, builder.Build()); + } + + Computation condition2; + const int c2 = 7; + { + ComputationBuilder builder(client_, "condition2"); + auto prev = builder.Parameter(0, result_shape, "prev"); + auto iteration = builder.GetTupleElement(prev, 0); + builder.Lt(iteration, builder.ConstantR0(c2)); + TF_ASSIGN_OR_ASSERT_OK(condition2, builder.Build()); + } + + // Create a computation for the body. + // Add 1 to the iteration variable and add a constant vector of 1.0f to + // the weight variable, both of which are tuple elements. + Computation body; + { + ComputationBuilder builder(client_, "body"); + auto prev = builder.Parameter(0, result_shape, "prev"); + auto iteration = builder.GetTupleElement(prev, 0); + auto weights = builder.GetTupleElement(prev, 1); + auto input = builder.ConstantR1(10, 1.f); + auto new_weights = builder.Add(weights, input); + auto result = builder.Tuple( + {builder.Add(iteration, builder.ConstantR0(1)), new_weights}); + TF_ASSIGN_OR_ASSERT_OK(body, builder.Build()); + } + + Computation body2; + { + ComputationBuilder builder(client_, "body"); + auto prev = builder.Parameter(0, result_shape, "prev"); + auto iteration = builder.GetTupleElement(prev, 0); + auto weights = builder.GetTupleElement(prev, 1); + auto input = builder.ConstantR1(10, 1.f); + auto new_weights = builder.Add(weights, input); + auto result = builder.Tuple( + {builder.Add(iteration, builder.ConstantR0(1)), new_weights}); + TF_ASSIGN_OR_ASSERT_OK(body2, builder.Build()); + } + + // Create a While node with computations for the condition and the body. 
+ ComputationBuilder builder(client_, "while"); + auto init = builder.Tuple( + {builder.ConstantR0(0), builder.ConstantR1(10, 0.f)}); + auto while1 = builder.While(condition, body, init); + + auto while2 = builder.While(condition2, body2, while1); + + auto while_result1 = builder.GetTupleElement(while1, 1); + auto while_result2 = builder.GetTupleElement(while2, 1); + VLOG(2) << "while_result2 = " + << ShapeUtil::HumanString( + *builder.GetShape(while_result2).ConsumeValueOrDie()); + auto result = builder.Add(while_result1, while_result2); + VLOG(2) << "result = " + << ShapeUtil::HumanString( + *builder.GetShape(result).ConsumeValueOrDie()); + const float sum = c1 + c2; + std::vector expected(10, sum); + ComputeAndCompareR1(&builder, expected, {}, ErrorSpec(0.0001)); +} + +// Test while nodes that share the while body computation. +TEST_F(WhileTest, TwoWhileLoopsAndSharedBody) { + std::vector shape_elements = {ShapeUtil::MakeShape(S32, {}), + ShapeUtil::MakeShape(F32, {10})}; + Shape result_shape = ShapeUtil::MakeTupleShape(shape_elements); + + // Create a computation for the condition. + // Repeat for 5 iterations. + Computation condition; + const int c1 = 5; + { + ComputationBuilder builder(client_, "condition"); + auto prev = builder.Parameter(0, result_shape, "prev"); + auto iteration = builder.GetTupleElement(prev, 0); + builder.Lt(iteration, builder.ConstantR0(c1)); + TF_ASSIGN_OR_ASSERT_OK(condition, builder.Build()); + } + + Computation condition2; + const int c2 = 7; + { + ComputationBuilder builder(client_, "condition2"); + auto prev = builder.Parameter(0, result_shape, "prev"); + auto iteration = builder.GetTupleElement(prev, 0); + builder.Lt(iteration, builder.ConstantR0(c2)); + TF_ASSIGN_OR_ASSERT_OK(condition2, builder.Build()); + } + + // Create a computation for the body. + // Add 1 to the iteration variable and add a constant vector of 1.0f to + // the weight variable, both of which are tuple elements. + Computation body; + { + ComputationBuilder builder(client_, "body"); + auto prev = builder.Parameter(0, result_shape, "prev"); + auto iteration = builder.GetTupleElement(prev, 0); + auto weights = builder.GetTupleElement(prev, 1); + auto input = builder.ConstantR1(10, 1.f); + auto new_weights = builder.Add(weights, input); + auto result = builder.Tuple( + {builder.Add(iteration, builder.ConstantR0(1)), new_weights}); + TF_ASSIGN_OR_ASSERT_OK(body, builder.Build()); + } + + // Create a While node with computations for the condition and the body. + ComputationBuilder builder(client_, "while"); + auto init = builder.Tuple( + {builder.ConstantR0(0), builder.ConstantR1(10, 0.f)}); + auto while1 = builder.While(condition, body, init); + + auto while2 = builder.While(condition2, body, while1); + + auto while_result1 = builder.GetTupleElement(while1, 1); + auto while_result2 = builder.GetTupleElement(while2, 1); + VLOG(2) << "while_result2 = " + << ShapeUtil::HumanString( + *builder.GetShape(while_result2).ConsumeValueOrDie()); + auto result = builder.Add(while_result1, while_result2); + VLOG(2) << "result = " + << ShapeUtil::HumanString( + *builder.GetShape(result).ConsumeValueOrDie()); + const float sum = c1 + c2; + std::vector expected(10, sum); + ComputeAndCompareR1(&builder, expected, {}, ErrorSpec(0.0001)); +} + +// Test while nodes that share the while body computation. +// TODO(b/37245345): Fails on GPU backend. 
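A useful detail of `TwoWhileWithTupleResult` above: the second loop starts from the first loop's final tuple, so its counter resumes at c1 and its payload ends at c2 per element, not c2 - c1. A standalone host model reproducing the expected `c1 + c2` sum:

```c++
#include <array>
#include <cassert>

int main() {
  // Host model of TwoWhileWithTupleResult: loop 2 resumes from loop 1's
  // state (iteration = c1), so get<1>(w0) + get<1>(w1) is a vector of c1 + c2.
  const int c1 = 5, c2 = 7;
  int iteration = 0;
  std::array<float, 10> weights{};  // starts as vector<float>(10, 0.0f)
  while (iteration < c1) {
    ++iteration;
    for (float& w : weights) w += 1.0f;
  }
  const std::array<float, 10> w0 = weights;  // result of the first loop
  while (iteration < c2) {
    ++iteration;
    for (float& w : weights) w += 1.0f;
  }
  for (int i = 0; i < 10; ++i) assert(w0[i] + weights[i] == c1 + c2);
  return 0;
}
```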
+TEST_F(WhileTest, DISABLED_ON_GPU(WhileLoopsWithSharedBodyAndInit)) { + std::vector shape_elements = {ShapeUtil::MakeShape(S32, {}), + ShapeUtil::MakeShape(F32, {10})}; + Shape result_shape = ShapeUtil::MakeTupleShape(shape_elements); + + // Create a computation for the condition. + // Repeat for 5 iterations. + Computation condition; + const int c1 = 5; + { + ComputationBuilder builder(client_, "condition"); + auto prev = builder.Parameter(0, result_shape, "prev"); + auto iteration = builder.GetTupleElement(prev, 0); + builder.Lt(iteration, builder.ConstantR0(c1)); + TF_ASSIGN_OR_ASSERT_OK(condition, builder.Build()); + } + + Computation condition2; + const int c2 = 7; + { + ComputationBuilder builder(client_, "condition2"); + auto prev = builder.Parameter(0, result_shape, "prev"); + auto iteration = builder.GetTupleElement(prev, 0); + builder.Lt(iteration, builder.ConstantR0(c2)); + TF_ASSIGN_OR_ASSERT_OK(condition2, builder.Build()); + } + + // Create a computation for the body. + // Add 1 to the iteration variable and add a constant vector of 1.0f to + // the weight variable, both of which are tuple elements. + Computation body; + { + ComputationBuilder builder(client_, "body"); + auto prev = builder.Parameter(0, result_shape, "prev"); + auto iteration = builder.GetTupleElement(prev, 0); + auto weights = builder.GetTupleElement(prev, 1); + auto input = builder.ConstantR1(10, 1.f); + auto new_weights = builder.Add(weights, input); + auto result = builder.Tuple( + {builder.Add(iteration, builder.ConstantR0(1)), new_weights}); + TF_ASSIGN_OR_ASSERT_OK(body, builder.Build()); + } + + // Create a While node with computations for the condition and the body. + ComputationBuilder builder(client_, "while"); + auto init = builder.Tuple( + {builder.ConstantR0(0), builder.ConstantR1(10, 0.f)}); + auto while1 = builder.While(condition, body, init); + auto while2 = builder.While(condition2, body, init); + + auto while_result1 = builder.GetTupleElement(while1, 1); + auto while_result2 = builder.GetTupleElement(while2, 1); + VLOG(2) << "while_result2 = " + << ShapeUtil::HumanString( + *builder.GetShape(while_result2).ConsumeValueOrDie()); + auto result = builder.Add(while_result1, while_result2); + VLOG(2) << "result = " + << ShapeUtil::HumanString( + *builder.GetShape(result).ConsumeValueOrDie()); + const float sum = c1 + c2; + std::vector expected(10, sum); + ComputeAndCompareR1(&builder, expected, {}, ErrorSpec(0.0001)); +} + +// WhileTest that uses DynamicUpdateSlice instruction in body computation. +// Loop state tuple element 1 has as its single user operand(0) of +// DynamicUpdateSlice, which will trigger in-place dynamic slice update on GPU. +XLA_TEST_F(WhileTest, WhileWithDynamicUpdateSlice) { + std::vector shape_elements = {ShapeUtil::MakeShape(S32, {}), + ShapeUtil::MakeShape(F32, {10})}; + Shape result_shape = ShapeUtil::MakeTupleShape(shape_elements); + + // Create a computation for the condition. + // Repeat for 5 iterations. + Computation condition; + { + ComputationBuilder builder(client_, "condition"); + auto prev = builder.Parameter(0, result_shape, "prev"); + auto iteration = builder.GetTupleElement(prev, 0); + builder.Gt(builder.ConstantR0(5), iteration); + condition = builder.Build().ConsumeValueOrDie(); + } + + // Create a computation for the body. + // Add 1 to the iteration variable and add a constant vector of 1.0f to + // the weight variable, both of which are tuple elements. 
+ Computation body; + { + ComputationBuilder builder(client_, "body"); + auto prev = builder.Parameter(0, result_shape, "prev"); + // TupleElement 0 + auto iteration = builder.GetTupleElement(prev, 0); + auto out0 = builder.Add(iteration, builder.ConstantR0(1)); + // TupleElement 1 + auto input = builder.GetTupleElement(prev, 1); + // Update. + auto update = builder.ConvertElementType(builder.Broadcast(out0, {2}), F32); + // Starts = iteration * 2; + auto starts = builder.Reshape( + builder.Mul(iteration, builder.ConstantR0(2)), {1}); + // UpdateSlice. + auto out1 = builder.DynamicUpdateSlice(input, update, starts); + + auto result = builder.Tuple({out0, out1}); + body = builder.Build().ConsumeValueOrDie(); + } + + // Create a While node with computations for the condition and the body. + ComputationBuilder builder(client_, "while"); + auto init = builder.Tuple( + {builder.ConstantR0(0), builder.ConstantR1(10, 0.f)}); + auto result = builder.While(condition, body, init); + VLOG(2) << "while = " + << ShapeUtil::HumanString( + *builder.GetShape(result).ConsumeValueOrDie()); + + auto expected_counter = LiteralUtil::CreateR0(5); + auto expected_data = LiteralUtil::CreateR1( + {1.0f, 1.0f, 2.0f, 2.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f}); + auto expected = + LiteralUtil::MakeTuple({expected_counter.get(), expected_data.get()}); + VLOG(2) << "expected = " << ShapeUtil::HumanString(expected->shape()); + ComputeAndCompareTuple(&builder, *expected, {}, ErrorSpec(0.0001)); +} + // Tests a while node when the result type T is a vector of S32. // // int32 result = (0, 0, 0, 0, 0, 0); @@ -254,7 +541,8 @@ TEST_F(WhileTest, WhileWithTupleResult) { // result += (1, U[0, 100], U[0, 100], U[0, 100], U[0, 100], U[0, 100]); // } // -// This test misuses a vector to represent a pair: +// This test misuses a vector WhileTest.WhileLoopsWithSharedBodyto represent a +// pair: // ((iteration, (random vector))). // // Note: this test currently only tests generating random values within a loop. @@ -268,7 +556,8 @@ TEST_F(WhileTest, WhileWithPrngScalarResult) { auto build_condition = [this, v6s32](int count) { ComputationBuilder builder(client_, TestName()); auto prev = builder.Reshape( - builder.Slice(builder.Parameter(0, v6s32, "prev"), {0}, {1}), {0}, {}); + builder.Slice(builder.Parameter(0, v6s32, "prev"), {0}, {1}, {1}), {0}, + {}); builder.Gt(builder.ConstantR0(count), prev); return builder.Build().ConsumeValueOrDie(); }; @@ -308,6 +597,74 @@ TEST_F(WhileTest, WhileWithPrngScalarResult) { } } +// Tests nested while loops. +// +// int32 result = 0; +// while (result < 30) { +// int i = 0; +// while (i < 7) { +// result = result + 2; +// i = i + 1; +// } +// } +XLA_TEST_F(WhileTest, NestedWhileWithScalarResult) { + auto outer_result_shape = ShapeUtil::MakeShape(S32, {}); + auto inner_result_shape = ShapeUtil::MakeTupleShape( + {ShapeUtil::MakeShape(S32, {}), ShapeUtil::MakeShape(S32, {})}); + + Computation inner_condition; + { + ComputationBuilder builder(client_, "inner_condition"); + auto params = builder.Parameter(0, inner_result_shape, "prev"); + auto i = builder.GetTupleElement(params, 0); + builder.Lt(i, builder.ConstantR0(7)); + inner_condition = builder.Build().ConsumeValueOrDie(); + } + + // Creates a computation for the outer loop condition: + // repeat while result < 30. 
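The expected literal in `WhileWithDynamicUpdateSlice` above follows directly from the body: trip k (zero-based) writes the incremented counter k + 1 into the two-element window at offset 2k. A standalone host model:

```c++
#include <cassert>

int main() {
  // Host model of WhileWithDynamicUpdateSlice: each trip writes
  // broadcast(out0, {2}) at starts = iteration * 2.
  float data[10] = {0};
  for (int iteration = 0; iteration < 5; ++iteration) {
    const float out0 = static_cast<float>(iteration + 1);
    const int starts = iteration * 2;
    data[starts] = out0;
    data[starts + 1] = out0;
  }
  const float expected[10] = {1, 1, 2, 2, 3, 3, 4, 4, 5, 5};
  for (int i = 0; i < 10; ++i) assert(data[i] == expected[i]);
  return 0;
}
```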
+ Computation outer_condition; + { + ComputationBuilder builder(client_, "outer_condition"); + auto prev = builder.Parameter(0, outer_result_shape, "prev"); + builder.Lt(prev, builder.ConstantR0(30)); + outer_condition = builder.Build().ConsumeValueOrDie(); + } + + // Creates a computation for the inner loop body: add 1 to `i`, and add 2 to + // `result`. + Computation inner_body; + { + ComputationBuilder builder(client_, "inner_body"); + auto params = builder.Parameter(0, inner_result_shape, "prev"); + auto i = builder.GetTupleElement(params, 0); + auto result = builder.GetTupleElement(params, 1); + i = builder.Add(builder.ConstantR0(1), i); + result = builder.Add(builder.ConstantR0(2), result); + auto output = builder.Tuple({i, result}); + inner_body = builder.Build().ConsumeValueOrDie(); + } + + // Creates a computation for the outer loop: run the inner loop with i = 0. + Computation outer_body; + { + ComputationBuilder builder(client_, "outer_body"); + auto prev = builder.Parameter(0, outer_result_shape, "prev"); + auto init = builder.Tuple({builder.ConstantR0(0), prev}); + auto result = builder.While(inner_condition, inner_body, init); + auto output = builder.GetTupleElement(result, 1); + outer_body = builder.Build().ConsumeValueOrDie(); + } + + // Create a While node with computations for the condition and the body. + ComputationBuilder builder(client_, TestName()); + auto init = builder.ConstantR0(0); + auto result = builder.While(outer_condition, outer_body, init); + auto shape = builder.GetShape(result).ConsumeValueOrDie(); + + ComputeAndCompareR0(&builder, 42, {}); +} + void BM_WhileLoop(int num_iters) { // Benchmark a simple kernel to measure while loop overheads. tensorflow::testing::StopTiming(); @@ -354,19 +711,23 @@ void BM_WhileLoop(int num_iters) { builder.While(condition, body, init); auto computation = builder.Build().ConsumeValueOrDie(); + std::unique_ptr executable = + client->Compile(computation, {}, ExecutableBuildOptions()) + .ConsumeValueOrDie(); + // Run some warm-up executions. - LocalExecuteOptions options; + ExecutableRunOptions options; options.set_allocator(&allocator); const int kWarmups = 2; for (int i = 0; i < kWarmups; ++i) { - auto result = client->ExecuteLocally(computation, {}, options); + auto result = executable->Run({}, options); ASSERT_TRUE(result.ok()); } // Run benchmark. tensorflow::testing::StartTiming(); for (int i = 0; i < num_iters; ++i) { - auto result = client->ExecuteLocally(computation, {}, options); + auto result = executable->Run({}, options); ASSERT_TRUE(result.ok()); } } @@ -381,6 +742,7 @@ BENCHMARK(BM_WhileLoop); int main(int argc, char** argv) { std::vector flag_list; + xla::legacy_flags::AppendDebugOptionsFlags(&flag_list); xla::legacy_flags::AppendCpuCompilerFlags(&flag_list); xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/xla/text_literal_reader.h b/tensorflow/compiler/xla/text_literal_reader.h index 3cfbb2c7fbf..e45e5291c9b 100644 --- a/tensorflow/compiler/xla/text_literal_reader.h +++ b/tensorflow/compiler/xla/text_literal_reader.h @@ -18,6 +18,7 @@ limitations under the License. 
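The constant 42 expected by `NestedWhileWithScalarResult` can be checked by hand: each outer trip drains the inner loop for 7 iterations of +2, and the outer condition is only re-evaluated between trips, so the result steps 0, 14, 28, 42. As a standalone check:

```c++
#include <cassert>

int main() {
  // Host model of NestedWhileWithScalarResult: result passes 30 mid-trip and
  // stops at 42, not at the first value >= 30.
  int result = 0;
  while (result < 30) {
    int i = 0;
    while (i < 7) {
      result = result + 2;
      i = i + 1;
    }
  }
  assert(result == 42);
  return 0;
}
```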
#include +#include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" diff --git a/tensorflow/compiler/xla/text_literal_reader_test.cc b/tensorflow/compiler/xla/text_literal_reader_test.cc index 94d0f2646b1..a167d80f73b 100644 --- a/tensorflow/compiler/xla/text_literal_reader_test.cc +++ b/tensorflow/compiler/xla/text_literal_reader_test.cc @@ -19,10 +19,10 @@ limitations under the License. #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/env.h" -#include "tensorflow/core/platform/test.h" namespace xla { namespace { diff --git a/tensorflow/compiler/xla/text_literal_writer.h b/tensorflow/compiler/xla/text_literal_writer.h index 545bd22da91..7375493f430 100644 --- a/tensorflow/compiler/xla/text_literal_writer.h +++ b/tensorflow/compiler/xla/text_literal_writer.h @@ -16,6 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_TEXT_LITERAL_WRITER_H_ #define TENSORFLOW_COMPILER_XLA_TEXT_LITERAL_WRITER_H_ +#include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status.h" diff --git a/tensorflow/compiler/xla/text_literal_writer_test.cc b/tensorflow/compiler/xla/text_literal_writer_test.cc index 9dce4d13bb0..177ae4ea036 100644 --- a/tensorflow/compiler/xla/text_literal_writer_test.cc +++ b/tensorflow/compiler/xla/text_literal_writer_test.cc @@ -19,12 +19,12 @@ limitations under the License. 
#include #include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/test.h" namespace xla { namespace { diff --git a/tensorflow/compiler/xla/tools/BUILD b/tensorflow/compiler/xla/tools/BUILD index 46eab7f02bb..535e5b605b4 100644 --- a/tensorflow/compiler/xla/tools/BUILD +++ b/tensorflow/compiler/xla/tools/BUILD @@ -153,6 +153,7 @@ cc_binary( "//tensorflow/compiler/xla/client:computation", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/service", + "//tensorflow/compiler/xla/service:computation_tracker", "//tensorflow/compiler/xla/service:session_proto", "//tensorflow/core:lib", ], @@ -176,6 +177,24 @@ cc_binary( ], ) +cc_binary( + name = "dumped_computation_to_tf_graphdef", + srcs = ["dumped_computation_to_tf_graphdef.cc"], + deps = [ + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla/client", + "//tensorflow/compiler/xla/client:client_library", + "//tensorflow/compiler/xla/client:computation", + "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/legacy_flags:hlo_graph_dumper_flags", + "//tensorflow/compiler/xla/service", + "//tensorflow/compiler/xla/service:hlo_graph_dumper", + "//tensorflow/compiler/xla/service:session_proto", + "//tensorflow/core:lib", + ], +) + # ----------------------------------------------------------------------------- filegroup( diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc index 4c242abc9b7..8d7f7fd1237 100644 --- a/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc +++ b/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc @@ -81,6 +81,7 @@ void RealMain(tensorflow::gtl::ArraySlice args) { client->GetComputationShape(computation).ConsumeValueOrDie(); std::vector layouts; + layouts.reserve(program_shape->parameters_size()); for (int i = 0; i < program_shape->parameters_size(); ++i) { layouts.push_back(&program_shape->parameters(i)); } diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc index 8b96e134897..2a3a8803283 100644 --- a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc +++ b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/computation.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/service/computation_tracker.h" #include "tensorflow/compiler/xla/service/service.h" #include "tensorflow/compiler/xla/service/session.pb.h" #include "tensorflow/compiler/xla/statusor.h" @@ -34,7 +35,7 @@ limitations under the License. 
namespace xla { namespace tools { -void RealMain(tensorflow::gtl::ArraySlice args) { +void RealMain(tensorflow::gtl::ArraySlice args, bool compile) { LocalClient* client = ClientLibrary::LocalClientOrDie(); LocalService* local_service = ClientLibrary::GetXlaService(client->platform()); @@ -50,23 +51,37 @@ void RealMain(tensorflow::gtl::ArraySlice args) { } Computation computation = computation_status.ConsumeValueOrDie(); - std::unique_ptr program_shape = - client->GetComputationShape(computation).ConsumeValueOrDie(); + if (compile) { + std::unique_ptr program_shape = + client->GetComputationShape(computation).ConsumeValueOrDie(); - std::vector layouts; - for (int i = 0; i < program_shape->parameters_size(); ++i) { - layouts.push_back(&program_shape->parameters(i)); + std::vector layouts; + layouts.reserve(program_shape->parameters_size()); + for (int i = 0; i < program_shape->parameters_size(); ++i) { + layouts.push_back(&program_shape->parameters(i)); + } + StatusOr> executable = + local_service->CompileExecutable( + computation.handle(), layouts, &program_shape->result(), + /*device_ordinal=*/0, /*has_hybrid_result=*/true); + + const HloModule& module = executable.ValueOrDie()->module(); + + fprintf(stdout, "HLO compiled for %s backend:\n%s\n", + local_service->backend().platform()->Name().c_str(), + module.ToString().c_str()); + } else { + const ComputationTracker& tracker = local_service->computation_tracker(); + UserComputation* user_computation = + tracker.Resolve(computation.handle()).ConsumeValueOrDie(); + VersionedComputationHandle versioned_handle = + user_computation->GetVersionedHandle(); + std::unique_ptr module = + tracker.BuildHloModule(versioned_handle, HloModuleConfig()) + .ConsumeValueOrDie(); + + fprintf(stdout, "%s\n", module->ToString().c_str()); } - StatusOr> executable = - local_service->CompileExecutable( - computation.handle(), layouts, &program_shape->result(), - /*device_ordinal=*/0, /*has_hybrid_result=*/true); - - const HloModule& module = executable.ValueOrDie()->module(); - - fprintf(stdout, "HLO for %s backend:\n%s\n", - local_service->backend().platform()->Name().c_str(), - module.ToString().c_str()); } } @@ -74,10 +89,21 @@ void RealMain(tensorflow::gtl::ArraySlice args) { } // namespace xla int main(int argc, char** argv) { - tensorflow::port::InitMain(argv[0], &argc, &argv); + bool compile = false; + std::vector flag_list = { + {"compile", &compile, + "If true, compile the computation using the default client before " + "dumping the HLO. Otherwise dump the raw (uncompiled) HLO."}, + }; + const xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); + bool parsed_flags_ok = tensorflow::Flags::Parse(&argc, argv, flag_list); + QCHECK(parsed_flags_ok) << "\n" << usage; + + tensorflow::port::InitMain(usage.c_str(), &argc, &argv); + QCHECK(argc > 1) << "\nERROR: must specify at least one module\n" << usage; tensorflow::gtl::ArraySlice args(argv, argc); args.pop_front(); // Pop off the binary name, argv[0] - xla::tools::RealMain(args); + xla::tools::RealMain(args, compile); return 0; } diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc new file mode 100644 index 00000000000..850267d3195 --- /dev/null +++ b/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc @@ -0,0 +1,78 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Usage: dumped_computation_to_tf_graph some_binary_snapshot_proto* +// +// Dumps a tensorflow GraphDef in text format for a snapshot computation. The +// dumped graph is an HLO computation with HLO instructions as nodes and can be +// visualized on Tensorboard. Upload the dumped files on Tensorboard. +// +// some_binary_snapshot_proto is obtained by serializing the SessionModule from +// ServiceInterface::SnapshotComputation to disk. + +#include +#include +#include + +#include "tensorflow/compiler/xla/client/client.h" +#include "tensorflow/compiler/xla/client/client_library.h" +#include "tensorflow/compiler/xla/client/computation.h" +#include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/legacy_flags/hlo_graph_dumper_flags.h" +#include "tensorflow/compiler/xla/service/service.h" +#include "tensorflow/compiler/xla/service/session.pb.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/init_main.h" +#include "tensorflow/core/platform/logging.h" + +using tensorflow::Env; + +namespace xla { +namespace tools { + +void RealMain(tensorflow::gtl::ArraySlice args) { + Client* client = ClientLibrary::LocalClientOrDie(); + for (char* arg : args) { + SessionModule module; + TF_CHECK_OK( + tensorflow::ReadBinaryProto(tensorflow::Env::Default(), arg, &module)); + Computation computation = client->LoadSnapshot(module).ConsumeValueOrDie(); + ComputationStats stats = + client->GetComputationStats(computation).ConsumeValueOrDie(); + fprintf(stdout, ">>> %s :: %s\n", arg, stats.DebugString().c_str()); + } +} + +} // namespace tools +} // namespace xla + +int main(int argc, char** argv) { + tensorflow::port::InitMain(argv[0], &argc, &argv); + + xla::legacy_flags::ServiceFlags* flags = xla::legacy_flags::GetServiceFlags(); + flags->xla_generate_hlo_graph = ".*"; + + xla::legacy_flags::HloGraphDumperFlags* dumper_flags = + xla::legacy_flags::GetHloGraphDumperFlags(); + dumper_flags->xla_hlo_dump_as_graphdef = true; + + tensorflow::gtl::ArraySlice args(argv, argc); + args.pop_front(); // Pop off the binary name, argv[0] + xla::tools::RealMain(args); + return 0; +} diff --git a/tensorflow/compiler/xla/tools/replay_computation.cc b/tensorflow/compiler/xla/tools/replay_computation.cc index ffb2d5aefba..3a75bf64954 100644 --- a/tensorflow/compiler/xla/tools/replay_computation.cc +++ b/tensorflow/compiler/xla/tools/replay_computation.cc @@ -66,7 +66,8 @@ StatusOr> ReplayComputation( if (use_fake_data) { arguments = MakeFakeArgumentsOrDie(computation, client); } else { // use recorded data if available - for (const Literal& literal : module.arguments()) { + for (const auto& proto : module.arguments()) { + Literal literal(proto); TF_ASSIGN_OR_RETURN(std::unique_ptr data, client->TransferToServer(literal)); 
arguments.push_back(std::move(data)); @@ -74,6 +75,7 @@ StatusOr> ReplayComputation( } std::vector execute_arguments; + execute_arguments.reserve(arguments.size()); for (auto& argument : arguments) { execute_arguments.push_back(argument.get()); } @@ -100,7 +102,7 @@ void RealMain(tensorflow::gtl::ArraySlice args, bool use_fake_data) { if (module.has_result()) { fprintf(stdout, "was %s:%s\n", ShapeUtil::HumanString(module.result().shape()).c_str(), - LiteralUtil::ToString(module.result()).c_str()); + LiteralUtil::ToString(Literal(module.result())).c_str()); } } } diff --git a/tensorflow/compiler/xla/tools/show_literal.cc b/tensorflow/compiler/xla/tools/show_literal.cc index cf363913b15..b6538f5de07 100644 --- a/tensorflow/compiler/xla/tools/show_literal.cc +++ b/tensorflow/compiler/xla/tools/show_literal.cc @@ -37,9 +37,10 @@ int main(int argc, char **argv) { << " "; } - xla::Literal literal; + xla::LiteralProto literal_proto; TF_CHECK_OK(tensorflow::ReadBinaryProto(tensorflow::Env::Default(), argv[1], - &literal)); - LOG(INFO) << "literal: " << literal.ShortDebugString(); + &literal_proto)); + xla::Literal literal(literal_proto); + LOG(INFO) << "literal: " << literal_proto.ShortDebugString(); fprintf(stderr, "%s\n", xla::LiteralUtil::ToString(literal).c_str()); } diff --git a/tensorflow/compiler/xla/types.h b/tensorflow/compiler/xla/types.h index 8258031a2c5..ea8b4b7b989 100644 --- a/tensorflow/compiler/xla/types.h +++ b/tensorflow/compiler/xla/types.h @@ -16,8 +16,11 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_TYPES_H_ #define TENSORFLOW_COMPILER_XLA_TYPES_H_ +#include "third_party/eigen3/Eigen/Core" #include "tensorflow/core/platform/types.h" +#include + namespace xla { using ::tensorflow::string; @@ -32,6 +35,8 @@ using ::tensorflow::uint16; using ::tensorflow::uint32; using ::tensorflow::uint64; +using ::Eigen::half; + } // namespace xla #endif // TENSORFLOW_COMPILER_XLA_TYPES_H_ diff --git a/tensorflow/compiler/xla/util.cc b/tensorflow/compiler/xla/util.cc index 3ee5dfc9496..d467178cb52 100644 --- a/tensorflow/compiler/xla/util.cc +++ b/tensorflow/compiler/xla/util.cc @@ -33,7 +33,7 @@ namespace { // Adds a backtrace to the provided status iff the xla_status_add_backtrace flag // is set. This is useful for quickly tracing status errors observed coming out // of the service. 
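The replay_computation.cc and show_literal.cc fixes above adapt to `Literal` becoming a class distinct from the wire-format `LiteralProto`. A condensed fragment of the new read path (not standalone; `path` stands in for the tool's `argv[1]`, and the `Literal(const LiteralProto&)` constructor is assumed from this change rather than from any documented API):

```c++
// Read the serialized proto from disk, then wrap it before any LiteralUtil
// call expects a first-class Literal.
xla::LiteralProto literal_proto;
TF_CHECK_OK(tensorflow::ReadBinaryProto(tensorflow::Env::Default(), path,
                                        &literal_proto));
xla::Literal literal(literal_proto);  // proto -> first-class Literal
fprintf(stderr, "%s\n", xla::LiteralUtil::ToString(literal).c_str());
```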
-Status MaybeAddBacktrace(Status prior) { +Status MaybeAddBacktrace(const Status& prior) { DCHECK(!prior.ok()); if (legacy_flags::GetUtilFlags()->xla_status_add_backtrace) { return Status{prior.code(), @@ -153,16 +153,26 @@ string Reindent(tensorflow::StringPiece original, }); } +bool IsPermutation(tensorflow::gtl::ArraySlice permutation, int64 rank) { + if (rank != permutation.size()) { + return false; + } + std::vector output(permutation.size(), -1); + for (auto index : permutation) { + CHECK_GE(index, 0); + CHECK_LT(index, rank); + output[index] = 0; + } + return std::find(output.begin(), output.end(), -1) == output.end(); +} + std::vector InversePermutation( tensorflow::gtl::ArraySlice input_permutation) { + DCHECK(IsPermutation(input_permutation, input_permutation.size())); std::vector output_permutation(input_permutation.size(), -1); for (size_t i = 0; i < input_permutation.size(); ++i) { output_permutation[input_permutation[i]] = i; } - DCHECK_EQ( - 0, std::count(output_permutation.begin(), output_permutation.end(), -1)); - DCHECK(std::is_permutation(input_permutation.begin(), input_permutation.end(), - output_permutation.begin())); return output_permutation; } @@ -176,6 +186,15 @@ std::vector ComposePermutations(tensorflow::gtl::ArraySlice p1, return output; } +bool IsIdentityPermutation(tensorflow::gtl::ArraySlice p) { + for (int64 i = 0; i < p.size(); ++i) { + if (p[i] != i) { + return false; + } + } + return true; +} + PaddingConfig MakeNoPaddingConfig(int64 rank) { PaddingConfig padding_config; for (int64 dnum = 0; dnum < rank; ++dnum) { @@ -187,6 +206,15 @@ PaddingConfig MakeNoPaddingConfig(int64 rank) { return padding_config; } +bool HasInteriorPadding(const PaddingConfig& config) { + for (const auto& dim : config.dimensions()) { + if (dim.interior_padding() != 0) { + return true; + } + } + return false; +} + string HumanReadableNumFlops(double flops, double nanoseconds) { if (nanoseconds == 0) { return "NaN FLOP/s"; diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h index 00f8d946f89..42d5c1d1550 100644 --- a/tensorflow/compiler/xla/util.h +++ b/tensorflow/compiler/xla/util.h @@ -31,6 +31,7 @@ limitations under the License. #include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/lib/math/math_util.h" #include "tensorflow/core/lib/strings/numbers.h" +#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/protobuf.h" @@ -38,6 +39,13 @@ limitations under the License. namespace xla { +// Ranks greater than 8 are very rare, so use InlinedVector to store +// the bounds and indices. And for the rare cases of ranks greater than 8, +// the InlinedVector will just behave like an std::vector<> and allocate the +// memory to store its values. +static constexpr int kInlineRank = 8; +using DimensionVector = tensorflow::gtl::InlinedVector; + // RAII timer that logs with a given label the wall clock time duration in human // readable form. This differs from base's ElapsedTimer primarily in that it // spits out the human-readable duration form. @@ -120,6 +128,14 @@ bool ContainersEqual(const Container1T& c1, const Container2T& c2) { std::equal(std::begin(c1), std::end(c1), std::begin(c2))); } +template +bool ContainersEqual(const Container1T& c1, + std::initializer_list il) { + tensorflow::gtl::ArraySlice c2{il}; + return ContainersEqual(c1, c2); +} + // Compares two containers for equality. 
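The new util.h helpers are small enough to model exactly. Below is a standalone sketch with local stand-ins for `IsPermutation`, the `InversePermutation` contract (output[input[i]] = i), and the `StridedCopy` template introduced a few hunk lines further down; plain `int` replaces xla's `int64` for brevity:

```c++
#include <cassert>
#include <cstddef>
#include <vector>

// Local stand-in matching the documented contract: true iff `permutation`
// is a permutation of the [0, rank) integer range.
bool IsPermutation(const std::vector<int>& permutation, int rank) {
  if (rank != static_cast<int>(permutation.size())) return false;
  std::vector<bool> seen(permutation.size(), false);
  for (int index : permutation) {
    if (index < 0 || index >= rank || seen[index]) return false;
    seen[index] = true;
  }
  return true;
}

// Local stand-in for StridedCopy: copies `count` values with independent
// source and destination strides, casting element types along the way.
template <typename D, typename S>
void StridedCopy(std::vector<D>& dest, int dest_base, int dest_stride,
                 const std::vector<S>& src, int src_base, int src_stride,
                 int count) {
  for (; count > 0; --count, dest_base += dest_stride, src_base += src_stride)
    dest[dest_base] = static_cast<D>(src[src_base]);
}

int main() {
  // {2, 0, 1} permutes [0, 3); its inverse is {1, 2, 0}, because
  // InversePermutation satisfies output[input[i]] = i.
  const std::vector<int> perm = {2, 0, 1};
  assert(IsPermutation(perm, 3));
  std::vector<int> inverse(perm.size());
  for (std::size_t i = 0; i < perm.size(); ++i)
    inverse[perm[i]] = static_cast<int>(i);
  assert((inverse == std::vector<int>{1, 2, 0}));

  // Gather every second int from src into dest, casting to float.
  const std::vector<int> src = {0, 10, 20, 30, 40, 50};
  std::vector<float> dest(3, 0.0f);
  StridedCopy(dest, /*dest_base=*/0, /*dest_stride=*/1, src,
              /*src_base=*/0, /*src_stride=*/2, /*count=*/3);
  assert(dest[0] == 0.0f && dest[1] == 20.0f && dest[2] == 40.0f);
  return 0;
}
```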
Returns true iff the two containers // have the same size and all their elements compare equal using the predicate // p. Like std::equal, but forces size equality. @@ -130,6 +146,18 @@ bool ContainersEqual(const Container1T& c1, const Container2T& c2, std::equal(std::begin(c1), std::end(c1), std::begin(c2), p)); } +// Performs a copy of count values from src to dest, using different strides for +// source and destination. The source starting index is src_base, while the +// destination one is dest_base. +template <typename D, typename S> +void StridedCopy(tensorflow::gtl::MutableArraySlice<D> dest, int64 dest_base, + int64 dest_stride, tensorflow::gtl::ArraySlice<S> src, + int64 src_base, int64 src_stride, int64 count) { + for (; count > 0; --count, dest_base += dest_stride, src_base += src_stride) { + dest[dest_base] = static_cast<D>(src[src_base]); + } +} + // Adds some context information to the error message in a // Status. This is useful as Statuses are // propagated upwards. @@ -156,6 +184,9 @@ Status Unavailable(const char* format, ...) TF_PRINTF_ATTRIBUTE(1, 2); string Reindent(tensorflow::StringPiece original, tensorflow::StringPiece indentation); +// Checks whether permutation is a permutation of the [0, rank) integer range. +bool IsPermutation(tensorflow::gtl::ArraySlice<int64> permutation, int64 rank); + // Applies `permutation` on `input` and returns the permuted array. // For each i, output[permutation[i]] = input[i]. // @@ -166,12 +197,11 @@ template diff --git a/tensorflow/tensorboard/components/tf_backend/BUILD b/tensorflow/tensorboard/components/tf_backend/BUILD new file mode 100644 index 00000000000..50fc267dc4d --- /dev/null +++ b/tensorflow/tensorboard/components/tf_backend/BUILD @@ -0,0 +1,45 @@ +package(default_visibility = ["//tensorflow/tensorboard:internal"]) + +load("//tensorflow/tensorboard/defs:defs.bzl", "tensorboard_webcomponent_library") +load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library") + +licenses(["notice"]) # Apache 2.0 + +ts_web_library( + name = "tf_backend", + srcs = [ + "backend.ts", + "behavior.ts", + "requestManager.ts", + "router.ts", + "runsStore.ts", + "tf-backend.html", + "urlPathHelpers.ts", + ], + path = "/tf-backend", + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/tensorboard/components/tf_imports:d3", + "//tensorflow/tensorboard/components/tf_imports:lodash", + "//tensorflow/tensorboard/components/tf_imports:plottable", + "//tensorflow/tensorboard/components/tf_imports:polymer", + "//tensorflow/tensorboard/components/vz_sorting", + ], +) + +tensorboard_webcomponent_library( + name = "legacy", + srcs = [":tf_backend"], + destdir = "tf-backend", + deps = [ + "//tensorflow/tensorboard/components/tf_imports_google:lib", + "//tensorflow/tensorboard/components/vz_sorting:legacy", + "//third_party/javascript/polymer/v1/polymer:lib", + ], +) + +filegroup( + name = "all_files", + srcs = glob(["**"]), + tags = ["notsan"], +) diff --git a/tensorflow/tensorboard/components/tf_backend/backend.ts b/tensorflow/tensorboard/components/tf_backend/backend.ts index 28a5b2d0e14..023414b6b75 100644 --- a/tensorflow/tensorboard/components/tf_backend/backend.ts +++ b/tensorflow/tensorboard/components/tf_backend/backend.ts @@ -13,427 +13,596 @@ See the License for the specific language governing permissions and limitations under the License.
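StridedCopy, added to util.h above, copies count elements while advancing the source and destination cursors by independent strides. A sketch of the same loop in TypeScript, with a small gather example:

```typescript
// TypeScript rendering of the StridedCopy loop added to util.h: copy
// `count` elements, advancing each side by its own stride.
function stridedCopy(
    dest: number[], destBase: number, destStride: number,
    src: number[], srcBase: number, srcStride: number, count: number): void {
  for (; count > 0; --count, destBase += destStride, srcBase += srcStride) {
    dest[destBase] = src[srcBase];
  }
}

// Gather every second element of src into consecutive dest slots.
const src = [10, 11, 12, 13, 14, 15];
const dest = [0, 0, 0];
stridedCopy(dest, 0, 1, src, 0, 2, 3);
console.log(dest);  // [10, 12, 14]
```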
==============================================================================*/ -module TF.Backend { - export interface RunEnumeration { - histograms: string[]; - compressedHistogramTuples: string[]; - scalars: string[]; - images: string[]; - audio: string[]; - graph: boolean; - run_metadata: string[]; - } +import {compareTagNames} from '../vz-sorting/sorting'; +import {RequestManager} from './requestManager'; +import {getRouter} from './router'; +import {demoify, queryEncoder} from './urlPathHelpers'; - export interface LogdirResponse { logdir: string; } +export interface RunEnumeration { + histograms: string[]; + compressedHistogramTuples: string[]; + scalars: string[]; + images: string[]; + audio: string[]; + graph: boolean; + run_metadata: string[]; +} - export interface RunsResponse { [runName: string]: RunEnumeration; } +export interface LogdirResponse { logdir: string; } - export type RunToTag = {[run: string]: string[];}; +export interface RunsResponse { [runName: string]: RunEnumeration; } - export interface Datum { - wall_time: Date; - step: number; - } +export type RunToTag = { + [run: string]: string[]; +}; - export type ScalarDatum = Datum & Scalar; - export interface Scalar { scalar: number; } +export interface Datum { + wall_time: Date; + step: number; +} - export type HistogramDatum = Datum & Histogram; - export interface Histogram { - min: number; - max: number; - nItems?: number; - sum?: number; - sumSquares?: number; - bucketRightEdges: number[]; - bucketCounts: number[]; - } +export type ScalarDatum = Datum & Scalar; +export interface Scalar { scalar: number; } - export interface HistogramBin { - x: number; - dx: number; - y: number; - } - export type HistogramSeriesDatum = HistogramSeries & Datum; - export interface HistogramSeries { bins: HistogramBin[]; } +export interface Text { text: string; } +export type TextDatum = Datum & Text; - export type ImageDatum = Datum & Image; - export interface Image { - width: number; - height: number; - url: string; - } +export type HistogramDatum = Datum & Histogram; +export interface Histogram { + min: number; + max: number; + nItems?: number; + sum?: number; + sumSquares?: number; + bucketRightEdges: number[]; + bucketCounts: number[]; +} - export type AudioDatum = Datum & Audio; - export interface Audio { - content_type: string; - url: string; - } +export interface HistogramBin { + x: number; + dx: number; + y: number; +} +export type HistogramSeriesDatum = HistogramSeries & Datum; +export interface HistogramSeries { bins: HistogramBin[]; } - // A health pill encapsulates an overview of tensor element values. The value - // field is a list of 12 numbers that shed light on the status of the tensor. - export interface HealthPill { - node_name: string; - output_slot: number; - value: number[]; - }; - export type HealthPillDatum = Datum & HealthPill; - // A health pill response is a mapping from node name to a list of health pill - // data entries. - export interface HealthPillsResponse { [key: string]: HealthPillDatum[]; }; +export type ImageDatum = Datum & Image; +export interface Image { + width: number; + height: number; + url: string; +} + +export type AudioDatum = Datum & Audio; +export interface Audio { + content_type: string; + url: string; +} + +// A health pill encapsulates an overview of tensor element values. The value +// field is a list of 12 numbers that shed light on the status of the tensor. 
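The exported datum types above lean on TypeScript intersection types: a ScalarDatum is just a Datum with a scalar field merged in. A self-contained sketch of how such a value is assembled:

```typescript
// ScalarDatum = Datum & Scalar: a value must satisfy both halves.
interface Datum {
  wall_time: Date;
  step: number;
}
interface Scalar {
  scalar: number;
}
type ScalarDatum = Datum & Scalar;

const point: ScalarDatum = {
  wall_time: new Date(40000),
  step: 7,
  scalar: 0.25,
};
```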
+export interface HealthPill { + device_name: string; + node_name: string; + output_slot: number; + dtype: string; + shape: number[]; + value: number[]; +} + +// When updating this type, keep it consistent with the HealthPill interface +// in tf_graph_common/lib/scene/scene.ts. +export type HealthPillDatum = Datum & HealthPill; +// A health pill response is a mapping from node name to a list of health pill +// data entries. +export interface HealthPillsResponse { [key: string]: HealthPillDatum[]; } + +// An object that encapsulates an alert issued by the debugger. This alert is +// sent by debugging libraries after bad values (NaN, +/- Inf) are encountered. +export interface DebuggerNumericsAlertReport { + device_name: string; + tensor_name: string; + first_timestamp: number; + nan_event_count: number; + neg_inf_event_count: number; + pos_inf_event_count: number; +} +// A DebuggerNumericsAlertReportResponse contains alerts issued by the debugger +// in ascending order of timestamp. This helps the user identify for instance +// when bad values first appeared in the model. +export type DebuggerNumericsAlertReportResponse = DebuggerNumericsAlertReport[]; + +export const TYPES = [ + 'scalar', 'histogram', 'compressedHistogram', 'graph', 'image', 'audio', + 'runMetadata', 'text' +]; +/** + * The Backend class provides a convenient and typed interface to the backend. + * + * It provides methods corresponding to the different data sources on the + * TensorBoard backend. These methods return a promise containing the data + * from the backend. This class does some post-processing on the data; for + * example, converting data elements tuples into js objects so that they can + * be accessed in a more convenient and clearly-documented fashion. + */ +export class Backend { + public requestManager: RequestManager; - export var TYPES = [ - 'scalar', 'histogram', 'compressedHistogram', 'graph', 'image', 'audio', - 'runMetadata' - ]; /** - * The Backend class provides a convenient and typed interface to the backend. - * - * It provides methods corresponding to the different data sources on the - * TensorBoard backend. These methods return a promise containing the data - * from the backend. This class does some post-processing on the data; for - * example, converting data elements tuples into js objects so that they can - * be accessed in a more convenient and clearly-documented fashion. + * Construct a Backend instance. + * @param requestManager The RequestManager, overwritable so you may + * manually clear request queue, etc. Defaults to a new RequestManager. */ - export class Backend { - public router: Router; - public requestManager: RequestManager; + constructor(requestManager?: RequestManager) { + this.requestManager = requestManager || new RequestManager(); + } - /** - * Construct a Backend instance. - * @param router the Router with info on what urls to get data from - * @param requestManager The RequestManager, overwritable so you may - * manually clear request queue, etc. Defaults to a new RequestManager. - */ - constructor(router: Router, requestManager?: RequestManager) { - this.router = router; - this.requestManager = requestManager || new RequestManager(); + /** + * Returns a promise for requesting the logdir string. + */ + public logdir(): Promise { + return this.requestManager.request(getRouter().logdir()); + } + + /** + * Returns a listing of all the available data in the TensorBoard backend. 
+ */ + public runs(): Promise { + return this.requestManager.request(getRouter().runs()); + } + + /** + * Return a promise showing the Run-to-Tag mapping for scalar data. + */ + public scalarTags(): Promise { + return this.requestManager.request( + getRouter().pluginRoute('scalars', '/tags')); + } + + /** + * Return a promise showing the Run-to-Tag mapping for histogram data. + */ + public histogramTags(): Promise { + return this.requestManager.request( + getRouter().pluginRoute('histograms', '/tags')); + } + + /** + * Return a promise showing the Run-to-Tag mapping for image data. + */ + public imageTags(): Promise { + return this.requestManager.request( + getRouter().pluginRoute('images', '/tags')); + } + + /** + * Return a promise showing the Run-to-Tag mapping for audio data. + */ + public audioTags(): Promise { + return this.requestManager.request( + getRouter().pluginRoute('audio', '/tags')); + } + + /** + * Return a promise showing the Run-to-Tag mapping for compressedHistogram + * data. + */ + public compressedHistogramTags(): Promise { + return this.requestManager.request( + getRouter().pluginRoute('distributions', '/tags')); + } + + /** + * Returns a promise showing the Run-to-Tag mapping for profile data. + */ + public profileTags(): Promise { + let url = getRouter().pluginRoute('profile', '/tags'); + if (getRouter().isDemoMode()) { + url += '.json'; } + return this.requestManager.request(url); + } - /** - * Returns a promise for requesting the logdir string. - */ - public logdir(): Promise { - return this.requestManager.request(this.router.logdir()); + /** + * Return a promise showing list of runs that contain graphs. + */ + public graphRuns(): Promise { + return this.requestManager.request( + getRouter().pluginRoute('graphs', '/runs')); + } + + /** + * Return a promise showing the Run-to-Tag mapping for run_metadata objects. + */ + public runMetadataTags(): Promise { + return this.requestManager.request( + getRouter().pluginRoute('graphs', '/run_metadata_tags')); + } + + + /** + * Returns a promise showing the Run-to-Tag mapping for text data. + */ + public textRuns(): Promise { + return this.requestManager.request(getRouter().textRuns()); + } + + + /** + * Returns a promise containing TextDatums for given run and tag. + */ + public text(tag: string, run: string): Promise { + const url = getRouter().text(tag, run); + // tslint:disable-next-line:no-any it's convenient and harmless here + return this.requestManager.request(url).then(map((x: any) => { + x.wall_time = timeToDate(x.wall_time); + return x; + })); + } + + /** + * Return a URL to fetch a graph (cf. method 'graph'). + */ + public graphUrl(run: string, limitAttrSize?: number, largeAttrsKey?: string): + string { + const demoMode = getRouter().isDemoMode(); + const base = getRouter().pluginRoute('graphs', '/graph'); + const optional = (p) => (p != null && !demoMode || undefined) && p; + const parameters = { + 'run': run, + 'limit_attr_size': optional(limitAttrSize), + 'large_attrs_key': optional(largeAttrsKey), + }; + const extension = demoMode ? '.pbtxt' : ''; + return base + queryEncoder(parameters) + extension; + } + + public graph(run: string, limitAttrSize?: number, largeAttrsKey?: string): + Promise { + const url = this.graphUrl(run, limitAttrSize, largeAttrsKey); + return this.requestManager.request(url); + } + + /** + * Return a promise containing ScalarDatums for given run and tag. 
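Each of the *Tags methods above resolves to a RunToTag mapping served by the corresponding plugin route. A sketch of the shape consumers receive (run and tag names invented):

```typescript
// Shape of the response the tag routes resolve to: a RunToTag mapping.
const runToTag: {[run: string]: string[]} = {
  'train': ['loss', 'accuracy'],
  'eval': ['loss'],
};

// Typical consumption: list the runs that carry a particular tag.
const runsWithLoss =
    Object.keys(runToTag).filter((run) => runToTag[run].indexOf('loss') >= 0);
console.log(runsWithLoss);  // ['train', 'eval']
```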
+ */ + public scalar(tag: string, run: string): Promise> { + let p: Promise[]>; + const url = getRouter().pluginRunTagRoute('scalars', '/scalars')(tag, run); + p = this.requestManager.request(url); + return p.then(map(detupler(createScalar))); + } + + /** + * Returns a promise for requesting the health pills for a list of nodes. This + * route is used by the debugger plugin. + */ + public healthPills(nodeNames: string[], step?: number): + Promise { + const postData = { + 'node_names': JSON.stringify(nodeNames), + + // Events files with debugger data fall under this special run. + 'run': '__debugger_data__', + }; + if (step !== undefined) { + // The user requested health pills for a specific step. This request + // might be slow since the backend reads events sequentially from disk. + postData['step'] = step; } + return this.requestManager.request(getRouter().healthPills(), postData); + } - /** - * Returns a listing of all the available data in the TensorBoard backend. - */ - public runs(): Promise { - return this.requestManager.request(this.router.runs()); - } + /** + * Returns a promise for alerts for bad values (detected by the debugger). + * This route is used by the debugger plugin. + */ + public debuggerNumericsAlerts(): + Promise { + return this.requestManager.request( + getRouter().pluginRoute('debugger', '/numerics_alert_report')); + } - /** - * Return a promise showing the Run-to-Tag mapping for scalar data. - */ - public scalarRuns(): Promise { - return this.runs().then((x) => _.mapValues(x, 'scalars')); - } + /** + * Return a promise containing HistogramDatums for given run and tag. + */ + public histogram(tag: string, run: string): + Promise> { + let p: Promise[]>; + const url = + getRouter().pluginRunTagRoute('histograms', '/histograms')(tag, run); + p = this.requestManager.request(url); + return p.then(map(detupler(createHistogram))).then(function(histos) { + // Get the minimum and maximum values across all histograms so that the + // visualization is aligned for all timesteps. + const min = d3.min(histos, d => d.min); + const max = d3.max(histos, d => d.max); - /** - * Return a promise showing the Run-to-Tag mapping for histogram data. - */ - public histogramRuns(): Promise { - return this.runs().then((x) => _.mapValues(x, 'histograms')); - } - - /** - * Return a promise showing the Run-to-Tag mapping for image data. - */ - public imageRuns(): Promise { - return this.runs().then((x) => _.mapValues(x, 'images')); - } - - /** - * Return a promise showing the Run-to-Tag mapping for audio data. - */ - public audioRuns(): Promise { - return this.runs().then((x) => _.mapValues(x, 'audio')); - } - - /** - * Return a promise showing the Run-to-Tag mapping for compressedHistogram - * data. - */ - public compressedHistogramRuns(): Promise { - return this.runs().then((x) => _.mapValues(x, 'compressedHistograms')); - } - - /** - * Return a promise showing list of runs that contain graphs. - */ - public graphRuns(): Promise { - return this.runs().then( - (x) => { return _.keys(x).filter((k) => x[k].graph); }); - } - - /** - * Return a promise showing the Run-to-Tag mapping for run_metadata objects. - */ - public runMetadataRuns(): Promise { - return this.runs().then((x) => _.mapValues(x, 'run_metadata')); - } - - /** - * Return a promise of a graph string from the backend. 
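Note how the histogram method pins every timestep to one shared [min, max] range before re-binning, so the x-axes line up across steps. The same alignment step, sketched without d3:

```typescript
// Compute one global [min, max] across all timesteps so that every
// histogram is re-binned against the same axis (the role d3.min/d3.max
// play in the histogram method above).
interface Range { min: number; max: number; }

function globalRange(histos: Range[]): [number, number] {
  const min = Math.min(...histos.map((h) => h.min));
  const max = Math.max(...histos.map((h) => h.max));
  return [min, max];
}

console.log(globalRange([{min: -1, max: 2}, {min: 0, max: 5}]));  // [-1, 5]
```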
- */ - public graph( - tag: string, limit_attr_size?: number, - large_attrs_key?: string): Promise { - let url = this.router.graph(tag, limit_attr_size, large_attrs_key); - return this.requestManager.request(url); - } - - /** - * Return a promise containing ScalarDatums for given run and tag. - */ - public scalar(tag: string, run: string): Promise> { - let p: Promise[]>; - let url = this.router.scalars(tag, run); - p = this.requestManager.request(url); - return p.then(map(detupler(createScalar))); - } - - /** - * Returns a promise for requesting the health pills for a list of nodes. - */ - public healthPills(nodeNames: string[]): Promise { - let postData = {'node_names': JSON.stringify(nodeNames)}; - return this.requestManager.request(this.router.healthPills(), postData); - } - - /** - * Return a promise containing HistogramDatums for given run and tag. - */ - public histogram(tag: string, run: string): - Promise> { - let p: Promise[]>; - let url = this.router.histograms(tag, run); - p = this.requestManager.request(url); - return p.then(map(detupler(createHistogram))).then(function(histos) { - // Get the minimum and maximum values across all histograms so that the - // visualization is aligned for all timesteps. - let min = d3.min(histos, d => d.min); - let max = d3.max(histos, d => d.max); - - return histos.map(function(histo, i) { - return { - wall_time: histo.wall_time, - step: histo.step, - bins: convertBins(histo, min, max) - }; - }); + return histos.map(function(histo, i) { + return { + wall_time: histo.wall_time, + step: histo.step, + bins: convertBins(histo, min, max) + }; }); - } - - /** - * Return a promise containing ImageDatums for given run and tag. - */ - public image(tag: string, run: string): Promise> { - let url = this.router.images(tag, run); - let p: Promise; - p = this.requestManager.request(url); - return p.then(map(this.createImage.bind(this))); - } - - /** - * Return a promise containing AudioDatums for given run and tag. - */ - public audio(tag: string, run: string): Promise> { - let url = this.router.audio(tag, run); - let p: Promise; - p = this.requestManager.request(url); - return p.then(map(this.createAudio.bind(this))); - } - - /** - * Returns a promise to load the string RunMetadata for given run/tag. - */ - public runMetadata(tag: string, run: string): Promise { - let url = this.router.runMetadata(tag, run); - return this.requestManager.request(url); - } - - /** - * Get compressedHistogram data. - * Unlike other methods, don't bother reprocessing this data into a nicer - * format. This is because we will deprecate this route. 
- */ - private compressedHistogram(tag: string, run: string): - Promise> { - let url = this.router.compressedHistograms(tag, run); - let p: Promise[]>; - p = this.requestManager.request(url); - return p.then(map(detupler((x) => x))); - } - - private createImage(x: ImageMetadata): Image&Datum { - return { - width: x.width, - height: x.height, - wall_time: timeToDate(x.wall_time), - step: x.step, - url: this.router.individualImage(x.query, x.wall_time), - }; - } - - private createAudio(x: AudioMetadata): Audio&Datum { - return { - content_type: x.content_type, - wall_time: timeToDate(x.wall_time), - step: x.step, - url: this.router.individualAudio(x.query), - }; - } - } - - /** Given a RunToTag, return sorted array of all runs */ - export function getRuns(r: RunToTag): string[] { - return _.keys(r).sort(VZ.Sorting.compareTagNames); - } - - /** Given a RunToTag, return array of all tags (sorted + dedup'd) */ - export function getTags(r: RunToTag): string[] { - return _.union.apply(null, _.values(r)).sort(VZ.Sorting.compareTagNames); - } - - /** - * Given a RunToTag and an array of runs, return every tag that appears for - * at least one run. - * Sorted, deduplicated. - */ - export function filterTags(r: RunToTag, runs: string[]): string[] { - var result = []; - runs.forEach((x) => result = result.concat(r[x])); - return _.uniq(result).sort(VZ.Sorting.compareTagNames); - } - - function timeToDate(x: number): Date { return new Date(x * 1000); }; - - /** Just a curryable map to make things cute and tidy. */ - function map(f: (x: T) => U): (arr: T[]) => U[] { - return function(arr: T[]): U[] { return arr.map(f); }; - }; - - /** - * This is a higher order function that takes a function that transforms a - * T into a G, and returns a function that takes TupleDatas and converts - * them into the intersection of a G and a Datum. - */ - function detupler(xform: (x: T) => G): (t: TupleData) => Datum & G { - return function(x: TupleData): Datum & G { - // Create a G, assert it has type - let obj = xform(x[2]); - // ... patch in the properties of datum - obj.wall_time = timeToDate(x[0]); - obj.step = x[1]; - return obj; - }; - }; - - function createScalar(x: number): Scalar { return {scalar: x}; }; - - function createHistogram(x: HistogramTuple): Histogram { - return { - min: x[0], - max: x[1], - nItems: x[2], - sum: x[3], - sumSquares: x[4], - bucketRightEdges: x[5], - bucketCounts: x[6], - }; - }; - - /** - * Takes histogram data as stored by tensorboard backend and converts it to - * the standard d3 histogram data format to make it more compatible and easier - * to visualize. When visualizing histograms, having the left edge and width - * makes things quite a bit easier. The bins are also converted to have an - * uniform width, what makes the visualization easier to understand. - * - * @param histogram A histogram from tensorboard backend. - * @param min The leftmost edge. The binning will start on it. - * @param max The rightmost edge. The binning will end on it. - * @param numBins The number of bins of the converted data. The default of 30 - * is a sensible default, using more starts to get artifacts because the event - * data is stored in buckets, and you start being able to see the aliased - * borders between each bucket. - * @return A histogram bin. Each bin has an x (left edge), a dx (width), - * and a y (count). - * - * If given rightedges are inclusive, then these left edges (x) are exclusive. 
- */ - export function convertBins( - histogram: Histogram, min: number, max: number, numBins = 30) { - if (histogram.bucketRightEdges.length !== histogram.bucketCounts.length) { - throw(new Error('Edges and counts are of different lengths.')); - } - - if (max === min) { - // Create bins even if all the data has a single value. - max = min * 1.1 + 1; - min = min / 1.1 - 1; - } - let binWidth = (max - min) / numBins; - let bucketLeft = min; // Use the min as the starting point for the bins. - let bucketPos = 0; - return d3.range(min, max, binWidth).map(function(binLeft) { - let binRight = binLeft + binWidth; - - // Take the count of each existing bucket, multiply it by the proportion - // of overlap with the new bin, then sum and store as the count for the - // new bin. If no overlap, will add to zero, if 100% overlap, will include - // the full count into new bin. - let binY = 0; - while (bucketPos < histogram.bucketRightEdges.length) { - // Clip the right edge because right-most edge can be infinite-sized. - let bucketRight = Math.min(max, histogram.bucketRightEdges[bucketPos]); - - let intersect = - Math.min(bucketRight, binRight) - Math.max(bucketLeft, binLeft); - let count = (intersect / (bucketRight - bucketLeft)) * - histogram.bucketCounts[bucketPos]; - - binY += intersect > 0 ? count : 0; - - // If bucketRight is bigger than binRight, than this bin is finished and - // there is data for the next bin, so don't increment bucketPos. - if (bucketRight > binRight) { - break; - } - bucketLeft = Math.max(min, bucketRight); - bucketPos++; - }; - - return {x: binLeft, dx: binWidth, y: binY}; }); } /** - * The following interfaces (TupleData, HistogramTuple, - * CompressedHistogramTuple, ImageMetadata, and AudioMetadata) describe how - * the data is sent over from the backend. + * Return a promise containing ImageDatums for given run and tag. */ - type TupleData = [number, number, T]; // wall_time, step - - // Min, Max, nItems, Sum, Sum_Squares, right edges of buckets, nItems in - // buckets - type HistogramTuple = - [number, number, number, number, number, number[], number[]]; - type CompressedHistogramTuple = [number, number][]; // percentile, value - interface ImageMetadata { - width: number; - height: number; - wall_time: number; - step: number; - query: string; + public image(tag: string, run: string): Promise> { + const url = (getRouter().pluginRunTagRoute('images', '/images')(tag, run)); + let p: Promise; + p = this.requestManager.request(url); + return p.then(map(this.createImage.bind(this))); } - interface AudioMetadata { - content_type: string; - wall_time: number; - step: number; - query: string; + + /** + * Return a promise containing AudioDatums for given run and tag. + */ + public audio(tag: string, run: string): Promise> { + const url = (getRouter().pluginRunTagRoute('audio', '/audio')(tag, run)); + let p: Promise; + p = this.requestManager.request(url); + return p.then(map(this.createAudio.bind(this))); + } + + /** + * Returns a promise containing profile data for given run and tag. + */ + public profile(tag: string, run: string): Promise { + let url = (getRouter().pluginRunTagRoute('profile', '/data')(tag, run)); + if (getRouter().isDemoMode()) { + url += '.json'; + } + return this.requestManager.request(url); + } + + /** + * Returns the url for the RunMetadata for the given run/tag. 
+ */ + public runMetadataUrl(tag: string, run: string): string { + return getRouter().pluginRunTagRoute('graphs', '/run_metadata')(tag, run); + } + + /** + * Returns a promise to load the string RunMetadata for given run/tag. + */ + public runMetadata(tag: string, run: string): Promise { + const url = this.runMetadataUrl(tag, run); + return this.requestManager.request(url); + } + + /** + * Get compressedHistogram data. + * Unlike other methods, don't bother reprocessing this data into a nicer + * format. This is because we will deprecate this route. + */ + private compressedHistogram(tag: string, run: string): + Promise> { + const url = (getRouter().pluginRunTagRoute( + 'distributions', '/distributions')(tag, run)); + let p: Promise[]>; + p = this.requestManager.request(url); + return p.then(map(detupler((x) => x))); + } + + private createImage(x: ImageMetadata): Image&Datum { + const pluginRoute = getRouter().pluginRoute('images', '/individualImage'); + + let query = x.query; + if (pluginRoute.indexOf('?') > -1) { + // The route already has GET parameters. Append our parameters to them. + query = '&' + query; + } else { + // The route lacks GET parameters. We append them. + query = '?' + query; + } + + if (getRouter().isDemoMode()) { + query = demoify(query); + } + + let individualImageUrl = pluginRoute + query; + // Include wall_time just to disambiguate the URL and force the browser + // to reload the image when the URL changes. The backend doesn't care + // about the value. + individualImageUrl += + getRouter().isDemoMode() ? '.png' : '&ts=' + x.wall_time; + + return { + width: x.width, + height: x.height, + wall_time: timeToDate(x.wall_time), + step: x.step, + url: individualImageUrl, + }; + } + + private createAudio(x: AudioMetadata): Audio&Datum { + const pluginRoute = getRouter().pluginRoute('audio', '/individualAudio'); + + let query = x.query; + if (pluginRoute.indexOf('?') > -1) { + // The route already has GET parameters. Append our parameters to them. + query = '&' + query; + } else { + // The route lacks GET parameters. We append them. + query = '?' + query; + } + + if (getRouter().isDemoMode()) { + query = demoify(query); + } + + let individualAudioUrl = pluginRoute + query; + // Include wall_time just to disambiguate the URL and force the browser + // to reload the audio when the URL changes. The backend doesn't care + // about the value. + individualAudioUrl += + getRouter().isDemoMode() ? '.wav' : '&ts=' + x.wall_time; + + return { + content_type: x.content_type, + wall_time: timeToDate(x.wall_time), + step: x.step, + url: individualAudioUrl, + }; } } + +/** Given a RunToTag, return sorted array of all runs */ +export function getRuns(r: RunToTag): string[] { + return _.keys(r).sort(compareTagNames); +} + +/** Given a RunToTag, return array of all tags (sorted + dedup'd) */ +export function getTags(r: RunToTag): string[] { + return _.union.apply(null, _.values(r)).sort(compareTagNames); +} + +/** + * Given a RunToTag and an array of runs, return every tag that appears for + * at least one run. + * Sorted, deduplicated. + */ +export function filterTags(r: RunToTag, runs: string[]): string[] { + let result = []; + runs.forEach((x) => result = result.concat(r[x])); + return _.uniq(result).sort(compareTagNames); +} + +function timeToDate(x: number): Date { + return new Date(x * 1000); +}; + +/** Just a curryable map to make things cute and tidy. 
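createImage and createAudio share one URL-assembly rule: append the metadata query with '?' or '&' depending on whether the plugin route already carries GET parameters, then add a cache-busting timestamp. A condensed sketch of that rule (paths invented):

```typescript
// Append `query` to `route`, picking the separator by whether the route
// already has GET parameters; `ts` disambiguates the URL so the browser
// re-fetches when the data changes (the backend ignores its value).
function appendQuery(route: string, query: string, ts: number): string {
  const separator = route.indexOf('?') > -1 ? '&' : '?';
  return route + separator + query + '&ts=' + ts;
}

console.log(appendQuery(
    '/data/plugin/images/individualImage', 'tag=input&run=train', 123));
// /data/plugin/images/individualImage?tag=input&run=train&ts=123
```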
*/ +function map<T, U>(f: (x: T) => U): (arr: T[]) => U[] { + return function(arr: T[]): U[] { + return arr.map(f); + }; +}; + +/** + * This is a higher order function that takes a function that transforms a + * T into a G, and returns a function that takes TupleDatas and converts + * them into the intersection of a G and a Datum. + */ +function detupler<T, G>(xform: (x: T) => G): (t: TupleData<T>) => Datum & G { + return function(x: TupleData<T>): Datum & G { + // Create a G, assert it has type <G> + let obj = xform(x[2]); + // ... patch in the properties of datum + obj.wall_time = timeToDate(x[0]); + obj.step = x[1]; + return obj; + }; +}; + +function createScalar(x: number): Scalar { + return {scalar: x}; +} + +function createHistogram(x: HistogramTuple): Histogram { + return { + min: x[0], + max: x[1], + nItems: x[2], + sum: x[3], + sumSquares: x[4], + bucketRightEdges: x[5], + bucketCounts: x[6], + }; +} + +/** + * Takes histogram data as stored by tensorboard backend and converts it to + * the standard d3 histogram data format to make it more compatible and easier + * to visualize. When visualizing histograms, having the left edge and width + * makes things quite a bit easier. The bins are also converted to have a + * uniform width, which makes the visualization easier to understand. + * + * @param histogram A histogram from tensorboard backend. + * @param min The leftmost edge. The binning will start on it. + * @param max The rightmost edge. The binning will end on it. + * @param numBins The number of bins of the converted data. The default of 30 + * is a sensible default; using more starts to produce artifacts because the event + * data is stored in buckets, and you start being able to see the aliased + * borders between each bucket. + * @return An array of histogram bins. Each bin has an x (left edge), a dx (width), + * and a y (count). + * + * If the given right edges are inclusive, then these left edges (x) are exclusive. + */ +export function convertBins( + histogram: Histogram, min: number, max: number, numBins = 30) { + if (histogram.bucketRightEdges.length !== histogram.bucketCounts.length) { + throw(new Error('Edges and counts are of different lengths.')); + } + + if (max === min) { + // Create bins even if all the data has a single value. + max = min * 1.1 + 1; + min = min / 1.1 - 1; + } + const binWidth = (max - min) / numBins; + let bucketLeft = min; // Use the min as the starting point for the bins. + let bucketPos = 0; + return d3.range(min, max, binWidth).map((binLeft) => { + const binRight = binLeft + binWidth; + + // Take the count of each existing bucket, multiply it by the proportion + // of overlap with the new bin, then sum and store as the count for the + // new bin. If no overlap, will add to zero, if 100% overlap, will include + // the full count into new bin. + let binY = 0; + while (bucketPos < histogram.bucketRightEdges.length) { + // Clip the right edge because right-most edge can be infinite-sized. + const bucketRight = Math.min(max, histogram.bucketRightEdges[bucketPos]); + + const intersect = + Math.min(bucketRight, binRight) - Math.max(bucketLeft, binLeft); + const count = (intersect / (bucketRight - bucketLeft)) * + histogram.bucketCounts[bucketPos]; + + binY += intersect > 0 ? count : 0; + + // If bucketRight is bigger than binRight, then this bin is finished and + // there is data for the next bin, so don't increment bucketPos.
+ if (bucketRight > binRight) { + break; + } + bucketLeft = Math.max(min, bucketRight); + bucketPos++; + } + + return {x: binLeft, dx: binWidth, y: binY}; + }); +} + +/** + * The following interfaces (TupleData, HistogramTuple, + * CompressedHistogramTuple, ImageMetadata, and AudioMetadata) describe how + * the data is sent over from the backend. + */ +type TupleData = [number, number, T]; // wall_time, step + +// Min, Max, nItems, Sum, Sum_Squares, right edges of buckets, nItems in +// buckets +type HistogramTuple = + [number, number, number, number, number, number[], number[]]; +type CompressedHistogramTuple = [number, number][]; // percentile, value +interface ImageMetadata { + width: number; + height: number; + wall_time: number; + step: number; + query: string; +} +interface AudioMetadata { + content_type: string; + wall_time: number; + step: number; + query: string; +} diff --git a/tensorflow/tensorboard/components/tf_backend/behavior.ts b/tensorflow/tensorboard/components/tf_backend/behavior.ts index de6590456f7..8df791eface 100644 --- a/tensorflow/tensorboard/components/tf_backend/behavior.ts +++ b/tensorflow/tensorboard/components/tf_backend/behavior.ts @@ -12,134 +12,137 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +import {getRuns, getTags, TYPES} from './backend'; -module TF.Backend { - export var Behavior = { - properties: { - /** *** Required properties *** */ - /** Data type. One of TF.Backend.TYPES */ - dataType: { - type: String, - observer: '_throwErrorOnUnrecognizedType', - }, - - /** TF.Backend.Backend for data loading. */ - backend: { - type: Object, - }, - - /** Should it automatically load when configured ready? Default true. */ - autoLoad: { - type: Boolean, - value: true, - }, - - /** *** Component-provided properties *** */ - /** Every tag available for data type (sorted, dedpulicated) */ - tags: { - type: Array, - readOnly: true, - notify: true, - }, - - /** Every run available for data type (sorted) */ - runs: { - type: Array, - readOnly: true, - notify: true, - }, - - /** Mapping from runs to tags for the data type */ - run2tag: { - type: Object, - readOnly: true, - notify: true, - }, - - /** Promise provider for the data. Useful for passing to subcomponents */ - dataProvider: - {type: Function, computed: '_getDataProvider(dataType, backend)'}, - - /** Has the dashboard loaded yet? */ - loadState: { - type: String, - value: 'noload', // [noload, pending, loaded, failure] - readOnly: true, - }, - - /** - * True if dashboard has loaded, and no tags were found. - * Persists through subsequent reloads (ie. still true while - * next load is pending) so warning won't flash away every reload - * when there is no data. - */ - dataNotFound: { - type: Boolean, - value: false, - readOnly: true, - } - +/** @polymerBehavior */ +export const BackendBehavior = { + properties: { + /** *** Required properties *** */ + /** Data type. One of Backend.TYPES */ + dataType: { + type: String, + observer: '_throwErrorOnUnrecognizedType', }, - observers: ['_do_autoLoad(dataType, backend, autoLoad)'], - /** - * Reloading works in two steps: - * Backend reload, which gets metadata on available runs, tags, etc from - * the backend. - * Frontend reload, which loads new data for each chart or visual display. - * Backend reload logic is provided by this behaivor. 
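convertBins distributes each backend bucket's count across the new uniform bins in proportion to overlap. A toy invocation, assuming the exported convertBins above:

```typescript
import {convertBins} from './backend';

// One backend bucket covering [0, 10) with count 10, re-binned into two
// uniform bins of width 5: each bin overlaps the bucket by 50%, so each
// receives half of the count.
const histogram = {
  min: 0,
  max: 10,
  bucketRightEdges: [10],
  bucketCounts: [10],
};
console.log(convertBins(histogram, 0, 10, 2));
// [{x: 0, dx: 5, y: 5}, {x: 5, dx: 5, y: 5}]
```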
The frontend reload - logic should be provided elsewhere, since it is component-specific. - * To keep things simple and consistent, we do the backend reload first, - * and the frontend reload afterwards. - */ - reload: function() { - return this.backendReload().then( - (x) => { return this.frontendReload(); }); + + /** Backend for data loading. */ + backend: { + type: Object, }, + + /** Should it automatically load when configured ready? Default true. */ + autoLoad: { + type: Boolean, + value: true, + }, + + /** *** Component-provided properties *** */ + /** Every tag available for data type (sorted, deduplicated) */ + tags: { + type: Array, + readOnly: true, + notify: true, + }, + + /** Every run available for data type (sorted) */ + runs: { + type: Array, + readOnly: true, + notify: true, + }, + + /** Mapping from runs to tags for the data type */ + run2tag: { + type: Object, + readOnly: true, + notify: true, + }, + + /** Promise provider for the data. Useful for passing to subcomponents */ + dataProvider: + {type: Function, computed: '_getDataProvider(dataType, backend)'}, + + /** Has the dashboard loaded yet? */ + loadState: { + type: String, + value: 'noload', // [noload, pending, loaded, failure] + readOnly: true, + }, + /** - * Load data from backend and then set run2tag, tags, runs, and loadState. - * Returns a promise that resolves/rejects when data is loaded. + * True if dashboard has loaded, and no tags were found. + * Persists through subsequent reloads (i.e., still true while + * next load is pending) so warning won't flash away every reload + * when there is no data. */ - backendReload: function() { - if (this.dataType == null) { - throw new Error('TF.Backend.Behavior: Need a dataType to reload.'); - } - if (this.backend == null) { - throw new Error('TF.Backend.Behavior: Need a backend to reload.'); - } - var runsRoute = this.backend[this.dataType + 'Runs'].bind(this.backend); - this._setLoadState('pending'); - return runsRoute().then( - (x) => { - this._setLoadState('loaded'); - if (_.isEqual(x, this.run2tag)) { - // If x and run2tag are equal, let's avoid updating everything - // since that can needlessly trigger run changes, reloads, etc - return x; - } - this._setRun2tag(x); - var tags = TF.Backend.getTags(x); - this._setDataNotFound(tags.length === 0); - this._setTags(tags); - this._setRuns(TF.Backend.getRuns(x)); + dataNotFound: { + type: Boolean, + value: false, + readOnly: true, + } + + }, + observers: ['_do_autoLoad(dataType, backend, autoLoad)'], + /** + * Reloading works in two steps: + * Backend reload, which gets metadata on available runs, tags, etc. from + * the backend. + * Frontend reload, which loads new data for each chart or visual display. + * Backend reload logic is provided by this behavior. The frontend reload + * logic should be provided elsewhere, since it is component-specific. + * To keep things simple and consistent, we do the backend reload first, + * and the frontend reload afterwards. + */ + reload() { + return this.backendReload().then((x) => { + return this.frontendReload(); + }); + }, + /** + * Load data from backend and then set run2tag, tags, runs, and loadState. + * Returns a promise that resolves/rejects when data is loaded.
+ */ + backendReload() { + if (this.dataType == null) { + throw new Error('BackendBehavior: Need a dataType to reload.'); + } + if (this.backend == null) { + throw new Error('BackendBehavior: Need a backend to reload.'); + } + const runsRoute = (this.backend[this.dataType + 'Runs'] || + this.backend[this.dataType + 'Tags']) + .bind(this.backend); + this._setLoadState('pending'); + return runsRoute().then( + (x) => { + this._setLoadState('loaded'); + if (_.isEqual(x, this.run2tag)) { + // If x and run2tag are equal, let's avoid updating everything + // since that can needlessly trigger run changes, reloads, etc return x; - }, - (fail) => { - this._setLoadState('failure'); - return fail; - }); - }, - _do_autoLoad: function(type, backend, autoLoad) { - if (autoLoad) { - this.reload(); - }; - }, - _getDataProvider: function(dataType, backend) { - return this.backend[this.dataType].bind(this.backend); - }, - _throwErrorOnUnrecognizedType: function(dataType) { - if (TF.Backend.TYPES.indexOf(dataType) === -1) { - throw new Error('TF.Backend.Behavior: Unknown dataType ' + dataType); - } - }, - }; -} + } + this._setRun2tag(x); + const tags = getTags(x); + this._setDataNotFound(tags.length === 0); + this._setTags(tags); + this._setRuns(getRuns(x)); + return x; + }, + (fail) => { + this._setLoadState('failure'); + return fail; + }); + }, + _do_autoLoad(type, backend, autoLoad) { + if (autoLoad) { + this.reload(); + } + }, + _getDataProvider(dataType, backend) { + return this.backend[this.dataType].bind(this.backend); + }, + _throwErrorOnUnrecognizedType(dataType) { + if (TYPES.indexOf(dataType) === -1) { + throw new Error('BackendBehavior: Unknown dataType ' + dataType); + } + }, +}; diff --git a/tensorflow/tensorboard/components/tf_backend/requestManager.ts b/tensorflow/tensorboard/components/tf_backend/requestManager.ts index 1dfc3348b59..0fa198416e8 100644 --- a/tensorflow/tensorboard/components/tf_backend/requestManager.ts +++ b/tensorflow/tensorboard/components/tf_backend/requestManager.ts @@ -13,166 +13,165 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -module TF.Backend { - interface ResolveReject { resolve: Function; reject: Function; } - /** - * Manages many fetch requests. Launches up to nSimultaneousRequests - * simultaneously, and maintains a LIFO queue of requests to process when - * more urls are requested than can be handled at once. The queue can be - * cleared. - * - * When a request is made, a Promise is returned which resolves with the - * parsed JSON result from the request. - */ - export class RequestCancellationError extends Error { - public name = 'RequestCancellationError'; - } +interface ResolveReject { + resolve: Function; + reject: Function; +} +/** + * Manages many fetch requests. Launches up to nSimultaneousRequests + * simultaneously, and maintains a LIFO queue of requests to process when + * more urls are requested than can be handled at once. The queue can be + * cleared. + * + * When a request is made, a Promise is returned which resolves with the + * parsed JSON result from the request. 
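backendReload now resolves its data route dynamically from the dataType string, preferring the legacy '<type>Runs' method and falling back to the newer '<type>Tags' one. A sketch of that lookup (the explicit error is added here for clarity; the original simply fails on binding undefined):

```typescript
// Resolve 'scalar' -> backend.scalarRuns or backend.scalarTags, mirroring
// the dynamic dispatch in backendReload above.
function resolveRunsRoute(
    backend: {[name: string]: Function}, dataType: string): Function {
  const route = backend[dataType + 'Runs'] || backend[dataType + 'Tags'];
  if (!route) {
    throw new Error('No Runs/Tags route for dataType ' + dataType);
  }
  return route.bind(backend);
}
```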
+ */ +export class RequestCancellationError extends Error { + public name = 'RequestCancellationError'; +} - export class RequestNetworkError extends Error { - public name: string; - public req: XMLHttpRequest; - public url: string; +export class RequestNetworkError extends Error { + public name: string; + public req: XMLHttpRequest; + public url: string; - constructor(req: XMLHttpRequest, url) { - super(); - this.message = `RequestNetworkError: ${req.status} at ${url}`; - this.name = 'RequestNetworkError'; - this.req = req; - this.url = url; - } - } - - export class RequestManager { - private _queue: ResolveReject[]; - private _maxRetries: number; - private _nActiveRequests: number; - private _nSimultaneousRequests: number; - - constructor(nSimultaneousRequests = 10, maxRetries = 3) { - this._queue = []; - this._nActiveRequests = 0; - this._nSimultaneousRequests = nSimultaneousRequests; - this._maxRetries = maxRetries; - } - - /** - * Gives a promise that loads assets from given url (respects queuing). If - * postData is provided, this request will use POST, not GET. This is an - * object mapping POST keys to string values. - */ - public request( - url: string, postData?: {[key: string]: string}): Promise { - var promise = new Promise((resolve, reject) => { - var resolver = {resolve: resolve, reject: reject}; - this._queue.push(resolver); - this.launchRequests(); - }) - .then(() => { - return this.promiseWithRetries( - url, this._maxRetries, postData); - }) - .then( - (response) => { - // Success - Let's free space for another active - // reqest, and launch it - this._nActiveRequests--; - this.launchRequests(); - return response; - }, - (rejection) => { - if (rejection.name === 'RequestNetworkError') { - // If we failed due to network error, we should - // decrement - // _nActiveRequests because this request was - // active - this._nActiveRequests--; - this.launchRequests(); - } - return Promise.reject(rejection); - }); - return promise; - } - - public clearQueue() { - while (this._queue.length > 0) { - this._queue.pop().reject( - new RequestCancellationError('Request cancelled by clearQueue')); - } - } - - /* Return number of currently pending requests */ - public activeRequests(): number { - return this._nActiveRequests; - } - - /* Return total number of outstanding requests (includes queue) */ - public outstandingRequests(): number { - return this._nActiveRequests + this._queue.length; - } - - private launchRequests() { - while (this._nActiveRequests < this._nSimultaneousRequests && - this._queue.length > 0) { - this._nActiveRequests++; - this._queue.pop().resolve(); - } - } - - /** - * Try to request a given URL using overwritable _promiseFromUrl method. - * If the request fails for any reason, we will retry up to maxRetries - * times. In practice, this will help us paper over transient network issues - * like '502 Bad Gateway'. - * By default, Chrome displays network errors in console, so - * the user will be able to tell when the requests are failing. I think this - * is a feature, if the request failures and retries are causing any - * pain to users, they can see it and file issues. 
- */ - private promiseWithRetries( - url: string, - maxRetries: number, - postData?: {[key: string]: string}) { - var success = (x) => x; - var failure = (x) => { - if (maxRetries > 0) { - return this.promiseWithRetries(url, maxRetries - 1, postData); - } else { - return Promise.reject(x); - } - }; - return this._promiseFromUrl(url, postData).then(success, failure); - } - - /* Actually get promise from url using XMLHttpRequest */ - protected _promiseFromUrl(url:string, postData?: {[key: string]: string}) { - return new Promise((resolve, reject) => { - let req = new XMLHttpRequest(); - req.open(postData ? 'POST' : 'GET', url); - - let formData; - if (postData) { - // We are to make a POST request. - formData = new FormData(); - for (let postKey in postData) { - if (postKey) { - // The linter requires 'for in' loops to be filtered by an if - // condition. - formData.append(postKey, postData[postKey]); - } - } - } - req.onload = function() { - if (req.status === 200) { - resolve(JSON.parse(req.responseText)); - } else { - reject(new RequestNetworkError(req, url)); - } - }; - req.onerror = function() { - reject(new RequestNetworkError(req, url)); - }; - req.send(formData); - }); - } + constructor(req: XMLHttpRequest, url) { + super(); + this.message = `RequestNetworkError: ${req.status} at ${url}`; + this.name = 'RequestNetworkError'; + this.req = req; + this.url = url; + } +} + +export class RequestManager { + private _queue: ResolveReject[]; + private _maxRetries: number; + private _nActiveRequests: number; + private _nSimultaneousRequests: number; + + constructor(nSimultaneousRequests = 10, maxRetries = 3) { + this._queue = []; + this._nActiveRequests = 0; + this._nSimultaneousRequests = nSimultaneousRequests; + this._maxRetries = maxRetries; + } + + /** + * Gives a promise that loads assets from given url (respects queuing). If + * postData is provided, this request will use POST, not GET. This is an + * object mapping POST keys to string values. + */ + public request(url: string, postData?: {[key: string]: string}): + Promise { + const promise = + new Promise((resolve, reject) => { + const resolver = {resolve: resolve, reject: reject}; + this._queue.push(resolver); + this.launchRequests(); + }) + .then(() => { + return this.promiseWithRetries(url, this._maxRetries, postData); + }) + .then( + (response) => { + // Success - Let's free space for another active + // request, and launch it + this._nActiveRequests--; + this.launchRequests(); + return response; + }, + (rejection) => { + if (rejection.name === 'RequestNetworkError') { + // If we failed due to network error, we should + // decrement + // _nActiveRequests because this request was + // active + this._nActiveRequests--; + this.launchRequests(); + } + return Promise.reject(rejection); + }); + return promise; + } + + public clearQueue() { + while (this._queue.length > 0) { + this._queue.pop().reject( + new RequestCancellationError('Request cancelled by clearQueue')); + } + } + + /* Return number of currently pending requests */ + public activeRequests(): number { + return this._nActiveRequests; + } + + /* Return total number of outstanding requests (includes queue) */ + public outstandingRequests(): number { + return this._nActiveRequests + this._queue.length; + } + + private launchRequests() { + while (this._nActiveRequests < this._nSimultaneousRequests && + this._queue.length > 0) { + this._nActiveRequests++; + this._queue.pop().resolve(); + } + } + + /** + * Try to request a given URL using overwritable _promiseFromUrl method. 
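The rewritten RequestManager gates concurrency with a counter plus a LIFO queue: each request first waits for a slot, then fetches with retries. Hypothetical usage (URLs invented), assuming the class as defined above:

```typescript
import {RequestManager} from './requestManager';

// At most two in-flight requests; later calls queue up and are launched
// LIFO as slots free up.
const rm = new RequestManager(2 /* nSimultaneousRequests */);
const urls = ['data/runs', 'data/logdir', 'data/plugin/scalars/tags'];
Promise.all(urls.map((url) => rm.request(url))).then((responses) => {
  console.log(rm.outstandingRequests());  // 0: queue drained
  console.log(responses.length);          // 3
});
```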
+ * If the request fails for any reason, we will retry up to maxRetries + * times. In practice, this will help us paper over transient network issues + * like '502 Bad Gateway'. + * By default, Chrome displays network errors in console, so + * the user will be able to tell when the requests are failing. I think this + * is a feature, if the request failures and retries are causing any + * pain to users, they can see it and file issues. + */ + private promiseWithRetries( + url: string, maxRetries: number, postData?: {[key: string]: string}) { + var success = (x) => x; + var failure = (x) => { + if (maxRetries > 0) { + return this.promiseWithRetries(url, maxRetries - 1, postData); + } else { + return Promise.reject(x); + } + }; + return this._promiseFromUrl(url, postData).then(success, failure); + } + + /* Actually get promise from url using XMLHttpRequest */ + protected _promiseFromUrl(url: string, postData?: {[key: string]: string}) { + return new Promise((resolve, reject) => { + let req = new XMLHttpRequest(); + req.open(postData ? 'POST' : 'GET', url); + + let formData; + if (postData) { + // We are to make a POST request. + formData = new FormData(); + for (let postKey in postData) { + if (postKey) { + // The linter requires 'for in' loops to be filtered by an if + // condition. + formData.append(postKey, postData[postKey]); + } + } + } + req.onload = function() { + if (req.status === 200) { + resolve(JSON.parse(req.responseText)); + } else { + reject(new RequestNetworkError(req, url)); + } + }; + req.onerror = function() { + reject(new RequestNetworkError(req, url)); + }; + req.send(formData); + }); } } diff --git a/tensorflow/tensorboard/components/tf_backend/router.ts b/tensorflow/tensorboard/components/tf_backend/router.ts index d2c8191cc86..598546004e1 100644 --- a/tensorflow/tensorboard/components/tf_backend/router.ts +++ b/tensorflow/tensorboard/components/tf_backend/router.ts @@ -12,94 +12,86 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -module TF.Backend { - export type RunTagUrlFn = (tag: string, run: string) => string; - export interface Router { - logdir: () => string; - runs: () => string; - scalars: RunTagUrlFn; - histograms: RunTagUrlFn; - compressedHistograms: RunTagUrlFn; - images: RunTagUrlFn; - individualImage: (query: string, wallTime: number) => string; - audio: RunTagUrlFn; - individualAudio: (query: string) => string; - graph: (run: string, limit_attr_size?: number, large_attrs_key?: string) - => string; - runMetadata: RunTagUrlFn; - healthPills: () => string; - }; +import {demoify, queryEncoder} from './urlPathHelpers' - /** - * The standard router for communicating with the TensorBoard backend - * @param dataDir {string} The base prefix for finding data on server. - * @param demoMode {boolean} Whether to modify urls for filesystem demo usage. - */ - export function router(dataDir = '/data', demoMode = false): Router { - var clean = demoMode ? 
demoify : (x) => x; - if (dataDir[dataDir.length - 1] === '/') { - dataDir = dataDir.slice(0, dataDir.length - 1); - } - function standardRoute(route: string, demoExtension = '.json'): - ((tag: string, run: string) => string) { - return function(tag: string, run: string): string { - var url = - dataDir + '/' + route + clean(queryEncoder({tag: tag, run: run})); - if (demoMode) { - url += demoExtension; - } - return url; - }; - } - function individualImageUrl(query: string, wallTime: number) { - var url = dataDir + '/' + clean('individualImage?' + query); - // Include wall_time just to disambiguate the URL and force the browser - // to reload the image when the URL changes. The backend doesn't care - // about the value. - url += demoMode ? '.png' : '&ts=' + wallTime; - return url; - } - function individualAudioUrl(query: string) { - var url = dataDir + '/' + clean('individualAudio?' + query); - if (demoMode) { - url += '.wav'; - } - return url; - } - function graphUrl(run: string, limit_attr_size?: number, - large_attrs_key?: string) { - let query_params = [['run', clean(run)]]; - if (limit_attr_size != null && !demoMode) { - query_params.push(['limit_attr_size', String(limit_attr_size)]); - } - if (large_attrs_key != null && !demoMode) { - query_params.push(['large_attrs_key', large_attrs_key]); - } - let query = query_params - .map(param => { - return param[0] + '=' + encodeURIComponent(param[1]); - }) - .join('&'); - var url = dataDir + '/graph' + clean('?' + query); - if (demoMode) { - url += '.pbtxt'; - } - return url; - } - return { - logdir: () => dataDir + '/logdir', - runs: () => dataDir + '/runs' + (demoMode ? '.json' : ''), - individualImage: individualImageUrl, - individualAudio: individualAudioUrl, - graph: graphUrl, - scalars: standardRoute('scalars'), - histograms: standardRoute('histograms'), - compressedHistograms: standardRoute('compressedHistograms'), - images: standardRoute('images'), - audio: standardRoute('audio'), - runMetadata: standardRoute('run_metadata', '.pbtxt'), - healthPills: () => dataDir + '/plugin/debugger/health_pills', - }; - }; +export type RunTagUrlFn = (tag: string, run: string) => string; + +export interface Router { + logdir: () => string; + runs: () => string; + isDemoMode: () => boolean; + textRuns: () => string; + text: RunTagUrlFn; + healthPills: () => string; + pluginRoute: (pluginName: string, route: string) => string; + pluginRunTagRoute: (pluginName: string, route: string) => RunTagUrlFn; +} +; + +/** + * Create a router for communicating with the TensorBoard backend. You + * can pass this to `setRouter` to make it the global router. + * + * @param dataDir {string} The base prefix for finding data on server. + * @param demoMode {boolean} Whether to modify urls for filesystem demo usage. + */ +export function createRouter(dataDir = 'data', demoMode = false): Router { + var clean = demoMode ? 
demoify : (x) => x; + if (dataDir[dataDir.length - 1] === '/') { + dataDir = dataDir.slice(0, dataDir.length - 1); + } + function standardRoute(route: string, demoExtension = '.json'): + ((tag: string, run: string) => string) { + return function(tag: string, run: string): string { + var url = + dataDir + '/' + route + clean(queryEncoder({tag: tag, run: run})); + if (demoMode) { + url += demoExtension; + } + return url; + }; + } + function pluginRoute(pluginName: string, route: string): string { + return `${dataDir}/plugin/${pluginName}${route}`; + } + function pluginRunTagRoute(pluginName: string, route: string): + ((tag: string, run: string) => string) { + const base = pluginRoute(pluginName, route); + return (tag, run) => base + clean(queryEncoder({tag, run})); + } + return { + logdir: () => dataDir + '/logdir', + runs: () => dataDir + '/runs' + (demoMode ? '.json' : ''), + isDemoMode: () => demoMode, + healthPills: () => dataDir + '/plugin/debugger/health_pills', + textRuns: () => dataDir + '/plugin/text/runs' + (demoMode ? '.json' : ''), + text: standardRoute('plugin/text/text'), + pluginRoute, + pluginRunTagRoute, + }; +}; + +let _router: Router = createRouter(); + +/** + * @return {Router} the global router + */ +export function getRouter(): Router { + return _router; +} + +/** + * Set the global router, to be returned by future calls to `getRouter`. + * You may wish to invoke this if you are running a demo server with a + * custom path prefix, or if you have customized the TensorBoard backend + * to use a different path. + * + * @param {Router} router the new global router + */ +export function setRouter(router: Router): void { + if (router == null) { + throw new Error('Router required, but got: ' + router); + } + _router = router; } diff --git a/tensorflow/tensorboard/components/tf_backend/runsStore.ts b/tensorflow/tensorboard/components/tf_backend/runsStore.ts new file mode 100644 index 00000000000..bcaff994ce8 --- /dev/null +++ b/tensorflow/tensorboard/components/tf_backend/runsStore.ts @@ -0,0 +1,67 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +import {RequestManager} from './requestManager'; +import {getRouter} from './router'; + +let runs: string[] = []; + +export type Listener = () => void; +const listeners = new Set(); + +const requestManager = new RequestManager(1 /* simultaneous request */); + +/** + * Register a listener (nullary function) to be called when new runs are + * available. + */ +export function addListener(listener: Listener): void { + listeners.add(listener); +} + +/** + * Remove a listener registered with `addListener`. + */ +export function removeListener(listener: Listener): void { + listeners.delete(listener); +} + +/** + * Asynchronously load or reload the runs data. Listeners will be + * invoked if this causes the runs data to change. 
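The new router centralizes URL construction: pluginRoute yields a plugin's base path and pluginRunTagRoute appends an encoded tag/run query. Roughly, assuming queryEncoder produces a standard '?key=value' query string:

```typescript
import {createRouter, getRouter, setRouter} from './router';

// Point the global router at a custom data prefix, then build URLs.
setRouter(createRouter('/custom-data'));
getRouter().pluginRoute('scalars', '/tags');
// -> '/custom-data/plugin/scalars/tags'
getRouter().pluginRunTagRoute('scalars', '/scalars')('loss', 'train');
// -> roughly '/custom-data/plugin/scalars/scalars?tag=loss&run=train'
```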
+ * + * @see addListener + * @return {Promise} a promise that resolves when the runs have + * loaded + */ +export function fetchRuns(): Promise { + const url = getRouter().runs(); + return requestManager.request(url).then(newRuns => { + if (!_.isEqual(runs, newRuns)) { + runs = newRuns; + listeners.forEach(listener => { + listener(); + }); + } + }); +} + +/** + * Get the current list of runs. If no data is available, this will be + * an empty array (i.e., there is no distinction between "no runs" and + * "no runs yet"). + */ +export function getRuns(): string[] { + return runs.slice(); +} diff --git a/tensorflow/tensorboard/components/tf_backend/test/BUILD b/tensorflow/tensorboard/components/tf_backend/test/BUILD new file mode 100644 index 00000000000..da70f8a9daa --- /dev/null +++ b/tensorflow/tensorboard/components/tf_backend/test/BUILD @@ -0,0 +1,32 @@ +package( + default_testonly = True, + default_visibility = ["//tensorflow/tensorboard:internal"], +) + +load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library") + +licenses(["notice"]) # Apache 2.0 + +ts_web_library( + name = "test", + srcs = [ + "tests.html", + "backendTests.ts", + "behaviorTests.ts", + "requestManagerTests.ts", + ] + glob(["data/**"]), + path = "/tf-backend/test", + deps = [ + "//tensorflow/tensorboard/components/tf_backend", + "//tensorflow/tensorboard/components/tf_imports:polymer", + "//tensorflow/tensorboard/components/tf_imports:web_component_tester", + "//tensorflow/tensorboard/components/tf_imports:webcomponentsjs", + ], +) + +filegroup( + name = "all_files", + testonly = 0, + srcs = glob(["**"]), + tags = ["notsan"], +) diff --git a/tensorflow/tensorboard/components/tf_backend/test/backendTests.ts b/tensorflow/tensorboard/components/tf_backend/test/backendTests.ts index 4b91e9f62c8..029c8359125 100644 --- a/tensorflow/tensorboard/components/tf_backend/test/backendTests.ts +++ b/tensorflow/tensorboard/components/tf_backend/test/backendTests.ts @@ -12,290 +12,283 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
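The runs store above is a small publish/subscribe cache around the `/runs` route. A hedged sketch of how a consumer might use it (the `onRunsChanged` handler is hypothetical):

```ts
import {addListener, fetchRuns, getRuns, removeListener} from './runsStore';

// Fires only when the fetched list actually differs from the cached one
// (fetchRuns compares old and new with _.isEqual before notifying).
const onRunsChanged = () => {
  console.log('runs are now:', getRuns());  // getRuns() returns a copy
};
addListener(onRunsChanged);

// Kick off (or refresh) the data; concurrent calls are serialized because
// the store's RequestManager allows one simultaneous request.
fetchRuns();

// Later, when the consumer goes away (e.g., a Polymer detach callback):
removeListener(onRunsChanged);
```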
==============================================================================*/ -var assert = chai.assert; +import {Backend, convertBins, filterTags, getRuns, getTags, RunToTag, TYPES} from '../backend'; +import {RequestManager} from '../requestManager'; +import {createRouter, setRouter} from '../router'; +import {BAD_CHARACTERS, demoify, queryEncoder} from '../urlPathHelpers'; -module TF.Backend { - describe('urlPathHelpers', function() { - let demoify = TF.Backend.demoify; - let encode = TF.Backend.queryEncoder; - it('demoify works as expected', function() { - let demoified = demoify(BAD_CHARACTERS); - let all_clean = ''; - for (let i = 0; i < BAD_CHARACTERS.length; i++) { - all_clean += '_'; - } - assert.equal(demoified, all_clean, 'cleaning the BAD_CHARACTERS works'); - assert.equal(demoify('foozod'), 'foozod', 'doesnt change safe string'); - assert.equal(demoify('foo zod (2)'), 'foo_zod__2_', 'simple case'); - }); +describe('urlPathHelpers', () => { + it('demoify works as expected', () => { + const demoified = demoify(BAD_CHARACTERS); + let allClean = ''; + for (let i = 0; i < BAD_CHARACTERS.length; i++) { + allClean += '_'; + } + chai.assert.equal(demoified, allClean, 'cleaning the BAD_CHARACTERS works'); + chai.assert.equal(demoify('foozod'), 'foozod', 'doesnt change safe string'); + chai.assert.equal(demoify('foo zod (2)'), 'foo_zod__2_', 'simple case'); + }); - it('queryEncoder works with demoify on spaces and parens', function() { - let params = {foo: 'something with spaces and (parens)'}; - let actual = demoify(encode(params)); - let expected = '_foo_something_with_spaces_and__28parens_29'; - assert.equal(actual, expected); + it('queryEncoder works with demoify on spaces and parens', () => { + const params = {foo: 'something with spaces and (parens)'}; + const actual = demoify(queryEncoder(params)); + const expected = '_foo_something_with_spaces_and__28parens_29'; + chai.assert.equal(actual, expected); + }); +}); + +function assertIsDatum(x) { + chai.assert.isNumber(x.step); + chai.assert.instanceOf(x.wall_time, Date); +} + +describe('backend tests', () => { + let backend: Backend; + let rm: RequestManager; + const base = 'data'; + const demoRouter = createRouter(base, /*demoMode=*/true); + beforeEach(() => { + // Construct a demo Backend (third param is true) + setRouter(demoRouter); + backend = new Backend(); + rm = new RequestManager(); + }); + + it('runs are loaded properly', (done) => { + const runsResponse = backend.runs(); + const actualRuns = rm.request(demoRouter.runs()); + Promise.all([runsResponse, actualRuns]).then((values) => { + chai.assert.deepEqual(values[0], values[1]); + done(); }); }); - function assertIsDatum(x) { - assert.isNumber(x.step); - assert.instanceOf(x.wall_time, Date); + it('scalars are loaded properly', (done) => { + backend.scalar('cross_entropy (1)', 'run1').then((s) => { + // just check the data got reformatted properly + const aScalar = s[s.length - 1]; + assertIsDatum(aScalar); + chai.assert.isNumber(aScalar.scalar); + // verify date conversion works + chai.assert.equal(aScalar.wall_time.valueOf(), 40000); + done(); + }); + }); + + it('histograms are loaded properly', (done) => { + backend.histogram('histo1', 'run1').then((histos) => { + const histo = histos[0]; + assertIsDatum(histo); + chai.assert.instanceOf(histo.bins, Array); + done(); + }); + }); + + it('all registered types have handlers', () => { + TYPES.forEach((t: string) => { + chai.assert.isDefined(backend[t], t); + chai.assert.isDefined(backend[t + 'Runs'], t + 'Runs'); + }); + 
}); + + it('images are loaded properly', (done) => { + backend.image('im1', 'run1').then((images) => { + const image = images[0]; + assertIsDatum(image); + chai.assert.isNumber(image.width); + chai.assert.isNumber(image.height); + done(); + }); + }); + + it('audio is loaded properly', (done) => { + backend.audio('audio1', 'run1').then((audioClips) => { + const audio = audioClips[0]; + assertIsDatum(audio); + chai.assert.equal(audio.content_type, 'audio/wav'); + done(); + }); + }); + + it('trailing slash removed from base route', () => { + const r = createRouter('foo/'); + chai.assert.equal(r.runs(), 'foo/runs'); + }); + + it('run helper methods work', (done) => { + const scalar = {run1: ['cross_entropy (1)'], fake_run_no_data: ['scalar2']}; + const image = {run1: ['im1'], fake_run_no_data: ['im1', 'im2']}; + const audio = {run1: ['audio1'], fake_run_no_data: ['audio1', 'audio2']}; + const runMetadata = {run1: ['step99'], fake_run_no_data: ['step99']}; + const graph = ['fake_run_no_data']; + let count = 0; + function next() { + count++; + if (count === 4) { + done(); + } + } + backend.scalarTags().then((x) => { + chai.assert.deepEqual(x, scalar); + next(); + }); + backend.imageTags().then((x) => { + chai.assert.deepEqual(x, image); + next(); + }); + backend.audioTags().then((x) => { + chai.assert.deepEqual(x, audio); + next(); + }); + backend.runMetadataTags().then((x) => { + chai.assert.deepEqual(x, runMetadata); + next(); + }); + backend.graphRuns().then((x) => { + chai.assert.deepEqual(x, graph); + next(); + }); + }); + + it('runToTag helpers work', () => { + const r2t: RunToTag = { + run1: ['foo', 'bar', 'zod'], + run2: ['zod', 'zoink'], + a: ['foo', 'zod'] + }; + const empty1: RunToTag = {}; + const empty2: RunToTag = {run1: [], run2: []}; + chai.assert.deepEqual(getRuns(r2t), ['a', 'run1', 'run2']); + chai.assert.deepEqual(getTags(r2t), ['bar', 'foo', 'zod', 'zoink']); + chai.assert.deepEqual(filterTags(r2t, ['run1', 'run2']), getTags(r2t)); + chai.assert.deepEqual(filterTags(r2t, ['run1']), ['bar', 'foo', 'zod']); + chai.assert.deepEqual( + filterTags(r2t, ['run2', 'a']), ['foo', 'zod', 'zoink']); + + chai.assert.deepEqual(getRuns(empty1), []); + chai.assert.deepEqual(getTags(empty1), []); + + chai.assert.deepEqual(getRuns(empty2), ['run1', 'run2']); + chai.assert.deepEqual(getTags(empty2), []); + }); +}); + +describe('Verify that the histogram format conversion works.', () => { + + function assertHistogramEquality(h1, h2) { + h1.forEach((b1, i) => { + const b2 = h2[i]; + chai.assert.closeTo(b1.x, b2.x, 1e-10); + chai.assert.closeTo(b1.dx, b2.dx, 1e-10); + chai.assert.closeTo(b1.y, b2.y, 1e-10); + }); } - describe('backend tests', function() { - let backend: Backend; - let rm: RequestManager; - let base = 'data'; - let demoRouter = TF.Backend.router(base, true); - beforeEach(function() { - // Construct a demo Backend (third param is true) - backend = new Backend(demoRouter); - rm = new RequestManager(); - }); - - it('runs are loaded properly', function(done) { - let runsResponse = backend.runs(); - let actualRuns = rm.request(demoRouter.runs()); - Promise.all([runsResponse, actualRuns]).then((values) => { - assert.deepEqual(values[0], values[1]); - done(); - }); - }); - - it('scalars are loaded properly', function(done) { - backend.scalar('cross_entropy (1)', 'run1').then((s) => { - // just check the data got reformatted properly - let aScalar = s[s.length - 1]; - assertIsDatum(aScalar); - assert.isNumber(aScalar.scalar); - // verify date conversion works - 
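The `runToTag` assertions above pin down the semantics of `getRuns`, `getTags`, and `filterTags`. Restated as a minimal sketch (the `train`/`eval` data is hypothetical; the expected values in the comments are inferred from those assertions):

```ts
import {filterTags, getRuns, getTags, RunToTag} from '../backend';

const r2t: RunToTag = {
  train: ['accuracy', 'loss'],
  eval: ['loss'],
};

getRuns(r2t);               // => ['eval', 'train']     (run names, sorted)
getTags(r2t);               // => ['accuracy', 'loss']  (union of tags, sorted)
filterTags(r2t, ['eval']);  // => ['loss']              (tags in the listed runs only)
```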
assert.equal(aScalar.wall_time.valueOf(), 40000); - done(); - }); - }); - - it('histograms are loaded properly', function(done) { - backend.histogram('histo1', 'run1').then((histos) => { - let histo = histos[0]; - assertIsDatum(histo); - assert.instanceOf(histo.bins, Array); - done(); - }); - }); - - it('all registered types have handlers', function() { - TYPES.forEach((t: string) => { - assert.isDefined(backend[t], t); - assert.isDefined(backend[t + 'Runs'], t + 'Runs'); - }); - }); - - it('images are loaded properly', function(done) { - backend.image('im1', 'run1').then((images) => { - let image = images[0]; - assertIsDatum(image); - assert.isNumber(image.width); - assert.isNumber(image.height); - let nonDemoQuery = 'index=0&tag=im1&run=run1'; - let expectedUrl = demoRouter.individualImage(nonDemoQuery, 10.0); - assert.equal(image.url, expectedUrl); - done(); - }); - }); - - it('audio is loaded properly', function(done) { - backend.audio('audio1', 'run1').then((audio_clips) => { - let audio = audio_clips[0]; - assertIsDatum(audio); - assert.equal(audio.content_type, 'audio/wav'); - let nonDemoQuery = 'index=0&tag=audio1&run=run1'; - let expectedUrl = demoRouter.individualAudio(nonDemoQuery); - assert.equal(audio.url, expectedUrl); - done(); - }); - }); - - it('trailing slash removed from base route', function() { - let r = TF.Backend.router('foo/'); - assert.equal(r.runs(), 'foo/runs'); - }); - - it('run helper methods work', function(done) { - let scalar = {run1: ['cross_entropy (1)'], fake_run_no_data: ['scalar2']}; - let image = {run1: ['im1'], fake_run_no_data: ['im1', 'im2']}; - let audio = {run1: ['audio1'], fake_run_no_data: ['audio1', 'audio2']}; - let runMetadata = {run1: ['step99'], fake_run_no_data: ['step99']}; - let graph = ['fake_run_no_data']; - let count = 0; - function next() { - count++; - if (count === 4) { - done(); - } - } - backend.scalarRuns().then((x) => { - assert.deepEqual(x, scalar); - next(); - }); - backend.imageRuns().then((x) => { - assert.deepEqual(x, image); - next(); - }); - backend.audioRuns().then((x) => { - assert.deepEqual(x, audio); - next(); - }); - backend.runMetadataRuns().then((x) => { - assert.deepEqual(x, runMetadata); - next(); - }); - backend.graphRuns().then((x) => { - assert.deepEqual(x, graph); - next(); - }); - }); - - it('runToTag helpers work', function() { - let r2t: RunToTag = { - run1: ['foo', 'bar', 'zod'], - run2: ['zod', 'zoink'], - a: ['foo', 'zod'] - }; - let empty1: RunToTag = {}; - let empty2: RunToTag = {run1: [], run2: []}; - assert.deepEqual(getRuns(r2t), ['a', 'run1', 'run2']); - assert.deepEqual(getTags(r2t), ['bar', 'foo', 'zod', 'zoink']); - assert.deepEqual(filterTags(r2t, ['run1', 'run2']), getTags(r2t)); - assert.deepEqual(filterTags(r2t, ['run1']), ['bar', 'foo', 'zod']); - assert.deepEqual(filterTags(r2t, ['run2', 'a']), ['foo', 'zod', 'zoink']); - - assert.deepEqual(getRuns(empty1), []); - assert.deepEqual(getTags(empty1), []); - - assert.deepEqual(getRuns(empty2), ['run1', 'run2']); - assert.deepEqual(getTags(empty2), []); - }); + it('Throws and error if the inputs are of different lengths', () => { + chai.assert.throws(() => { + convertBins( + {bucketRightEdges: [0], bucketCounts: [1, 2], min: 1, max: 2}, 1, 2, + 2); + }, 'Edges and counts are of different lengths.'); }); - describe('Verify that the histogram format conversion works.', function() { - - function assertHistogramEquality(h1, h2) { - h1.forEach(function(b1, i) { - let b2 = h2[i]; - assert.closeTo(b1.x, b2.x, 1e-10); - assert.closeTo(b1.dx, b2.dx, 
1e-10); - assert.closeTo(b1.y, b2.y, 1e-10); - }); - } - - it('Throws and error if the inputs are of different lengths', function() { - assert.throws(function() { + it('Handles data with no bins', () => { + chai.assert.deepEqual( convertBins( - {bucketRightEdges: [0], bucketCounts: [1, 2], min: 1, max: 2}, 1, 2, - 2); - }, 'Edges and counts are of different lengths.'); - }); - - it('Handles data with no bins', function() { - assert.deepEqual( - convertBins( - {bucketRightEdges: [], bucketCounts: [], min: 0, max: 0}, 0, 0, - 0), - []); - }); - - it('Handles data with one bin', function() { - let counts = [1]; - let rightEdges = [1.21e-12]; - let histogram = [{x: 1.1e-12, dx: 1.21e-12 - 1.1e-12, y: 1}]; - let newHistogram = convertBins( - { - bucketRightEdges: rightEdges, - bucketCounts: counts, - min: 1.1e-12, - max: 1.21e-12 - }, - 1.1e-12, 1.21e-12, 1); - assertHistogramEquality(newHistogram, histogram); - }); - - it('Handles data with two bins.', function() { - let counts = [1, 2]; - let rightEdges = [1.1e-12, 1.21e-12]; - let histogram = [ - {x: 1.0e-12, dx: 1.05e-13, y: 1.09090909090909}, - {x: 1.105e-12, dx: 1.05e-13, y: 1.9090909090909} - ]; - let newHistogram = convertBins( - { - bucketRightEdges: rightEdges, - bucketCounts: counts, - min: 1.0e-12, - max: 1.21e-12 - }, - 1.0e-12, 1.21e-12, 2); - assertHistogramEquality(newHistogram, histogram); - }); - - it('Handles a domain that crosses zero, but doesn\'t include zero as ' + - 'an edge.', - function() { - let counts = [1, 2]; - let rightEdges = [-1.0e-12, 1.0e-12]; - let histogram = [ - {x: -1.1e-12, dx: 1.05e-12, y: 1.95}, - {x: -0.5e-13, dx: 1.05e-12, y: 1.05} - ]; - let newHistogram = convertBins( - { - bucketRightEdges: rightEdges, - bucketCounts: counts, - min: -1.1e-12, - max: 1.0e-12 - }, - -1.1e-12, 1.0e-12, 2); - assertHistogramEquality(newHistogram, histogram); - }); - - it('Handles a histogram of all zeros', function() { - let h = { - min: 0, - max: 0, - nItems: 51200, - sum: 0, - sumSquares: 0, - bucketRightEdges: [0, 1e-12, 1.7976931348623157e+308], - bucketCounts: [0, 51200, 0], - wall_time: '2017-01-25T02:30:11.257Z', - step: 0 - }; - let newHistogram = convertBins(h, 0, 0, 5); - let expectedHistogram = [ - {x: -1, dx: 0.4, y: 0}, {x: -0.6, dx: 0.4, y: 0}, - {x: -0.2, dx: 0.4, y: 51200}, {x: 0.2, dx: 0.4, y: 0}, - {x: 0.6, dx: 0.4, y: 0} - ]; - assertHistogramEquality(newHistogram, expectedHistogram); - }); - - it('Handles a right-most right edge that extends to very large number.', - function() { - let counts = [1, 2, 3]; - let rightEdges = [0, 1.0e-12, 1.0e14]; - let histogram = [ - {x: -1.0e-12, dx: 0.7e-12, y: 0.7}, - {x: -0.3e-12, dx: 0.7e-12, y: 1.1}, - {x: 0.4e-12, dx: 0.7e-12, y: 4.2} - ]; - let newHistogram = convertBins( - { - bucketRightEdges: rightEdges, - bucketCounts: counts, - min: -1.0e-12, - max: 1.1e-12 - }, - -1.0e-12, 1.1e-12, 3); - assertHistogramEquality(newHistogram, histogram); - }); + {bucketRightEdges: [], bucketCounts: [], min: 0, max: 0}, 0, 0, 0), + []); }); -} + + it('Handles data with one bin', () => { + const counts = [1]; + const rightEdges = [1.21e-12]; + const histogram = [{x: 1.1e-12, dx: 1.21e-12 - 1.1e-12, y: 1}]; + const newHistogram = convertBins( + { + bucketRightEdges: rightEdges, + bucketCounts: counts, + min: 1.1e-12, + max: 1.21e-12 + }, + 1.1e-12, 1.21e-12, 1); + assertHistogramEquality(newHistogram, histogram); + }); + + it('Handles data with two bins.', () => { + const counts = [1, 2]; + const rightEdges = [1.1e-12, 1.21e-12]; + const histogram = [ + {x: 1.0e-12, dx: 
1.05e-13, y: 1.09090909090909}, + {x: 1.105e-12, dx: 1.05e-13, y: 1.9090909090909} + ]; + const newHistogram = convertBins( + { + bucketRightEdges: rightEdges, + bucketCounts: counts, + min: 1.0e-12, + max: 1.21e-12 + }, + 1.0e-12, 1.21e-12, 2); + assertHistogramEquality(newHistogram, histogram); + }); + + it('Handles a domain that crosses zero, but doesn\'t include zero as ' + + 'an edge.', + () => { + const counts = [1, 2]; + const rightEdges = [-1.0e-12, 1.0e-12]; + const histogram = [ + {x: -1.1e-12, dx: 1.05e-12, y: 1.95}, + {x: -0.5e-13, dx: 1.05e-12, y: 1.05} + ]; + const newHistogram = convertBins( + { + bucketRightEdges: rightEdges, + bucketCounts: counts, + min: -1.1e-12, + max: 1.0e-12 + }, + -1.1e-12, 1.0e-12, 2); + assertHistogramEquality(newHistogram, histogram); + }); + + it('Handles a histogram of all zeros', () => { + const h = { + min: 0, + max: 0, + nItems: 51200, + sum: 0, + sumSquares: 0, + bucketRightEdges: [0, 1e-12, 1.7976931348623157e+308], + bucketCounts: [0, 51200, 0], + wall_time: '2017-01-25T02:30:11.257Z', + step: 0 + }; + const newHistogram = convertBins(h, 0, 0, 5); + const expectedHistogram = [ + {x: -1, dx: 0.4, y: 0}, {x: -0.6, dx: 0.4, y: 0}, + {x: -0.2, dx: 0.4, y: 51200}, {x: 0.2, dx: 0.4, y: 0}, + {x: 0.6, dx: 0.4, y: 0} + ]; + assertHistogramEquality(newHistogram, expectedHistogram); + }); + + it('Handles a right-most right edge that extends to very large number.', + () => { + const counts = [1, 2, 3]; + const rightEdges = [0, 1.0e-12, 1.0e14]; + const histogram = [ + {x: -1.0e-12, dx: 0.7e-12, y: 0.7}, {x: -0.3e-12, dx: 0.7e-12, y: 1.1}, + {x: 0.4e-12, dx: 0.7e-12, y: 4.2} + ]; + const newHistogram = convertBins( + { + bucketRightEdges: rightEdges, + bucketCounts: counts, + min: -1.0e-12, + max: 1.1e-12 + }, + -1.0e-12, 1.1e-12, 3); + assertHistogramEquality(newHistogram, histogram); + }); +}); diff --git a/tensorflow/tensorboard/components/tf_backend/test/behaviorTests.ts b/tensorflow/tensorboard/components/tf_backend/test/behaviorTests.ts index 42b6fad7fe8..6bf328140e2 100644 --- a/tensorflow/tensorboard/components/tf_backend/test/behaviorTests.ts +++ b/tensorflow/tensorboard/components/tf_backend/test/behaviorTests.ts @@ -12,147 +12,154 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
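The "two bins" expectation above is worth spelling out, since it shows what `convertBins` does: it resamples the variable-width TensorFlow buckets onto uniform bins spanning `[min, max]`, and the expected values are consistent with each bucket's count being split across new bins in proportion to overlap. A sketch using the same numbers as the test:

```ts
import {convertBins} from '../backend';

const bins = convertBins(
    {
      bucketRightEdges: [1.1e-12, 1.21e-12],  // two buckets of unequal width
      bucketCounts: [1, 2],
      min: 1.0e-12,
      max: 1.21e-12,
    },
    1.0e-12,   // min of the output domain
    1.21e-12,  // max of the output domain
    2);        // number of uniform output bins
// bins ~= [{x: 1.0e-12,   dx: 1.05e-13, y: 1.0909...},
//          {x: 1.105e-12, dx: 1.05e-13, y: 1.9090...}]
// The total count (3) is preserved across the resampling.
```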
==============================================================================*/ -var assert = chai.assert; + +import {Backend, getRuns, getTags, RunToTag} from '../backend' +import {BackendBehavior} from '../behavior' + declare function fixture(id: string): void; - module TF.Backend { - window.addEventListener('WebComponentsReady', function() { - Polymer({ - is: 'test-element', - behaviors: [TF.Backend.Behavior], - frontendReload: function() { - // no-op - }, - }); +window.addEventListener('WebComponentsReady', function() { + Polymer({ + is: 'test-element', + behaviors: [BackendBehavior], + frontendReload: function() { + // no-op + }, + }); +}); + +describe('data-behavior', function() { + let testElement; + let resolve; + let reject; + const fakeBackend = { + scalarTags() { + return new Promise((_resolve, _reject) => { + resolve = (x) => _resolve(x); + reject = (x) => _reject(x); }); + }, + scalar(x) { + return this; + }, + }; + beforeEach(function() { + testElement = fixture('testElementFixture'); + testElement.autoLoad = false; + testElement.backend = fakeBackend; + testElement.dataType = 'scalar'; + }); - describe('data-behavior', function() { - var testElement; - var resolve; - var reject; - var fakeBackend = { - scalarRuns: function() { - return new Promise(function(_resolve, _reject) { - resolve = (x) => _resolve(x); - reject = (x) => _reject(x); - }); - }, - scalar: function(x) { return this; }, - }; - beforeEach(function() { - testElement = fixture('testElementFixture'); - testElement.autoLoad = false; - testElement.backend = fakeBackend; - testElement.dataType = 'scalar'; + it('load states work as expected', function(done) { + chai.assert.equal(testElement.loadState, 'noload'); + var reloaded = testElement.reload(); + chai.assert.equal(testElement.loadState, 'pending'); + resolve(); + reloaded + .then(function() { + chai.assert.equal(testElement.loadState, 'loaded'); + var reloaded2 = testElement.reload(); + chai.assert.equal(testElement.loadState, 'pending'); + reject(); + return reloaded2; + }) + .then(function() { + chai.assert.equal(testElement.loadState, 'failure'); + done(); }); + }); - it('load states work as expected', function(done) { - assert.equal(testElement.loadState, 'noload'); - var reloaded = testElement.reload(); - assert.equal(testElement.loadState, 'pending'); - resolve(); - reloaded - .then(function() { - assert.equal(testElement.loadState, 'loaded'); - var reloaded2 = testElement.reload(); - assert.equal(testElement.loadState, 'pending'); - reject(); - return reloaded2; - }) - .then(function() { - assert.equal(testElement.loadState, 'failure'); - done(); - }); - }); + it('data provider set appropriately', function() { + chai.assert.deepEqual(testElement.dataProvider(), testElement.backend); + }); - it('data provider set appropriately', function() { - assert.deepEqual(testElement.dataProvider(), testElement.backend); - }); + it('loads data as expected', function(done) { + var r2t: RunToTag = { + run1: ['foo', 'bar', 'zod'], + run2: ['zoink', 'zow'], + run3: ['.'], + }; + var tags = getTags(r2t); + var runs = getRuns(r2t); + testElement.backend = fakeBackend; + testElement.dataType = 'scalar'; + testElement.reload().then(function(x) { + chai.assert.deepEqual(testElement.run2tag, r2t); + chai.assert.deepEqual(testElement.runs, runs); + chai.assert.deepEqual(testElement.tags, tags); + done(); + }); + resolve(r2t); + }); - it('loads data as expected', function(done) { - var r2t: RunToTag = { - run1: ['foo', 'bar', 'zod'], - run2: ['zoink', 'zow'], - run3: ['.'], - 
}; - var tags = TF.Backend.getTags(r2t); - var runs = TF.Backend.getRuns(r2t); - testElement.backend = fakeBackend; - testElement.dataType = 'scalar'; - testElement.reload().then(function(x) { - assert.deepEqual(testElement.run2tag, r2t); - assert.deepEqual(testElement.runs, runs); - assert.deepEqual(testElement.tags, tags); - done(); - }); - resolve(r2t); - }); + it('errors thrown on bad data types', function() { + testElement.backend = undefined; + chai.assert.throws(function() { + testElement.dataType = 'foo'; + }); + testElement.dataType = 'scalar'; + testElement.dataType = 'graph'; + testElement.dataType = 'histogram'; + }); - it('errors thrown on bad data types', function() { - testElement.backend = undefined; - assert.throws(function() { testElement.dataType = 'foo'; }); - testElement.dataType = 'scalar'; - testElement.dataType = 'graph'; - testElement.dataType = 'histogram'; - }); - - it('dataNotFound flag works', function(done) { - assert.isFalse(testElement.dataNotFound, 'initially false'); - var next = testElement.reload(); - assert.isFalse(testElement.dataNotFound, 'still false while pending'); - resolve({foo: [], bar: []}); - next.then(() => { - assert.isTrue(testElement.dataNotFound, 'true on empty data'); - var last = testElement.reload(); - assert.isTrue(testElement.dataNotFound, 'still true while pending'); - resolve({foo: ['bar'], bar: ['zod']}); - last.then(() => { - assert.isFalse( - testElement.dataNotFound, 'false now that we have data'); - done(); - }); - }); - }); - - it('reloads as soon as setup, if autoReload is true', function(done) { - var r2t = {foo: [], bar: []}; - var fakeBackend = { - scalarRuns: () => Promise.resolve(r2t), - scalar: () => null, - }; - testElement = fixture('testElementFixture'); - testElement.dataType = 'scalar'; - testElement.backend = fakeBackend; - setTimeout(() => { - assert.equal(testElement.run2tag, r2t); - done(); - }); - }); - - it('doesn\'t mutate props if backend returns same data', function( - done) { - var r2t_1 = {foo: ['1', '2'], bar: ['3', '4']}; - var r2t_2 = {foo: ['1', '2'], bar: ['3', '4']}; - var fakeBackend = { - scalarRuns: () => Promise.resolve(r2t_1), - scalar: () => null, - }; - testElement.backend = fakeBackend; - testElement.reload().then(() => { - fakeBackend.scalarRuns = () => Promise.resolve(r2t_2); - var tags = testElement.tags; - testElement.reload().then(() => { - // shallow equality ensures it wasn't recomputed - assert.equal(tags, testElement.tags, 'tags was not recomputed'); - done(); - }); - }); - - it('reload calls frontendReload', function(done) { - testElement.frontendReload = function() { done(); }; - testElement.reload(); - }); - - }); + it('dataNotFound flag works', function(done) { + chai.assert.isFalse(testElement.dataNotFound, 'initially false'); + var next = testElement.reload(); + chai.assert.isFalse(testElement.dataNotFound, 'still false while pending'); + resolve({foo: [], bar: []}); + next.then(() => { + chai.assert.isTrue(testElement.dataNotFound, 'true on empty data'); + var last = testElement.reload(); + chai.assert.isTrue(testElement.dataNotFound, 'still true while pending'); + resolve({foo: ['bar'], bar: ['zod']}); + last.then(() => { + chai.assert.isFalse( + testElement.dataNotFound, 'false now that we have data'); + done(); }); - } + }); + }); + + it('reloads as soon as setup, if autoReload is true', function(done) { + var r2t = {foo: [], bar: []}; + var fakeBackend = { + scalarTags: () => Promise.resolve(r2t), + scalar: () => null, + }; + testElement = fixture('testElementFixture'); 
+ testElement.dataType = 'scalar'; + testElement.backend = fakeBackend; + setTimeout(() => { + chai.assert.equal(testElement.run2tag, r2t); + done(); + }); + }); + + it('doesn\'t mutate props if backend returns same data', function(done) { + var r2t_1 = {foo: ['1', '2'], bar: ['3', '4']}; + var r2t_2 = {foo: ['1', '2'], bar: ['3', '4']}; + var fakeBackend = { + scalarTags: () => Promise.resolve(r2t_1), + scalar: () => null, + }; + testElement.backend = fakeBackend; + testElement.reload().then(() => { + fakeBackend.scalarTags = () => Promise.resolve(r2t_2); + var tags = testElement.tags; + testElement.reload().then(() => { + // shallow equality ensures it wasn't recomputed + chai.assert.equal(tags, testElement.tags, 'tags was not recomputed'); + done(); + }); + }); + }); + + // TODO(dandelion): Fix this test. + it('reload calls frontendReload', function(done) { + testElement.frontendReload = function() { + done(); + }; + testElement.reload(); + }); + +}); diff --git a/tensorflow/tensorboard/components/tf_backend/test/index.html b/tensorflow/tensorboard/components/tf_backend/test/index.html deleted file mode 100644 index 7f51861d25a..00000000000 --- a/tensorflow/tensorboard/components/tf_backend/test/index.html +++ /dev/null @@ -1,46 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/tensorflow/tensorboard/components/tf_backend/test/requestManagerTest.ts b/tensorflow/tensorboard/components/tf_backend/test/requestManagerTest.ts deleted file mode 100644 index b93e1569a45..00000000000 --- a/tensorflow/tensorboard/components/tf_backend/test/requestManagerTest.ts +++ /dev/null @@ -1,287 +0,0 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the 'License'); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an 'AS IS' BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -var assert = chai.assert; - -module TF.Backend { - interface MockRequest { - resolve: Function; - reject: Function; - id: number; - url: string; - } - - class MockedRequestManager extends TF.Backend.RequestManager { - private resolvers: Function[]; - private rejectors: Function[]; - public requestsDispatched: number; - - constructor(maxRequests = 10, maxRetries = 3) { - super(maxRequests, maxRetries); - this.resolvers = []; - this.rejectors = []; - this.requestsDispatched = 0; - } - - protected _promiseFromUrl(url) { - return new Promise((resolve, reject) => { - var mockJSON = { - ok: true, - json: function() { return url; }, - url: url, - status: 200, - }; - var mockFailedRequest: any = { - ok: false, - url: url, - status: 502, - }; - var mockFailure = new RequestNetworkError(mockFailedRequest, url); - this.resolvers.push(function() { resolve(mockJSON); }); - this.rejectors.push(function() { reject(mockFailure); }); - this.requestsDispatched++; - }); - } - - public resolveFakeRequest() { - this.resolvers.pop()(); - } - - public rejectFakeRequest() { - this.rejectors.pop()(); - } - - public dispatchAndResolve() { - // Wait for at least one request to be dispatched, then resolve it. 
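Taken together, the behavior tests above imply a small state machine for elements using `BackendBehavior`. A hedged summary in code form; the `element` and `backend` objects are hypothetical, while the state and property names are taken directly from the assertions:

```ts
// Hypothetical element and data provider, as in the fixture above.
declare const element: any;  // a Polymer element using BackendBehavior
declare const backend: any;  // must expose `${dataType}Tags()` and `${dataType}()`

// States asserted in the tests: 'noload' before any reload(), 'pending'
// while a reload() is in flight, then 'loaded' on success or 'failure'
// on rejection.
element.autoLoad = false;     // otherwise it reloads as soon as it is set up
element.backend = backend;
element.dataType = 'scalar';  // unregistered type names throw

element.reload().then(() => {
  if (element.dataNotFound) {
    // every run mapped to an empty tag list
  } else {
    // element.run2tag, element.runs, and element.tags are populated
  }
});
```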
- this.waitForDispatch(1).then(() => this.resolveFakeRequest()); - } - - public waitForDispatch(num) { - return waitForCondition(() => {return this.requestsDispatched >= num; }); - } - } - - /* Create a promise that returns when *check* returns true. */ - // May cause a test timeout if check never becomes true. - function waitForCondition(check: () => boolean): Promise { - return new Promise((resolve, reject) => { - var go = function() { - if (check()) { - resolve(); - } - setTimeout(go, 2); - }; - go(); - }); - } - - describe('backend', () => { - describe('request manager', () => { - it('request loads JSON properly', (done) => { - var rm = new TF.Backend.RequestManager(); - var promise = rm.request('data/example.json'); - promise.then( - (response) => { - assert.deepEqual(response, {foo: 3, bar: 'zoidberg'}); - done(); - }, - (reject) => { throw new Error(reject); }); - }); - - it('rejects on bad url', (done) => { - var rm = new TF.Backend.RequestManager(5, 0); - var bad_url = '_bad_url_which_doesnt_exist.json'; - var promise = rm.request(bad_url); - promise.then( - (success) => { - done(new Error('the promise should have rejected')); - }, - (reject: TF.Backend.RequestNetworkError) => { - assert.instanceOf(reject, TF.Backend.RequestNetworkError); - assert.include(reject.message, '404'); - assert.include(reject.message, bad_url); - assert.equal(reject.req.status, 404); - done(); - }); - }); - - it('can retry if requests fail', (done) => { - var rm = new MockedRequestManager(3, 5); - var r = rm.request('foo'); - rm.waitForDispatch(1).then(() => { - rm.rejectFakeRequest(); - return rm.waitForDispatch(2); - }).then(() => rm.resolveFakeRequest()); - r.then((success) => done()); - }); - - it('retries at most maxRetries times', (done) => { - var MAX_RETRIES = 2; - var rm = new MockedRequestManager(3, MAX_RETRIES); - var r = rm.request('foo'); - rm.waitForDispatch(1).then(() => { - rm.rejectFakeRequest(); - return rm.waitForDispatch(2); - }).then(() => { - rm.rejectFakeRequest(); - return rm.waitForDispatch(3); - }).then(() => { - rm.rejectFakeRequest(); - }); - - r.then( - (success) => done(new Error('The reqest should have failed')), - (failure) => done()); - }); - - it('requestManager only sends maxRequests requests at a time', (done) => { - var rm = new MockedRequestManager(3); - var requestsConcluded = 0; - var r0 = rm.request('1'); - var r1 = rm.request('2'); - var r2 = rm.request('3'); - var r3 = rm.request('4'); - assert.equal(rm.activeRequests(), 3, 'three requests are active'); - assert.equal(rm.outstandingRequests(), 4, 'four requests are pending'); - rm.waitForDispatch(3) - .then(() => { - assert.equal( - rm.activeRequests(), 3, - 'three requests are still active (1)'); - assert.equal( - rm.requestsDispatched, 3, 'three requests were dispatched'); - rm.resolveFakeRequest(); - return rm.waitForDispatch(4); - }) - .then(() => { - assert.equal( - rm.activeRequests(), 3, - 'three requests are still active (2)'); - assert.equal( - rm.requestsDispatched, 4, 'four requests were dispatched'); - assert.equal( - rm.outstandingRequests(), 3, 'three requests are pending'); - rm.resolveFakeRequest(); - rm.resolveFakeRequest(); - rm.resolveFakeRequest(); - return r3; - }) - .then(() => { - assert.equal(rm.activeRequests(), 0, 'all requests finished'); - assert.equal(rm.outstandingRequests(), 0, 'no requests pending'); - done(); - }); - }); - - it('queue continues after failures', (done) => { - var rm = new MockedRequestManager(1, 0); - var r0 = rm.request('1'); - var r1 = rm.request('2'); - 
rm.waitForDispatch(1).then(() => { - rm.rejectFakeRequest(); - }); - - r0.then( - (success) => done(new Error('r0 should have failed')), - (failure) => 'unused_argument') - .then(() => rm.resolveFakeRequest()); - - // When the first request rejects, it should decrement nActiveRequests - // and then launch remaining requests in queue (i.e. this one) - r1.then((success) => done(), - (failure) => done(new Error(failure))); - }); - - it('queue is LIFO', (done) => { - /* This test is a bit tricky. - * We want to verify that the RequestManager queue has LIFO semantics. - * So we construct three requests off the bat: A, B, C. - * So LIFO semantics ensure these will resolve in order A, C, B. - * (Because the A request launches immediately when we create it, it's - * not in queue) - * Then after resolving A, C moves out of queue, and we create X. - * So expected final order is A, C, X, B. - * We verify this with an external var that counts how many requests were - * resolved. - */ - var rm = new MockedRequestManager(1); - var nResolved = 0; - function assertResolutionOrder(expectedSpotInSequence) { - return function() { - nResolved++; - assert.equal(expectedSpotInSequence, nResolved); - }; - } - - function launchThirdRequest() { - rm.request('started late but goes third') - .then(assertResolutionOrder(3)) - .then(() => rm.dispatchAndResolve()); - } - - rm.request('first') - .then(assertResolutionOrder( - 1)) // Assert that this one resolved first - .then(launchThirdRequest) - .then(() => rm.dispatchAndResolve()); // then trigger the next one - - rm.request('this one goes fourth') // created second, will go last - .then(assertResolutionOrder( - 4)) // assert it was the fourth to get resolved - .then(done); // finish the test - - rm.request('second') - .then(assertResolutionOrder(2)) - .then(() => rm.dispatchAndResolve()); - - rm.dispatchAndResolve(); - }); - - it('requestManager can clear queue', (done) => { - var rm = new MockedRequestManager(1); - var requestsResolved = 0; - var requestsRejected = 0; - var success = () => requestsResolved++; - var failure = (err) => { - assert.equal(err.name, 'RequestCancellationError'); - requestsRejected++; - }; - var finishTheTest = () => { - assert.equal(rm.activeRequests(), 0, 'no requests still active'); - assert.equal( - rm.requestsDispatched, 1, 'only one req was ever dispatched'); - assert.equal(rm.outstandingRequests(), 0, 'no pending requests'); - assert.equal(requestsResolved, 1, 'one request got resolved'); - assert.equal( - requestsRejected, 4, 'four were cancelled and threw errors'); - done(); - }; - rm.request('0').then(success, failure).then(finishTheTest); - rm.request('1').then(success, failure); - rm.request('2').then(success, failure); - rm.request('3').then(success, failure); - rm.request('4').then(success, failure); - assert.equal(rm.activeRequests(), 1, 'one req is active'); - rm.waitForDispatch(1).then(() => { - assert.equal(rm.activeRequests(), 1, 'one req is active'); - assert.equal(rm.requestsDispatched, 1, 'one req was dispatched'); - assert.equal(rm.outstandingRequests(), 5, 'five reqs outstanding'); - rm.clearQueue(); - rm.resolveFakeRequest(); - // resolving the first request triggers finishTheTest - }); - }); - }); - }); -} diff --git a/tensorflow/tensorboard/components/tf_backend/test/requestManagerTests.ts b/tensorflow/tensorboard/components/tf_backend/test/requestManagerTests.ts new file mode 100644 index 00000000000..3800e6e4021 --- /dev/null +++ b/tensorflow/tensorboard/components/tf_backend/test/requestManagerTests.ts @@ -0,0 
+1,294 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the 'License'); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an 'AS IS' BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +import {RequestManager, RequestNetworkError} from '../requestManager'; + +interface MockRequest { + resolve: Function; + reject: Function; + id: number; + url: string; +} + +class MockedRequestManager extends RequestManager { + private resolvers: Function[]; + private rejectors: Function[]; + public requestsDispatched: number; + constructor(maxRequests = 10, maxRetries = 3) { + super(maxRequests, maxRetries); + this.resolvers = []; + this.rejectors = []; + this.requestsDispatched = 0; + } + protected _promiseFromUrl(url) { + return new Promise((resolve, reject) => { + const mockJSON = { + ok: true, + json() { + return url; + }, + url, + status: 200, + }; + const mockFailedRequest: any = { + ok: false, + url, + status: 502, + }; + const mockFailure = new RequestNetworkError(mockFailedRequest, url); + this.resolvers.push(() => { + resolve(mockJSON); + }); + this.rejectors.push(() => { + reject(mockFailure); + }); + this.requestsDispatched++; + }); + } + public resolveFakeRequest() { + this.resolvers.pop()(); + } + public rejectFakeRequest() { + this.rejectors.pop()(); + } + public dispatchAndResolve() { + // Wait for at least one request to be dispatched, then resolve it. + this.waitForDispatch(1).then(() => this.resolveFakeRequest()); + } + public waitForDispatch(num) { + return waitForCondition(() => { + return this.requestsDispatched >= num; + }); + } +} + +/** Create a promise that returns when *check* returns true. + * May cause a test timeout if check never becomes true. 
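A minimal sketch of driving the `MockedRequestManager` defined above, assuming only the members shown in this diff (`request`, `waitForDispatch`, `resolveFakeRequest`, `requestsDispatched`):

```ts
// With maxRequests = 1, the second request stays queued until the first
// fake request is resolved.
const rm = new MockedRequestManager(/* maxRequests */ 1, /* maxRetries */ 0);
const first = rm.request('a');
const second = rm.request('b');  // queued; only one request may be active

rm.waitForDispatch(1)
    .then(() => rm.resolveFakeRequest())   // completes 'a'; 'b' dispatches
    .then(() => rm.waitForDispatch(2))
    .then(() => rm.resolveFakeRequest());  // completes 'b'

Promise.all([first, second]).then(() => {
  console.log(rm.requestsDispatched);  // 2
});
```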
+ */ + +function waitForCondition(check: () => boolean): Promise { + return new Promise((resolve, reject) => { + const go = () => { + if (check()) { + resolve(); + } + setTimeout(go, 2); + }; + go(); + }); +} + +describe('backend', () => { + describe('request manager', () => { + it('request loads JSON properly', (done) => { + const rm = new RequestManager(); + const promise = rm.request('data/example.json'); + promise.then( + (response) => { + chai.assert.deepEqual(response, {foo: 3, bar: 'zoidberg'}); + done(); + }, + (reject) => { + throw new Error(reject); + }); + }); + + it('rejects on bad url', (done) => { + const rm = new RequestManager(5, 0); + const badUrl = '_bad_url_which_doesnt_exist.json'; + const promise = rm.request(badUrl); + promise.then( + (success) => { + done(new Error('the promise should have rejected')); + }, + (reject: RequestNetworkError) => { + chai.assert.include(reject.message, '404'); + chai.assert.include(reject.message, badUrl); + chai.assert.equal(reject.req.status, 404); + done(); + }); + }); + + it('can retry if requests fail', (done) => { + const rm = new MockedRequestManager(3, 5); + const r = rm.request('foo'); + rm.waitForDispatch(1) + .then(() => { + rm.rejectFakeRequest(); + return rm.waitForDispatch(2); + }) + .then(() => rm.resolveFakeRequest()); + r.then((success) => done()); + }); + + it('retries at most maxRetries times', (done) => { + const MAX_RETRIES = 2; + const rm = new MockedRequestManager(3, MAX_RETRIES); + const r = rm.request('foo'); + rm.waitForDispatch(1) + .then(() => { + rm.rejectFakeRequest(); + return rm.waitForDispatch(2); + }) + .then(() => { + rm.rejectFakeRequest(); + return rm.waitForDispatch(3); + }) + .then(() => { + rm.rejectFakeRequest(); + }); + + r.then( + (success) => done(new Error('The request should have failed')), + (failure) => done()); + }); + + it('requestManager only sends maxRequests requests at a time', (done) => { + const rm = new MockedRequestManager(3); + const r0 = rm.request('1'); + const r1 = rm.request('2'); + const r2 = rm.request('3'); + const r3 = rm.request('4'); + chai.assert.equal(rm.activeRequests(), 3, 'three requests are active'); + chai.assert.equal( + rm.outstandingRequests(), 4, 'four requests are pending'); + rm.waitForDispatch(3) + .then(() => { + chai.assert.equal( + rm.activeRequests(), 3, 'three requests are still active (1)'); + chai.assert.equal( + rm.requestsDispatched, 3, 'three requests were dispatched'); + rm.resolveFakeRequest(); + return rm.waitForDispatch(4); + }) + .then(() => { + chai.assert.equal( + rm.activeRequests(), 3, 'three requests are still active (2)'); + chai.assert.equal( + rm.requestsDispatched, 4, 'four requests were dispatched'); + chai.assert.equal( + rm.outstandingRequests(), 3, 'three requests are pending'); + rm.resolveFakeRequest(); + rm.resolveFakeRequest(); + rm.resolveFakeRequest(); + return r3; + }) + .then(() => { + chai.assert.equal(rm.activeRequests(), 0, 'all requests finished'); + chai.assert.equal( + rm.outstandingRequests(), 0, 'no requests pending'); + done(); + }); + }); + + it('queue continues after failures', (done) => { + const rm = new MockedRequestManager(1, 0); + const r0 = rm.request('1'); + const r1 = rm.request('2'); + rm.waitForDispatch(1).then(() => { + rm.rejectFakeRequest(); + }); + + r0.then( + (success) => done(new Error('r0 should have failed')), + (failure) => 'unused_argument') + .then(() => rm.resolveFakeRequest()); + + // When the first request rejects, it should decrement nActiveRequests + // and then launch remaining 
requests in queue (i.e. this one) + r1.then((success) => done(), (failure) => done(new Error(failure))); + }); + + it('queue is LIFO', (done) => { + /* This test is a bit tricky. + * We want to verify that the RequestManager queue has LIFO semantics. + * So we construct three requests off the bat: A, B, C. + * So LIFO semantics ensure these will resolve in order A, C, B. + * (Because the A request launches immediately when we create it, it's + * not in queue) + * Then after resolving A, C moves out of queue, and we create X. + * So expected final order is A, C, X, B. + * We verify this with an external var that counts how many requests were + * resolved. + */ + const rm = new MockedRequestManager(1); + let nResolved = 0; + function assertResolutionOrder(expectedSpotInSequence) { + return () => { + nResolved++; + chai.assert.equal(expectedSpotInSequence, nResolved); + }; + } + + function launchThirdRequest() { + rm.request('started late but goes third') + .then(assertResolutionOrder(3)) + .then(() => rm.dispatchAndResolve()); + } + + rm.request('first') + .then( + assertResolutionOrder(1)) // Assert that this one resolved first + .then(launchThirdRequest) + .then(() => rm.dispatchAndResolve()); // then trigger the next one + + rm.request('this one goes fourth') // created second, will go last + .then(assertResolutionOrder( + 4)) // assert it was the fourth to get resolved + .then(done); // finish the test + + rm.request('second') + .then(assertResolutionOrder(2)) + .then(() => rm.dispatchAndResolve()); + + rm.dispatchAndResolve(); + }); + + it('requestManager can clear queue', (done) => { + const rm = new MockedRequestManager(1); + let requestsResolved = 0; + let requestsRejected = 0; + const success = () => requestsResolved++; + const failure = (err) => { + chai.assert.equal(err.name, 'RequestCancellationError'); + requestsRejected++; + }; + const finishTheTest = () => { + chai.assert.equal(rm.activeRequests(), 0, 'no requests still active'); + chai.assert.equal( + rm.requestsDispatched, 1, 'only one req was ever dispatched'); + chai.assert.equal(rm.outstandingRequests(), 0, 'no pending requests'); + chai.assert.equal(requestsResolved, 1, 'one request got resolved'); + chai.assert.equal( + requestsRejected, 4, 'four were cancelled and threw errors'); + done(); + }; + rm.request('0').then(success, failure).then(finishTheTest); + rm.request('1').then(success, failure); + rm.request('2').then(success, failure); + rm.request('3').then(success, failure); + rm.request('4').then(success, failure); + chai.assert.equal(rm.activeRequests(), 1, 'one req is active'); + rm.waitForDispatch(1).then(() => { + chai.assert.equal(rm.activeRequests(), 1, 'one req is active'); + chai.assert.equal(rm.requestsDispatched, 1, 'one req was dispatched'); + chai.assert.equal(rm.outstandingRequests(), 5, 'five reqs outstanding'); + rm.clearQueue(); + rm.resolveFakeRequest(); + // resolving the first request triggers finishTheTest + }); + }); + }); +}); diff --git a/tensorflow/tensorboard/components/tf_backend/test/tests.html b/tensorflow/tensorboard/components/tf_backend/test/tests.html new file mode 100644 index 00000000000..58cb89a30b6 --- /dev/null +++ b/tensorflow/tensorboard/components/tf_backend/test/tests.html @@ -0,0 +1,37 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/tensorflow/tensorboard/components/tf_backend/tf-backend.html b/tensorflow/tensorboard/components/tf_backend/tf-backend.html index 0e07c7fdb1e..c2a44b3b63f 100644 --- a/tensorflow/tensorboard/components/tf_backend/tf-backend.html +++ 
b/tensorflow/tensorboard/components/tf_backend/tf-backend.html
@@ -23,5 +23,6 @@ limitations under the License.
+
diff --git a/tensorflow/tensorboard/components/tf_backend/urlPathHelpers.ts b/tensorflow/tensorboard/components/tf_backend/urlPathHelpers.ts
index 7c59eafb448..62519dac5ca 100644
--- a/tensorflow/tensorboard/components/tf_backend/urlPathHelpers.ts
+++ b/tensorflow/tensorboard/components/tf_backend/urlPathHelpers.ts
@@ -12,31 +12,29 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-module TF.Backend {
-  export var BAD_CHARACTERS = '#%&{}\\/<>*? $!\'":@+`|=() ';
-  /** Cleanup a url so that it can be loaded from a filesystem. */
-  export function demoify(s) {
-    // for consistency with python's urllib.urlencode
-    s = s.replace(new RegExp('%20', 'g'), '+');
-    for (var i = 0; i < BAD_CHARACTERS.length; i++) {
-      var c = BAD_CHARACTERS[i];
-      s = s.replace(new RegExp('\\' + c, 'g'), '_');
-    }
-    return s;
-  }
-
-  export function queryEncoder(params?: any): string {
-    // It's important that the keys be sorted, so we always grab the right file
-    // if we are talking to the backend generated by serialze_tensorboard.py
-    if (params == null) {
-      return '';
-    }
-    var components = _.keys(params)
-                         .sort()
-                         .filter((k) => params[k] !== undefined)
-                         .map((k) => k + '=' + encodeURIComponent(params[k]));
-    var result = components.length ? '?' + components.join('&') : '';
-    // Replace parens for consistency with urllib.urlencode
-    return result.replace(/\(/g, '%28').replace(/\)/g, '%29');
+export const BAD_CHARACTERS = '#%&{}\\/<>*? $!\'":@+`|=() ';
+/** Clean up a URL so that it can be loaded from a filesystem. */
+export function demoify(s) {
+  // for consistency with Python's urllib.urlencode
+  s = s.replace(new RegExp('%20', 'g'), '+');
+  for (let i = 0; i < BAD_CHARACTERS.length; i++) {
+    const c = BAD_CHARACTERS[i];
+    s = s.replace(new RegExp('\\' + c, 'g'), '_');
 }
+  return s;
+}
+
+export function queryEncoder(params?: any): string {
+  // It's important that the keys be sorted, so we always grab the right file
+  // if we are talking to the backend generated by serialize_tensorboard.py
+  if (params == null) {
+    return '';
+  }
+  const components = _.keys(params)
+                         .sort()
+                         .filter((k) => params[k] !== undefined)
+                         .map((k) => k + '=' + encodeURIComponent(params[k]));
+  const result = components.length ? '?'
+ components.join('&') : ''; + // Replace parens for consistency with urllib.urlencode + return result.replace(/\(/g, '%28').replace(/\)/g, '%29'); } diff --git a/tensorflow/tensorboard/components/tf_color_scale/BUILD b/tensorflow/tensorboard/components/tf_color_scale/BUILD new file mode 100644 index 00000000000..730ab37d6f7 --- /dev/null +++ b/tensorflow/tensorboard/components/tf_color_scale/BUILD @@ -0,0 +1,39 @@ +package(default_visibility = ["//tensorflow/tensorboard:internal"]) + +load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library") + +licenses(["notice"]) # Apache 2.0 + +ts_web_library( + name = "tf_color_scale", + srcs = [ + "colorScale.ts", + "palettes.ts", + "tf-color-scale.html", + ], + path = "/tf-color-scale", + deps = [ + "//tensorflow/tensorboard/components/tf_imports:d3", + "//tensorflow/tensorboard/components/tf_imports:polymer", + ], +) + +ts_web_library( + name = "demo", + srcs = ["index.html"], + path = "/tf-color-scale", + deps = [ + ":tf_color_scale", + "//tensorflow/tensorboard/components/tf_imports:d3", + "//tensorflow/tensorboard/components/tf_imports:webcomponentsjs", + "@org_polymer_iron_demo_helpers", + "@org_polymer_paper_button", + "@org_polymer_paper_styles", + ], +) + +filegroup( + name = "all_files", + srcs = glob(["**"]), + tags = ["notsan"], +) diff --git a/tensorflow/tensorboard/components/tf_color_scale/colorScale.ts b/tensorflow/tensorboard/components/tf_color_scale/colorScale.ts index c05d9765335..e20a65cdd84 100644 --- a/tensorflow/tensorboard/components/tf_color_scale/colorScale.ts +++ b/tensorflow/tensorboard/components/tf_color_scale/colorScale.ts @@ -15,49 +15,75 @@ limitations under the License. // Example usage: // runs = ["train", "test", "test1", "test2"] -// ccs = new TF.ColorScale(); +// ccs = new ColorScale(); // ccs.domain(runs); // ccs.getColor("train"); // ccs.getColor("test1"); -module TF { - export class ColorScale { - private palette: string[]; - private identifiers = d3.map(); +import {palettes} from './palettes'; - /** - * Creates a color scale with optional custom palette. - * @param {string[]} [palette=TF.palettes.googleColorBlind] - The color - * palette you want as an Array of hex strings. - */ - constructor(palette: string[] = TF.palettes.googleColorBlindAssist) { - this.palette = palette; - } +export class ColorScale { + private identifiers = d3.map(); - /** - * Set the domain of strings. - * @param {string[]} strings - An array of possible strings to use as the - * domain for your scale. - */ - public domain(strings: string[]): this { - this.identifiers = d3.map(); - strings.forEach((s, i) => { - this.identifiers.set(s, this.palette[i % this.palette.length]); - }); - return this; - } + /** + * Creates a color scale with optional custom palette. + * @param {Array} [palette=palettes.googleColorBlind] - The color + * palette you want as an Array of hex strings. + */ + constructor( + private readonly palette: string[] = palettes.googleColorBlindAssist) {} - /** - * Use the color scale to transform an element in the domain into a color. - * @param {string} The input string to map to a color. - * @return {string} The color corresponding to that input string. - * @throws Will error if input string is not in the scale's domain. - */ - public scale(s: string): string { - if (!this.identifiers.has(s)) { - throw new Error('String was not in the domain.'); - } - return this.identifiers.get(s) as string; + /** + * Set the domain of strings. 
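Since `demoify` and `queryEncoder` (just above) cooperate in demo mode, a worked example may help. The `tag`/`run` values are hypothetical, and the expected strings in the comments are hand-computed from the two implementations:

```ts
import {demoify, queryEncoder} from './urlPathHelpers';

const query = queryEncoder({tag: 'loss (raw)', run: 'train'});
// => '?run=train&tag=loss%20%28raw%29'
// Keys are sorted; encodeURIComponent leaves '(' and ')' alone, so the
// parens are replaced explicitly for parity with urllib.urlencode.

demoify(query);
// => '_run_train_tag_loss__28raw_29'
// '%20' first becomes '+', then every BAD_CHARACTERS entry becomes '_',
// yielding a name that can be served from a plain filesystem in demo mode.
```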
+ * @param {Array} strings - An array of possible strings to use as the + * domain for your scale. + */ + public domain(strings: string[]): this { + this.identifiers = d3.map(); + + // TODO(wchargin): Remove this call to `sort` once we have only a + // singleton ColorScale, linked directly to the RunsStore, which + // will always give sorted output. + strings = strings.slice(); + strings.sort(); + + strings.forEach((s, i) => { + this.identifiers.set(s, this.palette[i % this.palette.length]); + }); + return this; + } + + /** + * Use the color scale to transform an element in the domain into a color. + * @param {string} The input string to map to a color. + * @return {string} The color corresponding to that input string. + * @throws Will error if input string is not in the scale's domain. + */ + public scale(s: string): string { + if (!this.identifiers.has(s)) { + throw new Error('String was not in the domain.'); } + return this.identifiers.get(s) as string; } } + +Polymer({ + is: 'tf-color-scale', + properties: { + runs: { + type: Array, + }, + outColorScale: { + type: Object, + readOnly: true, + notify: true, + value() { + return new ColorScale(); + }, + }, + }, + observers: ['updateColorScale(runs.*)'], + updateColorScale(runsChange) { + this.outColorScale.domain(this.runs); + }, +}); diff --git a/tensorflow/tensorboard/components/tf_color_scale/index.html b/tensorflow/tensorboard/components/tf_color_scale/index.html new file mode 100644 index 00000000000..81dfab098c6 --- /dev/null +++ b/tensorflow/tensorboard/components/tf_color_scale/index.html @@ -0,0 +1,94 @@ + + + + + +tf-color-scale demo + + + + + + + + + + + diff --git a/tensorflow/tensorboard/components/tf_color_scale/palettes.ts b/tensorflow/tensorboard/components/tf_color_scale/palettes.ts index c53ed599ae9..ce42a115458 100644 --- a/tensorflow/tensorboard/components/tf_color_scale/palettes.ts +++ b/tensorflow/tensorboard/components/tf_color_scale/palettes.ts @@ -13,68 +13,64 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -module TF { - export const palettes = { - googleStandard: [ - '#db4437', // google red 500 - '#ff7043', // deep orange 400 - '#f4b400', // google yellow 500 - '#0f9d58', // google green 500 - '#00796b', // teal 700 - '#00acc1', // cyan 600 - '#4285f4', // google blue 500 - '#5c6bc0', // indigo 400 - '#ab47bc' // purple 400 - ], - googleCool: [ - '#9e9d24', // lime 800 - '#0f9d58', // google green 500 - '#00796b', // teal 700 - '#00acc1', // cyan 600 - '#4285f4', // google blue 500 - '#5c6bc0', // indigo 400 - '#607d8b' // blue gray 500 - ], - googleWarm: [ - '#795548', // brown 500 - '#ab47bc', // purple 400 - '#f06292', // pink 300 - '#c2185b', // pink 700 - '#db4437', // google red 500 - '#ff7043', // deep orange 400 - '#f4b400' // google yellow 700 - ], - googleColorBlindAssist: [ - '#ff7043', // orange - '#00ACC1', // dark cyan - '#AB47BC', // bright purple - '#2A56C6', // dark blue - '#0b8043', // green - '#F7CB4D', // yellow - '#c0ca33', // lime - '#5e35b1', // purple - '#A52714', // red - ], - // These palettes try to be better for color differentiation. 
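A short usage sketch of the reworked `ColorScale`; the run names are hypothetical, and the stability claim follows from the new `sort` call in `domain` flagged by the TODO above:

```ts
import {ColorScale} from './colorScale';
import {palettes} from './palettes';

const ccs = new ColorScale(palettes.googleStandard);  // palette is optional
ccs.domain(['train', 'eval', 'test']);
const c1 = ccs.scale('train');

// domain() now sorts its input, so the same set of runs yields the same
// color assignment regardless of the order in which they arrive.
ccs.domain(['test', 'train', 'eval']);
const c2 = ccs.scale('train');
console.log(c1 === c2);  // true

// ccs.scale('nope');    // would throw: 'String was not in the domain.'
```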
- // https://personal.sron.nl/~pault/ - colorBlindAssist1: - ['#4477aa', '#44aaaa', '#aaaa44', '#aa7744', '#aa4455', '#aa4488'], - colorBlindAssist2: [ - '#88ccee', '#44aa99', '#117733', '#999933', '#ddcc77', '#cc6677', - '#882255', '#aa4499' - ], - colorBlindAssist3: [ - '#332288', '#6699cc', '#88ccee', '#44aa99', '#117733', '#999933', - '#ddcc77', '#cc6677', '#aa4466', '#882255', '#661100', '#aa4499' - ], - // based on this palette: http://mkweb.bcgsc.ca/biovis2012/ - colorBlindAssist4: [ - '#FF6DB6', '#920000', '#924900', '#DBD100', '#24FF24', '#006DDB', - '#490092' - ], - mldash: [ - '#E47EAD', '#F4640D', '#FAA300', '#F5E636', '#00A077', '#0077B8', - '#00B7ED' - ] - }; -} +export const palettes = { + googleStandard: [ + '#db4437', // google red 500 + '#ff7043', // deep orange 400 + '#f4b400', // google yellow 500 + '#0f9d58', // google green 500 + '#00796b', // teal 700 + '#00acc1', // cyan 600 + '#4285f4', // google blue 500 + '#5c6bc0', // indigo 400 + '#ab47bc' // purple 400 + ], + googleCool: [ + '#9e9d24', // lime 800 + '#0f9d58', // google green 500 + '#00796b', // teal 700 + '#00acc1', // cyan 600 + '#4285f4', // google blue 500 + '#5c6bc0', // indigo 400 + '#607d8b' // blue gray 500 + ], + googleWarm: [ + '#795548', // brown 500 + '#ab47bc', // purple 400 + '#f06292', // pink 300 + '#c2185b', // pink 700 + '#db4437', // google red 500 + '#ff7043', // deep orange 400 + '#f4b400' // google yellow 700 + ], + googleColorBlindAssist: [ + '#ff7043', // orange + '#00ACC1', // dark cyan + '#AB47BC', // bright purple + '#2A56C6', // dark blue + '#0b8043', // green + '#F7CB4D', // yellow + '#c0ca33', // lime + '#5e35b1', // purple + '#A52714', // red + ], + // These palettes try to be better for color differentiation. + // https://personal.sron.nl/~pault/ + colorBlindAssist1: + ['#4477aa', '#44aaaa', '#aaaa44', '#aa7744', '#aa4455', '#aa4488'], + colorBlindAssist2: [ + '#88ccee', '#44aa99', '#117733', '#999933', '#ddcc77', '#cc6677', '#882255', + '#aa4499' + ], + colorBlindAssist3: [ + '#332288', '#6699cc', '#88ccee', '#44aa99', '#117733', '#999933', '#ddcc77', + '#cc6677', '#aa4466', '#882255', '#661100', '#aa4499' + ], + // based on this palette: http://mkweb.bcgsc.ca/biovis2012/ + colorBlindAssist4: [ + '#FF6DB6', '#920000', '#924900', '#DBD100', '#24FF24', '#006DDB', '#490092' + ], + mldash: [ + '#E47EAD', '#F4640D', '#FAA300', '#F5E636', '#00A077', '#0077B8', '#00B7ED' + ] +}; diff --git a/tensorflow/tensorboard/components/tf_color_scale/test/BUILD b/tensorflow/tensorboard/components/tf_color_scale/test/BUILD new file mode 100644 index 00000000000..331783f3c76 --- /dev/null +++ b/tensorflow/tensorboard/components/tf_color_scale/test/BUILD @@ -0,0 +1,30 @@ +package( + default_testonly = True, + default_visibility = ["//tensorflow/tensorboard:internal"], +) + +load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library") + +licenses(["notice"]) # Apache 2.0 + +ts_web_library( + name = "test", + srcs = [ + "colorScaleTests.ts", + "tests.html", + ], + path = "/tf-color-scale/test", + deps = [ + "//tensorflow/tensorboard/components/tf_color_scale", + "//tensorflow/tensorboard/components/tf_imports:polymer", + "//tensorflow/tensorboard/components/tf_imports:web_component_tester", + "//tensorflow/tensorboard/components/tf_imports:webcomponentsjs", + ], +) + +filegroup( + name = "all_files", + testonly = 0, + srcs = glob(["**"]), + tags = ["notsan"], +) diff --git a/tensorflow/tensorboard/components/tf_color_scale/test/colorScaleTests.ts 
b/tensorflow/tensorboard/components/tf_color_scale/test/colorScaleTests.ts index 700a01848b6..78824a772c3 100644 --- a/tensorflow/tensorboard/components/tf_color_scale/test/colorScaleTests.ts +++ b/tensorflow/tensorboard/components/tf_color_scale/test/colorScaleTests.ts @@ -13,34 +13,36 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -module TF { - let assert = chai.assert; +let assert = chai.assert; - describe('ColorScale', function() { - let ccs: ColorScale; +import {ColorScale} from '../colorScale' - beforeEach(function() { ccs = new ColorScale(); }); +describe('ColorScale', function() { + let ccs: ColorScale; - it('Returns consistent colors', function() { - ccs.domain(['train', 'eval', 'test']); - let trainColor = ccs.scale('train'); - let trainColor2 = ccs.scale('train'); - assert.equal(trainColor, trainColor2); - }); - - it('Returns consistent colors after new domain', function() { - ccs.domain(['train', 'eval']); - let trainColor = ccs.scale('train'); - ccs.domain(['train', 'eval', 'test']); - let trainColor2 = ccs.scale('train'); - assert.equal(trainColor, trainColor2); - }); - - it('Throws an error if string is not in the domain', function() { - ccs.domain(['red', 'yellow', 'green']); - assert.throws(function() { - ccs.scale('not in domain'); - }, 'String was not in the domain.'); - }); + beforeEach(function() { + ccs = new ColorScale(); }); -} + + it('Returns consistent colors', function() { + ccs.domain(['train', 'eval', 'test']); + let trainColor = ccs.scale('train'); + let trainColor2 = ccs.scale('train'); + assert.equal(trainColor, trainColor2); + }); + + it('Returns consistent colors after new domain', function() { + ccs.domain(['train', 'eval']); + let trainColor = ccs.scale('train'); + ccs.domain(['train', 'eval', 'test']); + let trainColor2 = ccs.scale('train'); + assert.equal(trainColor, trainColor2); + }); + + it('Throws an error if string is not in the domain', function() { + ccs.domain(['red', 'yellow', 'green']); + assert.throws(function() { + ccs.scale('not in domain'); + }, 'String was not in the domain.'); + }); +}); diff --git a/tensorflow/tensorboard/components/tf_color_scale/test/index.html b/tensorflow/tensorboard/components/tf_color_scale/test/index.html deleted file mode 100644 index 9a2a174349c..00000000000 --- a/tensorflow/tensorboard/components/tf_color_scale/test/index.html +++ /dev/null @@ -1,31 +0,0 @@ - - - - - - - - - - - - - - - - diff --git a/tensorflow/tensorboard/components/tf_color_scale/test/tests.html b/tensorflow/tensorboard/components/tf_color_scale/test/tests.html new file mode 100644 index 00000000000..59c802d02bf --- /dev/null +++ b/tensorflow/tensorboard/components/tf_color_scale/test/tests.html @@ -0,0 +1,24 @@ + + + + + + + + + diff --git a/tensorflow/tensorboard/components/tf_color_scale/tf-color-scale.html b/tensorflow/tensorboard/components/tf_color_scale/tf-color-scale.html index 743996f6241..a325f0a04cd 100644 --- a/tensorflow/tensorboard/components/tf_color_scale/tf-color-scale.html +++ b/tensorflow/tensorboard/components/tf_color_scale/tf-color-scale.html @@ -16,6 +16,7 @@ limitations under the License. 
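The rewritten tests above import `ColorScale` directly instead of reaching through the `TF` namespace. One behavior they do not pin down is palette cycling via `palette[i % palette.length]`; a hedged sketch of an additional test in the same wct/chai style (not part of this change, and relying on the same ambient `chai` and `d3` globals the existing test uses):

```ts
import {ColorScale} from '../colorScale';

describe('ColorScale palette cycling', function() {
  it('reuses colors once the domain outgrows the palette', function() {
    const ccs = new ColorScale();
    // Zero-pad so the sorted domain order matches insertion order.
    const runs: string[] = [];
    for (let i = 0; i < 20; i++) {
      runs.push('run' + ('0' + i).slice(-2));
    }
    ccs.domain(runs);
    // With 20 runs and assignment by index modulo palette length, at
    // least two runs must share a color for any palette defined above.
    const colors = runs.map((r) => ccs.scale(r));
    chai.assert.isBelow(d3.set(colors).size(), runs.length);
  });
});
```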
--> + - + - - + diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/tf-categorizer.ts b/tensorflow/tensorboard/components/tf_dashboard_common/tf-categorizer.ts new file mode 100644 index 00000000000..0eaf852ff13 --- /dev/null +++ b/tensorflow/tensorboard/components/tf_dashboard_common/tf-categorizer.ts @@ -0,0 +1,189 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the 'License'); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an 'AS IS' BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +import {compareTagNames} from '../vz-sorting/sorting'; + +/** + * This module contains methods that allow sorting tags into 'categories'. + * A category contains a name and a list of tags. + * The sorting strategy is defined by a 'CustomCategorization', which contains + * 'categoryDefinitions' which are regex rules used to construct a category. + * E.g. the regex rule 'xent' will create a category called 'xent' that + * contains values whose tags match the regex. + * + * After custom categories are evaluated, the tags are sorted by a hardcoded + * fallback categorizer, which may, for example, group tags into categories + * based on their top namespace. + */ + +export interface Category { + // Categories that data is sorted into + name: string; + tags: string[]; +} + +export interface CustomCategorization { + // Defines a categorization strategy + categoryDefinitions: string[]; + fallbackCategorizer: string; + /* {'TopLevelNamespaceCategorizer', + 'LegacyUnderscoreCategorizer'} */ +} + +export interface Categorizer { + // Function that generates categories + (tags: string[]): Category[]; +} + +/* Canonical TensorFlow ops are namespaced using forward slashes. + * This fallback categorizer categorizes by the top-level namespace. + */ +export var topLevelNamespaceCategorizer: Categorizer = splitCategorizer(/\//); + +export function fallbackCategorizer(s: string): Categorizer { + switch (s) { + case 'TopLevelNamespaceCategorizer': + return topLevelNamespaceCategorizer; + default: + throw new Error('Unrecognized categorization strategy: ' + s); + } +} + +/* An 'extractor' is a function that takes a tag name, and 'extracts' a + * category name. + * This function takes an extractor, and produces a categorizer. + * Currently, it is just used for the fallbackCategorizer, but we may want to + * refactor the general categorization logic to use the concept of extractors. + */ +function extractorToCategorizer(extractor: (s: string) => string): Categorizer { + return (tags: string[]): Category[] => { + if (tags.length === 0) { + return []; + } + + // Maps between top-level name and category. We use the mapping to avoid + // duplicating categories per run. 
+ const categoryMapping: {[key: string]: Category} = {}; + + tags.forEach((t: string) => { + const topLevel = extractor(t); + if (!categoryMapping[topLevel]) { + const newCategory = { + name: topLevel, + tags: [], + }; + categoryMapping[topLevel] = newCategory; + } + + categoryMapping[topLevel].tags.push(t); + }); + + // Sort categories into alphabetical order. + const categories = + _.map(_.keys(categoryMapping).sort(), key => categoryMapping[key]); + _.forEach(categories, (category) => { + // Sort the tags within each category. + category.tags.sort(compareTagNames); + }); + return categories; + }; +} + +function splitCategorizer(r: RegExp): Categorizer { + let extractor = (t: string) => { + return t.split(r)[0]; + }; + return extractorToCategorizer(extractor); +} + +export interface CategoryDefinition { + name: string; + matches: (t: string) => boolean; +} + +export function defineCategory(ruledef: string): CategoryDefinition { + let r = new RegExp(ruledef); + let f = function(tag: string): boolean { + return r.test(tag); + }; + return {name: ruledef, matches: f}; +} + +export function _categorizer( + rules: CategoryDefinition[], fallback: Categorizer) { + return function(tags: string[]): Category[] { + let remaining: d3.Set = d3.set(tags); + let userSpecified = rules.map((def: CategoryDefinition) => { + let tags: string[] = []; + remaining.each((t: string) => { + if (def.matches(t)) { + tags.push(t); + } + }); + let cat = {name: def.name, tags: tags.sort(compareTagNames)}; + return cat; + }); + let defaultCategories = fallback(remaining.values()); + return userSpecified.concat(defaultCategories); + }; +} + +export function categorizer(s: CustomCategorization): Categorizer { + let rules = s.categoryDefinitions.map(defineCategory); + let fallback = fallbackCategorizer(s.fallbackCategorizer); + return _categorizer(rules, fallback); +}; + +Polymer({ + is: 'tf-categorizer', + properties: { + regexes: {type: Array}, + tags: {type: Array}, + categoriesAreExclusive: {type: Boolean, value: true}, + fallbackCategorizer: { + type: String, + value: 'TopLevelNamespaceCategorizer', + }, + categorizer: { + type: Object, + computed: + 'computeCategorization(regexes.*, categoriesAreExclusive, fallbackCategorizer)', + }, + categories: { + type: Array, + value: function() { + return []; + }, + notify: true, + readOnly: true + }, + }, + observers: ['recategorize(tags.*, categorizer)'], + computeCategorization: function( + regexes, categoriesAreExclusive, fallbackCategorizer) { + var categorizationStrategy = { + categoryDefinitions: regexes.base, + categoriesAreExclusive: categoriesAreExclusive, + fallbackCategorizer: fallbackCategorizer, + }; + return categorizer(categorizationStrategy); + }, + recategorize: function() { + this.debounce('tf-categorizer-recategorize', function() { + var categories = this.categorizer(this.tags); + this._setCategories(categories); + }) + }, +}); diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/tf-chart-scaffold.html b/tensorflow/tensorboard/components/tf_dashboard_common/tf-chart-scaffold.html index e2530d59716..a39fb9462ba 100644 --- a/tensorflow/tensorboard/components/tf_dashboard_common/tf-chart-scaffold.html +++ b/tensorflow/tensorboard/components/tf_dashboard_common/tf-chart-scaffold.html @@ -16,8 +16,6 @@ limitations under the License. 
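tf-categorizer.ts above builds a categorizer from user regex rules plus a fallback that groups tags by top-level namespace. Note that in this revision matched tags are not removed from `remaining`, so they also appear in the fallback categories. A sketch of the expected output, with the import path assumed and `d3`/lodash ambient as in the file itself:

```ts
import {categorizer, Category} from './tf-categorizer';  // path assumed

const categorize = categorizer({
  categoryDefinitions: ['xent'],  // regex rule -> category named 'xent'
  fallbackCategorizer: 'TopLevelNamespaceCategorizer',
});

const categories: Category[] =
    categorize(['train/xent', 'eval/xent', 'train/accuracy']);

// User-specified categories come first, then the fallback categories in
// alphabetical order. Because `remaining` is never pruned, the xent tags
// show up both under 'xent' and under their namespaces:
//   {name: 'xent',  tags: ['eval/xent', 'train/xent']}
//   {name: 'eval',  tags: ['eval/xent']}
//   {name: 'train', tags: ['train/accuracy', 'train/xent']}
```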
--> - - - + diff --git a/tensorflow/tensorboard/components/tf_globals/BUILD b/tensorflow/tensorboard/components/tf_globals/BUILD new file mode 100644 index 00000000000..c5b0cfbaa55 --- /dev/null +++ b/tensorflow/tensorboard/components/tf_globals/BUILD @@ -0,0 +1,27 @@ +package(default_visibility = ["//tensorflow/tensorboard:internal"]) + +load("//tensorflow/tensorboard/defs:defs.bzl", "tensorboard_webcomponent_library") +load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library") + +licenses(["notice"]) # Apache 2.0 + +ts_web_library( + name = "tf_globals", + srcs = [ + "globals.ts", + "tf-globals.html", + ], + path = "/tf-globals", +) + +tensorboard_webcomponent_library( + name = "legacy", + srcs = [":tf_globals"], + destdir = "tf-globals", +) + +filegroup( + name = "all_files", + srcs = glob(["**"]), + tags = ["notsan"], +) diff --git a/tensorflow/tensorboard/components/tf_globals/globals.ts b/tensorflow/tensorboard/components/tf_globals/globals.ts index 33feb26d238..fb6bb83b97f 100644 --- a/tensorflow/tensorboard/components/tf_globals/globals.ts +++ b/tensorflow/tensorboard/components/tf_globals/globals.ts @@ -13,20 +13,31 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -/* tslint:disable:no-namespace */ -module TF.Globals { +// The names of TensorBoard tabs. +export const TABS = [ + 'scalars', 'images', 'audio', 'graphs', 'distributions', 'histograms', + 'embeddings', 'text' +]; - // The names of TensorBoard tabs. - export var TABS = [ - 'scalars', 'images', 'audio', 'graphs', 'distributions', 'histograms', - 'embeddings' - ]; +// If true, TensorBoard stores its hash in the URI state. +// If false, tab switching in TensorBoard will not update location hash, +// because hash updates interfere with wct_tests. +let _useHash = false; - // If true, TensorBoard stores its hash in the URI state. - // If false, tab switching in TensorBoard will not update location hash, - // because hash updates interfere with wct_tests. - export var USE_HASH = false; - - // If USE_HASH is false, FAKE_HASH holds the hash contents. - export var FAKE_HASH = ''; +export function setUseHash(shouldUseHash: boolean): void { + _useHash = shouldUseHash; +} + +export function useHash(): boolean { + return _useHash; +} + +let _fakeHash = ''; + +export function setFakeHash(h: string) { + _fakeHash = h; +} + +export function getFakeHash() { + return _fakeHash; } diff --git a/tensorflow/tensorboard/components/tf_globals/tf-globals.html b/tensorflow/tensorboard/components/tf_globals/tf-globals.html index 952979d0be9..efb8e92e080 100644 --- a/tensorflow/tensorboard/components/tf_globals/tf-globals.html +++ b/tensorflow/tensorboard/components/tf_globals/tf-globals.html @@ -15,7 +15,5 @@ See the License for the specific language governing permissions and limitations under the License. 
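globals.ts above replaces the mutable `TF.Globals.USE_HASH` and `FAKE_HASH` variables with module-private state behind accessor functions (and adds 'text' to `TABS`). A sketch of the call-site migration, with the import path assumed:

```ts
import {getFakeHash, setFakeHash, setUseHash, useHash} from './globals';

// Before: TF.Globals.USE_HASH = false;  TF.Globals.FAKE_HASH = 'scalars';
// After: the flag and fake hash live behind setters, so state changes are
// explicit function calls rather than ad-hoc global mutation.
setUseHash(false);  // wct tests: keep tab state out of location.hash
if (!useHash()) {
  setFakeHash('scalars');
  console.log(getFakeHash());  // 'scalars'
}
```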
--> - - - + diff --git a/tensorflow/tensorboard/components/tf_graph/BUILD b/tensorflow/tensorboard/components/tf_graph/BUILD new file mode 100644 index 00000000000..4c0894f1925 --- /dev/null +++ b/tensorflow/tensorboard/components/tf_graph/BUILD @@ -0,0 +1,56 @@ +package(default_visibility = ["//tensorflow/tensorboard:internal"]) + +load("//tensorflow/tensorboard/defs:defs.bzl", "tensorboard_webcomponent_library") +load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library") + +licenses(["notice"]) # Apache 2.0 + +ts_web_library( + name = "tf_graph", + srcs = [ + "tf-graph.html", + "tf-graph-minimap.html", + "tf-graph-scene.html", + ], + path = "/tf-graph", + deps = [ + "//tensorflow/tensorboard/components/tf_dashboard_common", + "//tensorflow/tensorboard/components/tf_graph_common", + "//tensorflow/tensorboard/components/tf_imports:polymer", + "@org_polymer_iron_flex_layout", + "@org_polymer_iron_icons", + "@org_polymer_paper_button", + "@org_polymer_paper_dropdown_menu", + "@org_polymer_paper_input", + "@org_polymer_paper_menu", + "@org_polymer_paper_radio_group", + "@org_polymer_paper_toggle_button", + "@org_polymer_paper_tooltip", + ], +) + +tensorboard_webcomponent_library( + name = "legacy", + srcs = [":tf_graph"], + destdir = "tf-graph", + deps = [ + "//tensorflow/tensorboard/components/tf_dashboard_common:legacy", + "//tensorflow/tensorboard/components/tf_graph_common:legacy", + "//third_party/javascript/polymer/v1/iron-flex-layout:lib", + "//third_party/javascript/polymer/v1/iron-icons:lib", + "//third_party/javascript/polymer/v1/paper-button:lib", + "//third_party/javascript/polymer/v1/paper-dropdown-menu:lib", + "//third_party/javascript/polymer/v1/paper-input:lib", + "//third_party/javascript/polymer/v1/paper-menu:lib", + "//third_party/javascript/polymer/v1/paper-radio-group:lib", + "//third_party/javascript/polymer/v1/paper-toggle-button:lib", + "//third_party/javascript/polymer/v1/paper-tooltip:lib", + "//third_party/javascript/polymer/v1/polymer:lib", + ], +) + +filegroup( + name = "all_files", + srcs = glob(["**"]), + tags = ["notsan"], +) diff --git a/tensorflow/tensorboard/components/tf_graph/demo/BUILD b/tensorflow/tensorboard/components/tf_graph/demo/BUILD new file mode 100644 index 00000000000..02f3bf64bbc --- /dev/null +++ b/tensorflow/tensorboard/components/tf_graph/demo/BUILD @@ -0,0 +1,26 @@ +package(default_visibility = ["//tensorflow/tensorboard:internal"]) + +load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library") + +licenses(["notice"]) # Apache 2.0 + +# bazel run //third_party/tensorflow/tensorboard/components/tf_graph/demo +ts_web_library( + name = "demo", + srcs = ["index.html"] + glob(["data/**"]), + path = "/tf-graph/demo", + deps = [ + "//tensorflow/tensorboard/components/tf_graph", + "//tensorflow/tensorboard/components/tf_graph_common", + "//tensorflow/tensorboard/components/tf_graph_loader", + "//tensorflow/tensorboard/components/tf_imports:webcomponentsjs", + "@org_polymer_iron_demo_helpers", + "@org_polymer_paper_styles", + ], +) + +filegroup( + name = "all_files", + srcs = glob(["**"]), + tags = ["notsan"], +) diff --git a/tensorflow/tensorboard/components/tf_graph/demo/data/graph.pbtxt b/tensorflow/tensorboard/components/tf_graph/demo/data/graph.pbtxt new file mode 100644 index 00000000000..30b20645346 --- /dev/null +++ b/tensorflow/tensorboard/components/tf_graph/demo/data/graph.pbtxt @@ -0,0 +1,4606 @@ +node { + name: "GradientDescent/learning_rate" + op: "Const" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + 
key: "_XlaCluster" + value { + s: "cluster_3" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: 0.1 + } + } + } +} +node { + name: "gradients/add_grad/Shape_1" + op: "Const" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 1 + } + } + } + } + } + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + dim { + size: 1 + } + } + int_val: 100 + } + } + } +} +node { + name: "gradients/add_grad/Shape" + op: "Const" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 2 + } + } + } + } + } + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + dim { + size: 2 + } + } + tensor_content: "\310\000\000\000d\000\000\000" + } + } + } +} +node { + name: "gradients/add_grad/BroadcastGradientArgs" + op: "BroadcastGradientArgs" + input: "gradients/add_grad/Shape" + input: "gradients/add_grad/Shape_1" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: -1 + } + } + shape { + dim { + size: -1 + } + } + } + } + } +} +node { + name: "gradients/add_1_grad/Shape_1" + op: "Const" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 1 + } + } + } + } + } + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + dim { + size: 1 + } + } + int_val: 10 + } + } + } +} +node { + name: "gradients/add_1_grad/Shape" + op: "Const" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 2 + } + } + } + } + } + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + dim { + size: 2 + } + } + tensor_content: "\310\000\000\000\n\000\000\000" + } + } + } +} +node { + name: "gradients/add_1_grad/BroadcastGradientArgs" + op: "BroadcastGradientArgs" + input: "gradients/add_1_grad/Shape" + input: "gradients/add_1_grad/Shape_1" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: -1 + } + } + shape { + dim { + size: -1 + } + } + } + } + } +} +node { + name: "gradients/Reshape_1_grad/Shape" + op: "Const" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 2 + } + } + } 
+ } + } + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + dim { + size: 2 + } + } + tensor_content: "\310\000\000\000\n\000\000\000" + } + } + } +} +node { + name: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim" + op: "Const" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + } + int_val: -1 + } + } + } +} +node { + name: "gradients/Reshape_3_grad/Shape" + op: "Const" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 1 + } + } + } + } + } + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + dim { + size: 1 + } + } + int_val: 200 + } + } + } +} +node { + name: "gradients/Mean_grad/Maximum/y" + op: "Const" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + } + int_val: 1 + } + } + } +} +node { + name: "gradients/Mean_grad/Const_1" + op: "Const" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 1 + } + } + } + } + } + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + dim { + size: 1 + } + } + int_val: 0 + } + } + } +} +node { + name: "gradients/Mean_grad/Const" + op: "Const" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 1 + } + } + } + } + } + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + dim { + size: 1 + } + } + int_val: 0 + } + } + } +} +node { + name: "gradients/Mean_grad/Shape_1" + op: "Const" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + } + } + } + } + } + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + dim { + } + } + } + } + } +} +node { + name: "gradients/Mean_grad/Prod_1" + op: "Prod" + input: "gradients/Mean_grad/Shape_1" + input: "gradients/Mean_grad/Const_1" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "Tidx" + value { + type: DT_INT32 + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "keep_dims" + value { + b: false + } + } +} +node { + 
name: "gradients/Mean_grad/Maximum" + op: "Maximum" + input: "gradients/Mean_grad/Prod_1" + input: "gradients/Mean_grad/Maximum/y" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } +} +node { + name: "gradients/Mean_grad/Shape" + op: "Const" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 1 + } + } + } + } + } + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + dim { + size: 1 + } + } + int_val: 200 + } + } + } +} +node { + name: "gradients/Mean_grad/Prod" + op: "Prod" + input: "gradients/Mean_grad/Shape" + input: "gradients/Mean_grad/Const" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "Tidx" + value { + type: DT_INT32 + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "keep_dims" + value { + b: false + } + } +} +node { + name: "gradients/Mean_grad/floordiv" + op: "FloorDiv" + input: "gradients/Mean_grad/Prod" + input: "gradients/Mean_grad/Maximum" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } +} +node { + name: "gradients/Mean_grad/Cast" + op: "Cast" + input: "gradients/Mean_grad/floordiv" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "DstT" + value { + type: DT_FLOAT + } + } + attr { + key: "SrcT" + value { + type: DT_INT32 + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } +} +node { + name: "gradients/Mean_grad/Tile/multiples" + op: "Const" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 1 + } + } + } + } + } + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + dim { + size: 1 + } + } + int_val: 200 + } + } + } +} +node { + name: "gradients/Mean_grad/Reshape/shape" + op: "Const" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 1 + } + } + } + } + } + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + dim { + size: 1 + } + } + int_val: 1 + } + } + } +} +node { + name: "gradients/Const" + op: "Const" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + 
tensor_shape { + } + float_val: 1 + } + } + } +} +node { + name: "gradients/Shape" + op: "Const" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + } + } + } + } + } + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + dim { + } + } + } + } + } +} +node { + name: "gradients/Fill" + op: "Fill" + input: "gradients/Shape" + input: "gradients/Const" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } +} +node { + name: "gradients/Mean_grad/Reshape" + op: "Reshape" + input: "gradients/Fill" + input: "gradients/Mean_grad/Reshape/shape" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "Tshape" + value { + type: DT_INT32 + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 1 + } + } + } + } + } +} +node { + name: "gradients/Mean_grad/Tile" + op: "Tile" + input: "gradients/Mean_grad/Reshape" + input: "gradients/Mean_grad/Tile/multiples" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "Tmultiples" + value { + type: DT_INT32 + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 200 + } + } + } + } + } +} +node { + name: "gradients/Mean_grad/truediv" + op: "RealDiv" + input: "gradients/Mean_grad/Tile" + input: "gradients/Mean_grad/Cast" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 200 + } + } + } + } + } +} +node { + name: "gradients/Reshape_3_grad/Reshape" + op: "Reshape" + input: "gradients/Mean_grad/truediv" + input: "gradients/Reshape_3_grad/Shape" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "Tshape" + value { + type: DT_INT32 + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 200 + } + } + } + } + } +} +node { + name: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims" + op: "ExpandDims" + input: "gradients/Reshape_3_grad/Reshape" + input: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "Tdim" + value { + type: DT_INT32 + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 200 + } + dim { + size: 1 + } + } + } + } + } +} +node { + name: "Const" + op: "Const" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 1 + } + } + 
} + } + } + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + dim { + size: 1 + } + } + int_val: 0 + } + } + } +} +node { + name: "Slice_2/begin" + op: "Const" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 1 + } + } + } + } + } + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + dim { + size: 1 + } + } + int_val: 0 + } + } + } +} +node { + name: "Sub_2/y" + op: "Const" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + } + int_val: 1 + } + } + } +} +node { + name: "concat_1/axis" + op: "Const" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + } + int_val: 0 + } + } + } +} +node { + name: "concat_1/values_0" + op: "Const" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 1 + } + } + } + } + } + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + dim { + size: 1 + } + } + int_val: -1 + } + } + } +} +node { + name: "Slice_1/size" + op: "Const" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 1 + } + } + } + } + } + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + dim { + size: 1 + } + } + int_val: 1 + } + } + } +} +node { + name: "Sub_1/y" + op: "Const" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + } + int_val: 1 + } + } + } +} +node { + name: "Shape_2" + op: "Const" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 2 + } + } + } + } + } + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + dim { + size: 2 + } + } + tensor_content: "\310\000\000\000\n\000\000\000" + } + } + } +} +node { + name: "Rank_2" + op: "Const" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + 
value { + list { + shape { + } + } + } + } + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + } + int_val: 2 + } + } + } +} +node { + name: "Sub_1" + op: "Sub" + input: "Rank_2" + input: "Sub_1/y" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } +} +node { + name: "Slice_1/begin" + op: "Pack" + input: "Sub_1" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "N" + value { + i: 1 + } + } + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 1 + } + } + } + } + } + attr { + key: "axis" + value { + i: 0 + } + } +} +node { + name: "Slice_1" + op: "Slice" + input: "Shape_2" + input: "Slice_1/begin" + input: "Slice_1/size" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "Index" + value { + type: DT_INT32 + } + } + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 1 + } + } + } + } + } +} +node { + name: "concat_1" + op: "ConcatV2" + input: "concat_1/values_0" + input: "Slice_1" + input: "concat_1/axis" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "N" + value { + i: 2 + } + } + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "Tidx" + value { + type: DT_INT32 + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 2 + } + } + } + } + } +} +node { + name: "concat/axis" + op: "Const" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + } + int_val: 0 + } + } + } +} +node { + name: "concat/values_0" + op: "Const" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 1 + } + } + } + } + } + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + dim { + size: 1 + } + } + int_val: -1 + } + } + } +} +node { + name: "Slice/size" + op: "Const" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 1 + } + } + } + } + } + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + dim { + size: 1 + } + } + int_val: 1 + } + } + } +} +node { + name: "Sub/y" + op: "Const" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: 
"dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + } + int_val: 1 + } + } + } +} +node { + name: "Shape_1" + op: "Const" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 2 + } + } + } + } + } + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + dim { + size: 2 + } + } + tensor_content: "\310\000\000\000\n\000\000\000" + } + } + } +} +node { + name: "Rank_1" + op: "Const" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + } + int_val: 2 + } + } + } +} +node { + name: "Sub" + op: "Sub" + input: "Rank_1" + input: "Sub/y" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } +} +node { + name: "Slice/begin" + op: "Pack" + input: "Sub" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "N" + value { + i: 1 + } + } + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 1 + } + } + } + } + } + attr { + key: "axis" + value { + i: 0 + } + } +} +node { + name: "Slice" + op: "Slice" + input: "Shape_1" + input: "Slice/begin" + input: "Slice/size" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "Index" + value { + type: DT_INT32 + } + } + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 1 + } + } + } + } + } +} +node { + name: "concat" + op: "ConcatV2" + input: "concat/values_0" + input: "Slice" + input: "concat/axis" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "N" + value { + i: 2 + } + } + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "Tidx" + value { + type: DT_INT32 + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 2 + } + } + } + } + } +} +node { + name: "Shape" + op: "Const" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 2 + } + } + } + } + } + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + dim { + size: 2 + } + } + tensor_content: "\310\000\000\000\n\000\000\000" + } + } + } +} +node { + name: "Rank" + op: "Const" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "dtype" + value { + type: 
DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + } + int_val: 2 + } + } + } +} +node { + name: "Sub_2" + op: "Sub" + input: "Rank" + input: "Sub_2/y" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } +} +node { + name: "Slice_2/size" + op: "Pack" + input: "Sub_2" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "N" + value { + i: 1 + } + } + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 1 + } + } + } + } + } + attr { + key: "axis" + value { + i: 0 + } + } +} +node { + name: "Slice_2" + op: "Slice" + input: "Shape" + input: "Slice_2/begin" + input: "Slice_2/size" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "Index" + value { + type: DT_INT32 + } + } + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 1 + } + } + } + } + } +} +node { + name: "logits_biases" + op: "VariableV2" + device: "/job:localhost/replica:0/task:0/cpu:0" + attr { + key: "_class" + value { + list { + s: "loc:@logits_biases" + } + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 10 + } + } + } + } + } + attr { + key: "container" + value { + s: "" + } + } + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "shape" + value { + shape { + dim { + size: 10 + } + } + } + } + attr { + key: "shared_name" + value { + s: "" + } + } +} +node { + name: "logits_biases/read" + op: "Identity" + input: "logits_biases" + device: "/job:localhost/replica:0/task:0/cpu:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_class" + value { + list { + s: "loc:@logits_biases" + } + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 10 + } + } + } + } + } +} +node { + name: "logits_weights" + op: "VariableV2" + device: "/job:localhost/replica:0/task:0/cpu:0" + attr { + key: "_class" + value { + list { + s: "loc:@logits_weights" + } + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 100 + } + dim { + size: 10 + } + } + } + } + } + attr { + key: "container" + value { + s: "" + } + } + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "shape" + value { + shape { + dim { + size: 100 + } + dim { + size: 10 + } + } + } + } + attr { + key: "shared_name" + value { + s: "" + } + } +} +node { + name: "logits_weights/read" + op: "Identity" + input: "logits_weights" + device: "/job:localhost/replica:0/task:0/cpu:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_class" + value { + list { + s: "loc:@logits_weights" + } + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 100 + } + dim { + size: 10 + } + } + } + } + } +} +node { + name: "hidden_biases" + op: "VariableV2" + device: "/job:localhost/replica:0/task:0/cpu:0" + attr { + key: "_class" + value { + list { + s: "loc:@hidden_biases" + } + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 100 + } + } + } + } + } + attr { + key: "container" + 
value { + s: "" + } + } + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "shape" + value { + shape { + dim { + size: 100 + } + } + } + } + attr { + key: "shared_name" + value { + s: "" + } + } +} +node { + name: "hidden_biases/read" + op: "Identity" + input: "hidden_biases" + device: "/job:localhost/replica:0/task:0/cpu:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_class" + value { + list { + s: "loc:@hidden_biases" + } + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 100 + } + } + } + } + } +} +node { + name: "hidden_weights" + op: "VariableV2" + device: "/job:localhost/replica:0/task:0/cpu:0" + attr { + key: "_class" + value { + list { + s: "loc:@hidden_weights" + } + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 784 + } + dim { + size: 100 + } + } + } + } + } + attr { + key: "container" + value { + s: "" + } + } + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "shape" + value { + shape { + dim { + size: 784 + } + dim { + size: 100 + } + } + } + } + attr { + key: "shared_name" + value { + s: "" + } + } +} +node { + name: "hidden_weights/read" + op: "Identity" + input: "hidden_weights" + device: "/job:localhost/replica:0/task:0/cpu:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_class" + value { + list { + s: "loc:@hidden_weights" + } + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 784 + } + dim { + size: 100 + } + } + } + } + } +} +node { + name: "Reshape/shape" + op: "Const" + device: "/job:localhost/replica:0/task:0/cpu:0" + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 2 + } + } + } + } + } + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + dim { + size: 2 + } + } + tensor_content: "\310\000\000\000\377\377\377\377" + } + } + } +} +node { + name: "mnist_dataset_train_2/one_hot/depth" + op: "Const" + device: "/job:localhost/replica:0/task:0/cpu:0" + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + } + int_val: 10 + } + } + } +} +node { + name: "mnist_dataset_train_2/one_hot/off_value" + op: "Const" + device: "/job:localhost/replica:0/task:0/cpu:0" + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: 0 + } + } + } +} +node { + name: "mnist_dataset_train_2/one_hot/on_value" + op: "Const" + device: "/job:localhost/replica:0/task:0/cpu:0" + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: 1 + } + } + } +} +node { + name: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany/n" + op: "Const" + device: "/job:localhost/replica:0/task:0/cpu:0" + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + } + int_val: 200 + } + } + } +} +node { + name: 
"mnist_dataset_train_1/random_shuffle_queue" + op: "RandomShuffleQueueV2" + device: "/job:localhost/replica:0/task:0/cpu:0" + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "capacity" + value { + i: 20000 + } + } + attr { + key: "component_types" + value { + list { + type: DT_FLOAT + type: DT_INT64 + } + } + } + attr { + key: "container" + value { + s: "" + } + } + attr { + key: "min_after_dequeue" + value { + i: 4000 + } + } + attr { + key: "seed" + value { + i: 0 + } + } + attr { + key: "seed2" + value { + i: 0 + } + } + attr { + key: "shapes" + value { + list { + shape { + dim { + size: 28 + } + dim { + size: 28 + } + dim { + size: 1 + } + } + shape { + } + } + } + } + attr { + key: "shared_name" + value { + s: "" + } + } +} +node { + name: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany" + op: "QueueDequeueManyV2" + input: "mnist_dataset_train_1/random_shuffle_queue" + input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany/n" + device: "/job:localhost/replica:0/task:0/cpu:0" + attr { + key: "_output_shapes" + value { + list { + shape { + unknown_rank: true + } + shape { + unknown_rank: true + } + } + } + } + attr { + key: "component_types" + value { + list { + type: DT_FLOAT + type: DT_INT64 + } + } + } + attr { + key: "timeout_ms" + value { + i: -1 + } + } +} +node { + name: "Reshape" + op: "Reshape" + input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany" + input: "Reshape/shape" + device: "/job:localhost/replica:0/task:0/cpu:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "Tshape" + value { + type: DT_INT32 + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 200 + } + dim { + size: -1 + } + } + } + } + } +} +node { + name: "MatMul" + op: "MatMul" + input: "Reshape" + input: "hidden_weights/read" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 200 + } + dim { + size: 100 + } + } + } + } + } + attr { + key: "transpose_a" + value { + b: false + } + } + attr { + key: "transpose_b" + value { + b: false + } + } +} +node { + name: "add" + op: "Add" + input: "MatMul" + input: "hidden_biases/read" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 200 + } + dim { + size: 100 + } + } + } + } + } +} +node { + name: "Relu" + op: "Relu" + input: "add" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 200 + } + dim { + size: 100 + } + } + } + } + } +} +node { + name: "MatMul_1" + op: "MatMul" + input: "Relu" + input: "logits_weights/read" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 200 + } + dim { + size: 10 + } + } + } + } + } + attr { + key: "transpose_a" + value { + b: false + } + } + attr { + key: "transpose_b" + value { + b: false + } + 
} +} +node { + name: "add_1" + op: "Add" + input: "MatMul_1" + input: "logits_biases/read" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 200 + } + dim { + size: 10 + } + } + } + } + } +} +node { + name: "Reshape_1" + op: "Reshape" + input: "add_1" + input: "concat" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "Tshape" + value { + type: DT_INT32 + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 200 + } + dim { + size: 10 + } + } + } + } + } +} +node { + name: "mnist_dataset_train_2/one_hot" + op: "OneHot" + input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany:1" + input: "mnist_dataset_train_2/one_hot/depth" + input: "mnist_dataset_train_2/one_hot/on_value" + input: "mnist_dataset_train_2/one_hot/off_value" + device: "/job:localhost/replica:0/task:0/cpu:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "TI" + value { + type: DT_INT64 + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + unknown_rank: true + } + } + } + } + attr { + key: "axis" + value { + i: -1 + } + } +} +node { + name: "Reshape_2" + op: "Reshape" + input: "mnist_dataset_train_2/one_hot" + input: "concat_1" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "Tshape" + value { + type: DT_INT32 + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: -1 + } + dim { + size: 10 + } + } + } + } + } +} +node { + name: "SoftmaxCrossEntropyWithLogits" + op: "SoftmaxCrossEntropyWithLogits" + input: "Reshape_1" + input: "Reshape_2" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 200 + } + } + shape { + dim { + size: 200 + } + dim { + size: 10 + } + } + } + } + } +} +node { + name: "gradients/SoftmaxCrossEntropyWithLogits_grad/PreventGradient" + op: "PreventGradient" + input: "SoftmaxCrossEntropyWithLogits:1" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 200 + } + dim { + size: 10 + } + } + } + } + } + attr { + key: "message" + value { + s: "Currently there is no way to take the second derivative of softmax_cross_entropy_with_logits due to the fused implementation\'s interaction with tf.gradients()" + } + } +} +node { + name: "gradients/SoftmaxCrossEntropyWithLogits_grad/mul" + op: "Mul" + input: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims" + input: "gradients/SoftmaxCrossEntropyWithLogits_grad/PreventGradient" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 200 + } + dim { + size: 
10 + } + } + } + } + } +} +node { + name: "gradients/Reshape_1_grad/Reshape" + op: "Reshape" + input: "gradients/SoftmaxCrossEntropyWithLogits_grad/mul" + input: "gradients/Reshape_1_grad/Shape" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "Tshape" + value { + type: DT_INT32 + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 200 + } + dim { + size: 10 + } + } + } + } + } +} +node { + name: "gradients/add_1_grad/Sum_1" + op: "Sum" + input: "gradients/Reshape_1_grad/Reshape" + input: "gradients/add_1_grad/BroadcastGradientArgs:1" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "Tidx" + value { + type: DT_INT32 + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 10 + } + } + } + } + } + attr { + key: "keep_dims" + value { + b: false + } + } +} +node { + name: "gradients/add_1_grad/Reshape_1" + op: "Reshape" + input: "gradients/add_1_grad/Sum_1" + input: "gradients/add_1_grad/Shape_1" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "Tshape" + value { + type: DT_INT32 + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 10 + } + } + } + } + } +} +node { + name: "gradients/add_1_grad/Sum" + op: "Sum" + input: "gradients/Reshape_1_grad/Reshape" + input: "gradients/add_1_grad/BroadcastGradientArgs" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "Tidx" + value { + type: DT_INT32 + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 200 + } + dim { + size: 10 + } + } + } + } + } + attr { + key: "keep_dims" + value { + b: false + } + } +} +node { + name: "gradients/add_1_grad/Reshape" + op: "Reshape" + input: "gradients/add_1_grad/Sum" + input: "gradients/add_1_grad/Shape" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "Tshape" + value { + type: DT_INT32 + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 200 + } + dim { + size: 10 + } + } + } + } + } +} +node { + name: "gradients/add_1_grad/tuple/group_deps" + op: "NoOp" + input: "^gradients/add_1_grad/Reshape" + input: "^gradients/add_1_grad/Reshape_1" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + } + } + } +} +node { + name: "gradients/add_1_grad/tuple/control_dependency_1" + op: "Identity" + input: "gradients/add_1_grad/Reshape_1" + input: "^gradients/add_1_grad/tuple/group_deps" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_class" + value { + list { + s: "loc:@gradients/add_1_grad/Reshape_1" + } + } + } + attr { + key: "_output_shapes" + value { + list { + shape { 
+ dim { + size: 10 + } + } + } + } + } +} +node { + name: "GradientDescent/update_logits_biases/ApplyGradientDescent" + op: "ApplyGradientDescent" + input: "logits_biases" + input: "GradientDescent/learning_rate" + input: "gradients/add_1_grad/tuple/control_dependency_1" + device: "/job:localhost/replica:0/task:0/cpu:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_class" + value { + list { + s: "loc:@logits_biases" + } + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 10 + } + } + } + } + } + attr { + key: "use_locking" + value { + b: false + } + } +} +node { + name: "gradients/add_1_grad/tuple/control_dependency" + op: "Identity" + input: "gradients/add_1_grad/Reshape" + input: "^gradients/add_1_grad/tuple/group_deps" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_class" + value { + list { + s: "loc:@gradients/add_1_grad/Reshape" + } + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 200 + } + dim { + size: 10 + } + } + } + } + } +} +node { + name: "gradients/MatMul_1_grad/MatMul_1" + op: "MatMul" + input: "Relu" + input: "gradients/add_1_grad/tuple/control_dependency" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 100 + } + dim { + size: 10 + } + } + } + } + } + attr { + key: "transpose_a" + value { + b: true + } + } + attr { + key: "transpose_b" + value { + b: false + } + } +} +node { + name: "gradients/MatMul_1_grad/MatMul" + op: "MatMul" + input: "gradients/add_1_grad/tuple/control_dependency" + input: "logits_weights/read" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 200 + } + dim { + size: 100 + } + } + } + } + } + attr { + key: "transpose_a" + value { + b: false + } + } + attr { + key: "transpose_b" + value { + b: true + } + } +} +node { + name: "gradients/MatMul_1_grad/tuple/group_deps" + op: "NoOp" + input: "^gradients/MatMul_1_grad/MatMul" + input: "^gradients/MatMul_1_grad/MatMul_1" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + } + } + } +} +node { + name: "gradients/MatMul_1_grad/tuple/control_dependency_1" + op: "Identity" + input: "gradients/MatMul_1_grad/MatMul_1" + input: "^gradients/MatMul_1_grad/tuple/group_deps" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_class" + value { + list { + s: "loc:@gradients/MatMul_1_grad/MatMul_1" + } + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 100 + } + dim { + size: 10 + } + } + } + } + } +} +node { + name: "GradientDescent/update_logits_weights/ApplyGradientDescent" + op: "ApplyGradientDescent" + input: "logits_weights" + input: "GradientDescent/learning_rate" + input: "gradients/MatMul_1_grad/tuple/control_dependency_1" + device: 
"/job:localhost/replica:0/task:0/cpu:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_class" + value { + list { + s: "loc:@logits_weights" + } + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 100 + } + dim { + size: 10 + } + } + } + } + } + attr { + key: "use_locking" + value { + b: false + } + } +} +node { + name: "gradients/MatMul_1_grad/tuple/control_dependency" + op: "Identity" + input: "gradients/MatMul_1_grad/MatMul" + input: "^gradients/MatMul_1_grad/tuple/group_deps" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_class" + value { + list { + s: "loc:@gradients/MatMul_1_grad/MatMul" + } + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 200 + } + dim { + size: 100 + } + } + } + } + } +} +node { + name: "gradients/Relu_grad/ReluGrad" + op: "ReluGrad" + input: "gradients/MatMul_1_grad/tuple/control_dependency" + input: "Relu" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 200 + } + dim { + size: 100 + } + } + } + } + } +} +node { + name: "gradients/add_grad/Sum_1" + op: "Sum" + input: "gradients/Relu_grad/ReluGrad" + input: "gradients/add_grad/BroadcastGradientArgs:1" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "Tidx" + value { + type: DT_INT32 + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 100 + } + } + } + } + } + attr { + key: "keep_dims" + value { + b: false + } + } +} +node { + name: "gradients/add_grad/Reshape_1" + op: "Reshape" + input: "gradients/add_grad/Sum_1" + input: "gradients/add_grad/Shape_1" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "Tshape" + value { + type: DT_INT32 + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 100 + } + } + } + } + } +} +node { + name: "gradients/add_grad/Sum" + op: "Sum" + input: "gradients/Relu_grad/ReluGrad" + input: "gradients/add_grad/BroadcastGradientArgs" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "Tidx" + value { + type: DT_INT32 + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 200 + } + dim { + size: 100 + } + } + } + } + } + attr { + key: "keep_dims" + value { + b: false + } + } +} +node { + name: "gradients/add_grad/Reshape" + op: "Reshape" + input: "gradients/add_grad/Sum" + input: "gradients/add_grad/Shape" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "Tshape" + value { + type: DT_INT32 + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 200 + } + dim { + size: 100 + } + } + } + } + } +} +node { + name: 
"gradients/add_grad/tuple/group_deps" + op: "NoOp" + input: "^gradients/add_grad/Reshape" + input: "^gradients/add_grad/Reshape_1" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + } + } + } +} +node { + name: "gradients/add_grad/tuple/control_dependency_1" + op: "Identity" + input: "gradients/add_grad/Reshape_1" + input: "^gradients/add_grad/tuple/group_deps" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_class" + value { + list { + s: "loc:@gradients/add_grad/Reshape_1" + } + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 100 + } + } + } + } + } +} +node { + name: "GradientDescent/update_hidden_biases/ApplyGradientDescent" + op: "ApplyGradientDescent" + input: "hidden_biases" + input: "GradientDescent/learning_rate" + input: "gradients/add_grad/tuple/control_dependency_1" + device: "/job:localhost/replica:0/task:0/cpu:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_class" + value { + list { + s: "loc:@hidden_biases" + } + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 100 + } + } + } + } + } + attr { + key: "use_locking" + value { + b: false + } + } +} +node { + name: "gradients/add_grad/tuple/control_dependency" + op: "Identity" + input: "gradients/add_grad/Reshape" + input: "^gradients/add_grad/tuple/group_deps" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_class" + value { + list { + s: "loc:@gradients/add_grad/Reshape" + } + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 200 + } + dim { + size: 100 + } + } + } + } + } +} +node { + name: "gradients/MatMul_grad/MatMul_1" + op: "MatMul" + input: "Reshape" + input: "gradients/add_grad/tuple/control_dependency" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: -1 + } + dim { + size: 100 + } + } + } + } + } + attr { + key: "transpose_a" + value { + b: true + } + } + attr { + key: "transpose_b" + value { + b: false + } + } +} +node { + name: "gradients/MatMul_grad/MatMul" + op: "MatMul" + input: "gradients/add_grad/tuple/control_dependency" + input: "hidden_weights/read" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 200 + } + dim { + size: 784 + } + } + } + } + } + attr { + key: "transpose_a" + value { + b: false + } + } + attr { + key: "transpose_b" + value { + b: true + } + } +} +node { + name: "gradients/MatMul_grad/tuple/group_deps" + op: "NoOp" + input: "^gradients/MatMul_grad/MatMul" + input: "^gradients/MatMul_grad/MatMul_1" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + } + } + } +} +node { + name: 
"gradients/MatMul_grad/tuple/control_dependency_1" + op: "Identity" + input: "gradients/MatMul_grad/MatMul_1" + input: "^gradients/MatMul_grad/tuple/group_deps" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_class" + value { + list { + s: "loc:@gradients/MatMul_grad/MatMul_1" + } + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: -1 + } + dim { + size: 100 + } + } + } + } + } +} +node { + name: "GradientDescent/update_hidden_weights/ApplyGradientDescent" + op: "ApplyGradientDescent" + input: "hidden_weights" + input: "GradientDescent/learning_rate" + input: "gradients/MatMul_grad/tuple/control_dependency_1" + device: "/job:localhost/replica:0/task:0/cpu:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_class" + value { + list { + s: "loc:@hidden_weights" + } + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 784 + } + dim { + size: 100 + } + } + } + } + } + attr { + key: "use_locking" + value { + b: false + } + } +} +node { + name: "GradientDescent" + op: "NoOp" + input: "^GradientDescent/update_hidden_weights/ApplyGradientDescent" + input: "^GradientDescent/update_hidden_biases/ApplyGradientDescent" + input: "^GradientDescent/update_logits_weights/ApplyGradientDescent" + input: "^GradientDescent/update_logits_biases/ApplyGradientDescent" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "_XlaCluster" + value { + s: "cluster_2" + } + } + attr { + key: "_output_shapes" + value { + list { + } + } + } +} +node { + name: "Reshape_3" + op: "Reshape" + input: "SoftmaxCrossEntropyWithLogits" + input: "Slice_2" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "Tshape" + value { + type: DT_INT32 + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 200 + } + } + } + } + } +} +node { + name: "Mean" + op: "Mean" + input: "Reshape_3" + input: "Const" + device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "Tidx" + value { + type: DT_INT32 + } + } + attr { + key: "_XlaCluster" + value { + s: "cluster_1" + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "keep_dims" + value { + b: false + } + } +} +node { + name: "_send_Mean_0" + op: "_Send" + input: "Mean" + device: "/job:localhost/replica:0/task:0/cpu:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "client_terminated" + value { + b: true + } + } + attr { + key: "recv_device" + value { + s: "/job:localhost/replica:0/task:0/cpu:0" + } + } + attr { + key: "send_device" + value { + s: "/job:localhost/replica:0/task:0/cpu:0" + } + } + attr { + key: "send_device_incarnation" + value { + i: -5924635994370253548 + } + } + attr { + key: "tensor_name" + value { + s: "Mean:0" + } + } +} +library { +} +versions { + producer: 21 +} diff --git a/tensorflow/tensorboard/components/tf_graph/demo/demo_datasets.json b/tensorflow/tensorboard/components/tf_graph/demo/demo_datasets.json deleted file mode 100644 index f5ca9aada79..00000000000 --- a/tensorflow/tensorboard/components/tf_graph/demo/demo_datasets.json +++ /dev/null @@ -1,123 +0,0 @@ -[ - { - "name": "Mnist Eval", - 
"path": "mnist_eval.pbtxt" - }, - { - "name": "Mnist with summaries (+stats)", - "path": "mnist_with_summaries.pbtxt", - "runMetadata": [ - { - "tag": "step100", - "path": "mnist_with_summaries_step100.pbtxt" - }, - { - "tag": "step1000", - "path": "mnist_with_summaries_step1000.pbtxt" - } - ] - }, - { - "name": "Mnist Train (with shapes)", - "path": "mnist_train_shapes.pbtxt" - }, - { - "name": "Inception Train (huge)", - "path": "inception_train.pbtxt" - }, - { - "name": "Inception Train Eval", - "path": "inception_train_eval.pbtxt" - }, - { - "name": "Inception Test", - "path": "inception_test_eval.pbtxt" - }, - { - "name": "PTB Word LSTM Train", - "path": "ptb_word_lstm_train.pbtxt" - }, - { - "name": "PTB Word LSTM Train Eval", - "path": "ptb_word_lstm_train_eval.pbtxt" - }, - { - "name": "PTB Word LSTM Test", - "path": "ptb_word_lstm_test_eval.pbtxt" - }, - { - "name": "Cifar10 Train (+stats)", - "path": "cifar10_train.pbtxt", - "runMetadata": [ - { - "tag": "step0", - "path": "cifar10_train_step0.pbtxt" - }, - { - "tag": "step100", - "path": "cifar10_train_step100.pbtxt" - }, - { - "tag": "step200", - "path": "cifar10_train_step200.pbtxt" - }, - { - "tag": "step300", - "path": "cifar10_train_step300.pbtxt" - } - ] - }, - { - "name": "Cifar10 Multi-GPU Train", - "path": "cifar10_multi_gpu_train.pbtxt" - }, - { - "name": "Cifar10 Eval (+stats)", - "path": "cifar10_eval.pbtxt", - "runMetadata": [ - { - "tag": "step0", - "path": "cifar10_eval_step0.pbtxt" - }, - { - "tag": "step10", - "path": "cifar10_eval_step10.pbtxt" - }, - { - "tag": "step20", - "path": "cifar10_eval_step20.pbtxt" - } - ] - }, - { - "name": "Fatcat LSTM", - "path": "fatcat_lstm.pbtxt" - }, - { - "name": "Legacy Inception Renamed", - "path": "legacy_inception_renamed.pbtxt" - }, - { - "name": "Wolfe (Broken)", - "path": "wolfe1.pbtxt" - }, - { - "name": "Wolfe (Fixed)", - "path": "wolfe2.pbtxt" - }, - { - "id": "alex", - "name": "AlexNet", - "path": "alexnet.pbtxt" - }, - { - "id": "alexprivate", - "name": "AlexNet Private", - "path": "alexnet.pbtxt", - "private": true - }, - { - "name": "TestError404", - "path": "nofile" - } -] diff --git a/tensorflow/tensorboard/components/tf_graph/demo/index.html b/tensorflow/tensorboard/components/tf_graph/demo/index.html index c89490f44d4..52e2f0b9340 100644 --- a/tensorflow/tensorboard/components/tf_graph/demo/index.html +++ b/tensorflow/tensorboard/components/tf_graph/demo/index.html @@ -15,32 +15,78 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. --> + + + + + +TF Graph Demo + + + diff --git a/tensorflow/tensorboard/components/tf_graph_info/tf-node-info.html b/tensorflow/tensorboard/components/tf_graph_info/tf-node-info.html index 1e60cda66ad..66a3034b5b2 100644 --- a/tensorflow/tensorboard/components/tf_graph_info/tf-node-info.html +++ b/tensorflow/tensorboard/components/tf_graph_info/tf-node-info.html @@ -19,9 +19,10 @@ limitations under the License. - + + - + @@ -315,7 +316,7 @@ limitations under the License.
[[_nodeStatsFormattedBytes]]
- + + + diff --git a/tensorflow/tensorboard/components/tf_imports/dagre.html b/tensorflow/tensorboard/components/tf_imports/dagre.html index 48fe39da793..b90dc58e390 100644 --- a/tensorflow/tensorboard/components/tf_imports/dagre.html +++ b/tensorflow/tensorboard/components/tf_imports/dagre.html @@ -16,9 +16,30 @@ limitations under the License. --> - - - + + + + + diff --git a/tensorflow/tensorboard/components/tf_imports/graphlib.html b/tensorflow/tensorboard/components/tf_imports/graphlib.html index 4e19f7b008f..664b855f17f 100644 --- a/tensorflow/tensorboard/components/tf_imports/graphlib.html +++ b/tensorflow/tensorboard/components/tf_imports/graphlib.html @@ -15,5 +15,6 @@ See the License for the specific language governing permissions and limitations under the License. --> - - + + + diff --git a/tensorflow/tensorboard/components/tf_imports/lodash.html b/tensorflow/tensorboard/components/tf_imports/lodash.html index f92aa808799..65ff6a4b032 100644 --- a/tensorflow/tensorboard/components/tf_imports/lodash.html +++ b/tensorflow/tensorboard/components/tf_imports/lodash.html @@ -15,4 +15,4 @@ See the License for the specific language governing permissions and limitations under the License. --> - + diff --git a/tensorflow/tensorboard/components/tf_imports/numericjs.html b/tensorflow/tensorboard/components/tf_imports/numericjs.html new file mode 100644 index 00000000000..81fa9491688 --- /dev/null +++ b/tensorflow/tensorboard/components/tf_imports/numericjs.html @@ -0,0 +1,43 @@ + + + + + diff --git a/tensorflow/tensorboard/components/tf_imports/plottable.html b/tensorflow/tensorboard/components/tf_imports/plottable.html index 57f9c1d6d3a..77ad544d5a0 100644 --- a/tensorflow/tensorboard/components/tf_imports/plottable.html +++ b/tensorflow/tensorboard/components/tf_imports/plottable.html @@ -15,6 +15,30 @@ See the License for the specific language governing permissions and limitations under the License. --> + + - - + + diff --git a/tensorflow/tensorboard/components/tf_imports/threejs.html b/tensorflow/tensorboard/components/tf_imports/threejs.html new file mode 100644 index 00000000000..7f4233b5713 --- /dev/null +++ b/tensorflow/tensorboard/components/tf_imports/threejs.html @@ -0,0 +1,43 @@ + + + + + + diff --git a/tensorflow/tensorboard/components/tf_imports/weblas.html b/tensorflow/tensorboard/components/tf_imports/weblas.html new file mode 100644 index 00000000000..c07020598fc --- /dev/null +++ b/tensorflow/tensorboard/components/tf_imports/weblas.html @@ -0,0 +1,42 @@ + + + + + diff --git a/tensorflow/tensorboard/components/tf_imports_google/README.md b/tensorflow/tensorboard/components/tf_imports_google/README.md deleted file mode 100644 index 60d9cce777b..00000000000 --- a/tensorflow/tensorboard/components/tf_imports_google/README.md +++ /dev/null @@ -1,3 +0,0 @@ -This file acts as import routers for third party javascript libraries, -e.g. Plottable and D3 from `g3/third_party`; it exists to facilitate development -inside google. 
diff --git a/tensorflow/tensorboard/components/tf_imports_google/d3.html b/tensorflow/tensorboard/components/tf_imports_google/d3.html deleted file mode 100644 index dbfd11aa87e..00000000000 --- a/tensorflow/tensorboard/components/tf_imports_google/d3.html +++ /dev/null @@ -1,18 +0,0 @@ - - - diff --git a/tensorflow/tensorboard/components/tf_imports_google/dagre.html b/tensorflow/tensorboard/components/tf_imports_google/dagre.html deleted file mode 100644 index 5b8b9817410..00000000000 --- a/tensorflow/tensorboard/components/tf_imports_google/dagre.html +++ /dev/null @@ -1,18 +0,0 @@ - - - diff --git a/tensorflow/tensorboard/components/tf_imports_google/graphlib.html b/tensorflow/tensorboard/components/tf_imports_google/graphlib.html deleted file mode 100644 index 56b37ebe4bb..00000000000 --- a/tensorflow/tensorboard/components/tf_imports_google/graphlib.html +++ /dev/null @@ -1,18 +0,0 @@ - - - diff --git a/tensorflow/tensorboard/components/tf_imports_google/lodash.html b/tensorflow/tensorboard/components/tf_imports_google/lodash.html deleted file mode 100644 index eb8fef28831..00000000000 --- a/tensorflow/tensorboard/components/tf_imports_google/lodash.html +++ /dev/null @@ -1,18 +0,0 @@ - - - diff --git a/tensorflow/tensorboard/components/tf_imports_google/plottable.html b/tensorflow/tensorboard/components/tf_imports_google/plottable.html deleted file mode 100644 index 6f9678f9cb2..00000000000 --- a/tensorflow/tensorboard/components/tf_imports_google/plottable.html +++ /dev/null @@ -1,19 +0,0 @@ - - - - diff --git a/tensorflow/tensorboard/components/tf_option_selector/BUILD b/tensorflow/tensorboard/components/tf_option_selector/BUILD new file mode 100644 index 00000000000..3f7eed25cb1 --- /dev/null +++ b/tensorflow/tensorboard/components/tf_option_selector/BUILD @@ -0,0 +1,21 @@ +package(default_visibility = ["//tensorflow/tensorboard:internal"]) + +load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library") + +licenses(["notice"]) # Apache 2.0 + +ts_web_library( + name = "tf_option_selector", + srcs = ["tf-option-selector.html"], + path = "/tf-option-selector", + deps = [ + "//tensorflow/tensorboard/components/tf_dashboard_common", + "//tensorflow/tensorboard/components/tf_imports:polymer", + ], +) + +filegroup( + name = "all_files", + srcs = glob(["**"]), + tags = ["notsan"], +) diff --git a/tensorflow/tensorboard/components/tf_profile_dashboard/BUILD b/tensorflow/tensorboard/components/tf_profile_dashboard/BUILD new file mode 100644 index 00000000000..5d04618a545 --- /dev/null +++ b/tensorflow/tensorboard/components/tf_profile_dashboard/BUILD @@ -0,0 +1,25 @@ +package(default_visibility = ["//tensorflow/tensorboard:internal"]) + +load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library") + +licenses(["notice"]) # Apache 2.0 + +ts_web_library( + name = "tf_profile_dashboard", + srcs = [ + "tf-profile-dashboard.html", + ], + path = "/tf-profile-dashboard", + deps = [ + "//tensorflow/tensorboard/components/tf_backend", + "//tensorflow/tensorboard/components/tf_dashboard_common", + "//tensorflow/tensorboard/components/tf_graph_controls", + "@org_polymer", + ], +) + +filegroup( + name = "all_files", + srcs = glob(["**"]), + tags = ["notsan"], +) diff --git a/tensorflow/tensorboard/components/tf_profile_dashboard/demo/BUILD b/tensorflow/tensorboard/components/tf_profile_dashboard/demo/BUILD new file mode 100644 index 00000000000..3cc20ba352f --- /dev/null +++ b/tensorflow/tensorboard/components/tf_profile_dashboard/demo/BUILD @@ -0,0 +1,24 @@ +package(default_visibility = 
["//tensorflow/tensorboard:internal"]) + +load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library") + +licenses(["notice"]) # Apache 2.0 + +ts_web_library( + name = "demo", + srcs = ["index.html"] + glob(["data/**"]), + path = "/tf-profile-dashboard/demo", + deps = [ + "//tensorflow/tensorboard/components/tf_profile_dashboard", + "//tensorflow/tensorboard/components/tf_trace_viewer:demo", + "@org_polymer", + "@org_polymer_iron_demo_helpers", + "@org_polymer_webcomponentsjs", + ], +) + +filegroup( + name = "all_files", + srcs = glob(["**"]), + tags = ["notsan"], +) diff --git a/tensorflow/tensorboard/components/tf_profile_dashboard/demo/data/logdir b/tensorflow/tensorboard/components/tf_profile_dashboard/demo/data/logdir new file mode 100644 index 00000000000..ecaaa8ac758 --- /dev/null +++ b/tensorflow/tensorboard/components/tf_profile_dashboard/demo/data/logdir @@ -0,0 +1 @@ +{"logdir": "/some/fake/logdir"} diff --git a/tensorflow/tensorboard/components/tf_profile_dashboard/demo/data/plugin/profile/data_run_bar_tag_trace_viewer.json b/tensorflow/tensorboard/components/tf_profile_dashboard/demo/data/plugin/profile/data_run_bar_tag_trace_viewer.json new file mode 100644 index 00000000000..bc1a08b535f --- /dev/null +++ b/tensorflow/tensorboard/components/tf_profile_dashboard/demo/data/plugin/profile/data_run_bar_tag_trace_viewer.json @@ -0,0 +1,27 @@ +{ + "traceEvents": [ + {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 826, "ph": "C", + "name": "counter", "args": {"value": 10}}, + {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 826, "ph": "B", + "name": "A long name that doesnt fit but is exceedingly informative", + "args": {"name_false": false, "value_true": true}}, + {"cat": "PERF", "pid": 22630, "ts": 835, "ph": "I", "s": "p", + "name": "ProcessWideEvent1", "args": {}} + ], + "stackFrames": { + "1": { + "category": "m1", + "name": "main" + }, + "7": { + "category": "m2", + "name": "frame7", + "parent": "1" + }, + "8": { + "category": "m2", + "name": "frame8", + "parent": "1" + } + } +} diff --git a/tensorflow/tensorboard/components/tf_profile_dashboard/demo/data/plugin/profile/data_run_bar_tag_unsupported.json b/tensorflow/tensorboard/components/tf_profile_dashboard/demo/data/plugin/profile/data_run_bar_tag_unsupported.json new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tensorflow/tensorboard/components/tf_profile_dashboard/demo/data/plugin/profile/data_run_foo_tag_trace_viewer.json b/tensorflow/tensorboard/components/tf_profile_dashboard/demo/data/plugin/profile/data_run_foo_tag_trace_viewer.json new file mode 100644 index 00000000000..e1d57394e35 --- /dev/null +++ b/tensorflow/tensorboard/components/tf_profile_dashboard/demo/data/plugin/profile/data_run_foo_tag_trace_viewer.json @@ -0,0 +1,105 @@ +{ + "traceEvents": [ + {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 826, "ph": "C", + "name": "counter", "args": {"value": 10}}, + {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 826, "ph": "B", + "name": "A long name that doesnt fit but is exceedingly informative", + "args": {"name_false": false, "value_true": true}}, + {"cat": "PERF", "pid": 22630, "ts": 835, "ph": "I", "s": "p", + "name": "ProcessWideEvent1", "args": {}}, + + {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 827, "ph": "B", + "name": "Asub with a name that wont fit", "args": {}}, + {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 828, "ph": "E", + "name": "Asub", "args": {}}, + {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 829, "ph": "B", + "name": "Asub", "args": {}}, + {"cat": 
"PREF", "pid": 22630, "tid": 22630, "dur": 15, "ts": 820, "ph": "X", + "name": "Long X type", "args": {}, "sf": 7, "esf": 8}, + {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 832, "ph": "E", + "name": "Asub", "args": {}}, + {"cat": "PREF", "pid": 22630, "tid": 22630, "dur": 2, "ts": 818, "ph": "X", + "name": "X1", "args": {}}, + {"cat": "PREF", "pid": 22630, "tid": 22630, "dur": 2, "ts": 818, "ph": "X", + "name": "X same ts and dur as X1", "args": {}}, + {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 832, "ph": "C", + "name": "counter", "args": {"value": 1}}, + {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 833, "ph": "E", + "name": "", "args": {}}, + + {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 835, "ph": "I", + "name": "ThreadLevelI1", "args": {}}, + + {"cat": "PERF", "ts": 880, "ph": "I", "s": "g", "name": "GlobalEvent1", + "args": {}}, + + {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 837, "ph": "I", + "name": "ThreadLevelI2", "args": {}}, + + {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 839, "ph": "C", + "name": "counter", "args": {"value": 5}}, + {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 840, "ph": "B", + "name": "A not as long a name", "args": {}}, + {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 848, "ph": "E", + "name": "A not as long a name", "args": {}}, + {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 848, "ph": "C", + "name": "counter", "args": {"value": 1}}, + {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 854, "ph": "C", + "name": "counter", "args": {"value": 10}}, + + {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 850, "ph": "B", + "name": "B", "args": {}}, + {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 854, "ph": "E", + "name": "B", "args": {}}, + + {"cat": "PERF", "pid": 22630, "tid": 22631, "ts": 827, "ph": "B", + "name": "A", "args": {}}, + {"cat": "PERF", "pid": 22630, "tid": 22631, "ts": 835, "ph": "I", + "name": "ThreadLevelImmediate Three", "args": {}}, + {"cat": "PERF", "pid": 22630, "tid": 22631, "ts": 845, "ph": "I", + "name": "ThreadLevelImmediate4", "args": {}}, + {"cat": "PERF", "pid": 22630, "tid": 22631, "ts": 854, "ph": "E", + "name": "A", "args": {}}, + + {"cat": "PREF", "pid": 22630, "tid": 22630, "ts": 860, "ph": "B", + "name": "B/E over X", "args": {}}, + {"cat": "PREF", "pid": 22630, "tid": 22630, "dur": 10, "ts": 860, "ph": "X", + "name": "X", "args": {}}, + {"cat": "PREF", "pid": 22630, "tid": 22630, "ts": 860, "ph": "B", + "name": "B/E under X", "args": {}}, + {"cat": "PREF", "pid": 22630, "tid": 22630, "ts": 870, "ph": "E", + "name": "B/E under X", "args": {}}, + {"cat": "PREF", "pid": 22630, "tid": 22630, "ts": 870, "ph": "E", + "name": "B/E over X", "args": {}}, + + {"cat": "SAMPLE", "pid": 22630, "tid": 22631, "ts": 870, "ph": "P", + "name": "SampleA", "args": {}}, + {"cat": "SAMPLE", "pid": 22630, "tid": 22631, "ts": 875, "ph": "P", + "name": "SampleB", "args": {}}, + {"cat": "SAMPLE", "pid": 22630, "tid": 22631, "ts": 878, "ph": "P", + "name": "SampleC", "args": {}, "sf": 8}, + + {"cat": "__metadata", "pid": 22630, "tid": 22630, "ts": 0, "ph": "M", + "name": "thread_name", "args": {"name": "threadA"}}, + {"cat": "__metadata", "pid": 22630, "tid": 22631, "ts": 0, "ph": "M", + "name": "thread_name", "args": {"name": "threadB"}}, + {"cat": "__metadata", "pid": 22630, "tid": 22632, "ts": 0, "ph": "M", + "name": "thread_name", "args": {"name": "threadC"}} + ], + "stackFrames": { + "1": { + "category": "m1", + "name": "main" + }, + "7": { + "category": "m2", + "name": "frame7", + "parent": "1" + }, + 
"8": { + "category": "m2", + "name": "frame8", + "parent": "1" + } + } +} diff --git a/tensorflow/tensorboard/components/tf_profile_dashboard/demo/data/plugin/profile/tags.json b/tensorflow/tensorboard/components/tf_profile_dashboard/demo/data/plugin/profile/tags.json new file mode 100644 index 00000000000..12ef5bf8b2e --- /dev/null +++ b/tensorflow/tensorboard/components/tf_profile_dashboard/demo/data/plugin/profile/tags.json @@ -0,0 +1 @@ +{"foo": ["trace_viewer"], "bar": ["unsupported", "trace_viewer"]} diff --git a/tensorflow/tensorboard/components/tf_profile_dashboard/demo/index.html b/tensorflow/tensorboard/components/tf_profile_dashboard/demo/index.html new file mode 100644 index 00000000000..15064a54f8f --- /dev/null +++ b/tensorflow/tensorboard/components/tf_profile_dashboard/demo/index.html @@ -0,0 +1,75 @@ + + + + + + + + + Profile Dashboard Demo + + + + + + + + diff --git a/tensorflow/tensorboard/components/tf_profile_dashboard/tf-profile-dashboard.html b/tensorflow/tensorboard/components/tf_profile_dashboard/tf-profile-dashboard.html new file mode 100644 index 00000000000..4028f0e0f06 --- /dev/null +++ b/tensorflow/tensorboard/components/tf_profile_dashboard/tf-profile-dashboard.html @@ -0,0 +1,222 @@ + + + + + + + + + + + + + diff --git a/tensorflow/tensorboard/components/tf_runs_selector/BUILD b/tensorflow/tensorboard/components/tf_runs_selector/BUILD new file mode 100644 index 00000000000..30265c8d294 --- /dev/null +++ b/tensorflow/tensorboard/components/tf_runs_selector/BUILD @@ -0,0 +1,27 @@ +package(default_visibility = ["//tensorflow:internal"]) + +load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library") + +licenses(["notice"]) # Apache 2.0 + +ts_web_library( + name = "tf_runs_selector", + srcs = [ + "tf-runs-selector.html", + ], + path = "/tf-runs-selector", + deps = [ + "//tensorflow/tensorboard/components/tf_backend", + "//tensorflow/tensorboard/components/tf_dashboard_common", + "//tensorflow/tensorboard/components/tf_imports:polymer", + "@org_polymer_paper_button", + "@org_polymer_paper_dialog", + "@org_polymer_paper_styles", + ], +) + +filegroup( + name = "all_files", + srcs = glob(["**"]), + tags = ["notsan"], +) diff --git a/tensorflow/tensorboard/components/tf_runs_selector/tf-runs-selector.html b/tensorflow/tensorboard/components/tf_runs_selector/tf-runs-selector.html new file mode 100644 index 00000000000..6964bb076de --- /dev/null +++ b/tensorflow/tensorboard/components/tf_runs_selector/tf-runs-selector.html @@ -0,0 +1,195 @@ + + + + + + + + + + + + + diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/BUILD b/tensorflow/tensorboard/components/tf_scalar_dashboard/BUILD new file mode 100644 index 00000000000..7cc192b4640 --- /dev/null +++ b/tensorflow/tensorboard/components/tf_scalar_dashboard/BUILD @@ -0,0 +1,38 @@ +package(default_visibility = ["//tensorflow/tensorboard:internal"]) + +load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library") + +licenses(["notice"]) # Apache 2.0 + +ts_web_library( + name = "tf_scalar_dashboard", + srcs = [ + "tf-scalar-dashboard.html", + "tf-smoothing-input.html", + ], + path = "/tf-scalar-dashboard", + deps = [ + "//tensorflow/tensorboard/components/tf_backend", + "//tensorflow/tensorboard/components/tf_color_scale", + "//tensorflow/tensorboard/components/tf_dashboard_common", + "//tensorflow/tensorboard/components/tf_imports:lodash", + "//tensorflow/tensorboard/components/tf_imports:polymer", + "//tensorflow/tensorboard/components/tf_runs_selector", + 
"//tensorflow/tensorboard/components/vz_line_chart", + "@org_polymer_iron_collapse", + "@org_polymer_paper_checkbox", + "@org_polymer_paper_dropdown_menu", + "@org_polymer_paper_icon_button", + "@org_polymer_paper_input", + "@org_polymer_paper_item", + "@org_polymer_paper_menu", + "@org_polymer_paper_slider", + "@org_polymer_paper_styles", + ], +) + +filegroup( + name = "all_files", + srcs = glob(["**"]), + tags = ["notsan"], +) diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/BUILD b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/BUILD new file mode 100644 index 00000000000..0e892b1aa30 --- /dev/null +++ b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/BUILD @@ -0,0 +1,27 @@ +package(default_visibility = ["//tensorflow/tensorboard:internal"]) + +load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library") + +licenses(["notice"]) # Apache 2.0 + +ts_web_library( + name = "demo", + srcs = ["index.html"], + path = "/tf-scalar-dashboard/demo", + deps = [ + "//tensorflow/tensorboard/components/tf_backend", + "//tensorflow/tensorboard/components/tf_imports:d3", + "//tensorflow/tensorboard/components/tf_imports:polymer", + "//tensorflow/tensorboard/components/tf_imports:webcomponentsjs", + "//tensorflow/tensorboard/components/tf_scalar_dashboard", + "//tensorflow/tensorboard/demo:demo_data", + "@org_polymer_iron_demo_helpers", + "@org_polymer_paper_styles", + ], +) + +filegroup( + name = "all_files", + srcs = glob(["**"]), + tags = ["notsan"], +) diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/logdir b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/logdir new file mode 100644 index 00000000000..b6362b45d77 --- /dev/null +++ b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/logdir @@ -0,0 +1 @@ +{"logdir": "/foo/some/fake/logdir"} \ No newline at end of file diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/runs.json b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/runs.json index da831a00e9d..d45f530763c 100644 --- a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/runs.json +++ b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/runs.json @@ -1,24 +1,4 @@ { - "alpha": { - "scalars": [ - "d1", - "d2", - "d3", - "d4" - ], - "histograms": [], - "images": [], - "audio": [] - }, - "beta": { - "scalars": [ - "d1", - "d2", - "d3", - "d4" - ], - "histograms": [], - "images": [], - "audio": [] - } -} + "run1": {"scalars": ["foo/sin", "foo/cos", "foo/square", "bar/square"]}, + "run2": {"scalars": ["foo/cos", "foo/square", "bar/square"]} +} \ No newline at end of file diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars.json b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars.json new file mode 100644 index 00000000000..bc269395b68 --- /dev/null +++ b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars.json @@ -0,0 +1 @@ +{"run2": {"foo/cos": [[0.0, 0, 2.0], [10.0, 1, 1.0806045532226562], [20.0, 2, -0.832293689250946], [30.0, 3, -1.979984998703003], [40.0, 4, -1.3072872161865234]], "bar/square": [[0.0, 0, 0.0], [10.0, 1, 1.0], [20.0, 2, 4.0], [30.0, 3, 9.0], [40.0, 4, 16.0]], "foo/square": [[0.0, 0, 0.0], [10.0, 1, 2.0], [20.0, 2, 8.0], [30.0, 3, 18.0], [40.0, 4, 32.0]]}, "run1": {"foo/sin": [[0.0, 0, 0.0], [10.0, 1, 0.8414709568023682], [20.0, 2, 0.9092974066734314], [30.0, 3, 0.14112000167369843], [40.0, 4, -0.756802499294281]], "foo/cos": [[0.0, 0, 
1.0], [10.0, 1, 0.5403022766113281], [20.0, 2, -0.416146844625473], [30.0, 3, -0.9899924993515015], [40.0, 4, -0.6536436080932617]], "bar/square": [[0.0, 0, 0.0], [10.0, 1, 1.0], [20.0, 2, 4.0], [30.0, 3, 9.0], [40.0, 4, 16.0]], "foo/square": [[0.0, 0, 0.0], [10.0, 1, 1.0], [20.0, 2, 4.0], [30.0, 3, 9.0], [40.0, 4, 16.0]]}} \ No newline at end of file diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars/alpha/d1.json b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars/alpha/d1.json deleted file mode 100644 index af17f5c3283..00000000000 --- a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars/alpha/d1.json +++ /dev/null @@ -1 +0,0 @@ -[[1436926051.074826, 84, 0.6990088224411011], [1436926530.99861, 2289, 6.9384379386901855], [1436927011.134076, 7611, 13.698328971862793], [1436927490.984256, 16147, 20.168190002441406], [1436927970.957234, 26087, 20.877344131469727], [1436928450.977514, 36241, 21.269058227539062], [1436928930.989548, 46432, 21.329505920410156], [1436929410.976308, 56629, 21.220420837402344], [1436929890.966395, 66791, 21.190065383911133], [1436930370.958199, 76936, 21.108604431152344], [1436930850.985301, 87083, 21.157001495361328], [1436931331.009261, 97161, 21.02127456665039], [1436931810.966042, 107210, 20.891658782958984], [1436932290.955417, 117262, 20.930112838745117], [1436932770.964496, 127333, 20.986324310302734], [1436933250.962592, 137430, 20.981359481811523], [1436933730.992022, 147528, 21.083036422729492], [1436934210.959831, 157635, 21.092649459838867], [1436934690.97072, 167749, 21.11568832397461], [1436935170.957944, 177869, 21.145965576171875], [1436935650.959987, 188025, 21.215585708618164], [1436936130.997541, 198206, 21.227184295654297], [1436936610.965526, 208395, 21.226459503173828], [1436937090.965581, 218592, 21.264968872070312], [1436937570.964874, 228818, 21.335866928100586], [1436938050.965706, 239021, 21.286521911621094], [1436938531.013159, 249210, 21.20963478088379], [1436939010.957926, 259415, 21.28431510925293], [1436939490.96341, 269637, 21.326831817626953], [1436939970.959372, 279876, 21.38308334350586], [1436940450.963802, 290127, 21.355499267578125], [1436940931.004537, 300349, 21.31337547302246], [1436941410.979614, 310601, 21.405778884887695], [1436941890.979674, 320872, 21.368688583374023], [1436942370.975153, 331131, 21.39077377319336], [1436942850.980459, 341399, 21.41745948791504], [1436943331.000808, 351651, 21.384023666381836], [1436943810.968736, 361904, 21.326438903808594], [1436944290.95947, 372158, 21.367351531982422], [1436944770.955783, 382430, 21.476247787475586], [1436945250.966321, 392684, 21.36678695678711], [1436945731.008667, 402950, 21.349145889282227], [1436946210.977922, 413210, 21.373897552490234], [1436946690.975303, 423463, 21.322399139404297], [1436947170.964596, 433723, 21.341150283813477], [1436947650.955017, 443991, 21.366348266601562], [1436948130.992501, 454271, 21.43844223022461], [1436948610.960555, 464519, 21.36829948425293], [1436949090.961079, 474758, 21.266357421875], [1436949570.971528, 484987, 21.316511154174805], [1436950050.977787, 495228, 21.356050491333008], [1436950531.020035, 505458, 21.31462860107422], [1436951010.959775, 515682, 21.277490615844727], [1436951490.967418, 525910, 21.289737701416016], [1436951970.969778, 536112, 21.2515811920166], [1436952450.956291, 546320, 21.254491806030273], [1436952931.005547, 556541, 21.297870635986328], [1436953410.955758, 566755, 21.320045471191406], [1436953890.959151, 576957, 
21.23529624938965], [1436954370.959553, 587165, 21.25132179260254], [1436954850.960546, 597371, 21.23470115661621], [1436955330.989932, 607582, 21.19434356689453], [1436955810.957128, 617790, 21.258535385131836], [1436956290.9763, 627991, 21.221921920776367], [1436956770.957785, 638208, 21.309843063354492], [1436957250.974143, 648404, 21.252185821533203], [1436957731.012441, 658613, 21.265626907348633], [1436958210.980787, 668824, 21.239660263061523], [1436958690.973474, 679034, 21.2642765045166], [1436959170.95825, 689249, 21.303138732910156], [1436959650.959345, 699454, 21.24073600769043], [1436960131.008682, 709664, 21.217615127563477], [1436960610.958074, 719876, 21.251184463500977], [1436961090.963638, 730100, 21.290971755981445], [1436961570.979029, 740316, 21.305265426635742], [1436962050.974645, 750534, 21.27857208251953], [1436962531.055479, 760757, 21.329837799072266], [1436963010.975299, 770964, 21.248849868774414], [1436963490.963107, 781164, 21.19978904724121], [1436963970.965936, 791382, 21.30535888671875], [1436964450.959947, 801590, 21.226255416870117], [1436964931.00587, 811785, 21.242237091064453], [1436965410.977997, 821977, 21.226497650146484], [1436965890.988465, 832189, 21.31219482421875], [1436966370.965612, 842399, 21.283390045166016], [1436966850.965794, 852612, 21.273908615112305], [1436967331.009476, 862825, 21.260452270507812], [1436967810.96767, 873037, 21.315444946289062], [1436968290.959107, 883248, 21.28677749633789], [1436968770.9681, 893452, 21.265335083007812], [1436969250.959332, 903655, 21.252891540527344], [1436969731.055609, 913856, 21.233684539794922], [1436970210.961426, 924047, 21.191429138183594], [1436970690.962999, 934250, 21.23288345336914], [1436971170.989107, 944430, 21.17190170288086], [1436971650.956015, 954634, 21.275972366333008], [1436972131.006841, 964844, 21.278474807739258], [1436972610.981754, 975045, 21.25553321838379], [1436973090.961548, 985239, 21.21686553955078], [1436973570.960013, 995439, 21.26004981994629], [1436974050.975653, 1005642, 21.25356101989746], [1436974530.988571, 1015842, 21.23944664001465], [1436975010.95851, 1026048, 21.293363571166992], [1436975490.97355, 1036253, 21.277101516723633], [1436975970.960916, 1046451, 21.242155075073242], [1436976450.990263, 1056636, 21.182037353515625], [1436976930.999578, 1066834, 21.21113395690918], [1436977410.962637, 1077031, 21.230762481689453], [1436977890.970389, 1087222, 21.232444763183594], [1436978370.959059, 1097405, 21.202342987060547], [1436978850.956562, 1107601, 21.23992156982422], [1436979331.021134, 1117786, 21.197628021240234], [1436979810.958593, 1127973, 21.2270565032959], [1436980290.958763, 1138163, 21.250303268432617], [1436980770.967171, 1148348, 21.215538024902344], [1436981250.960473, 1158540, 21.277185440063477], [1436981731.009465, 1168733, 21.268449783325195], [1436982210.960797, 1178930, 21.268077850341797], [1436982690.959709, 1189129, 21.243141174316406], [1436983170.961963, 1199327, 21.21793556213379], [1436983650.958504, 1209524, 21.2817440032959], [1436984130.998057, 1219726, 21.261478424072266], [1436984610.958945, 1229936, 21.300107955932617], [1436985090.978825, 1240145, 21.326183319091797], [1436985570.993741, 1250311, 21.115875244140625], [1436986050.965608, 1260436, 21.19010353088379], [1436986531.026713, 1270611, 21.183719635009766], [1436987010.969056, 1280784, 21.273176193237305], [1436987490.975071, 1290959, 21.182931900024414], [1436987970.96007, 1301147, 21.260244369506836], [1436988450.966092, 1311328, 21.225025177001953], 
[1436988931.004917, 1321514, 21.242164611816406], [1436989410.980351, 1331709, 21.19801139831543], [1436989890.975192, 1341910, 21.273555755615234], [1436990370.964941, 1352090, 21.175983428955078], [1436990850.973647, 1362240, 21.13412094116211], [1436991330.999346, 1372396, 21.153064727783203], [1436991811.003573, 1382550, 21.155475616455078], [1436992290.962706, 1392710, 21.17011833190918], [1436992770.999149, 1402862, 21.128713607788086], [1436993250.965124, 1413020, 21.1361026763916], [1436993731.020464, 1423164, 21.157777786254883], [1436994210.966935, 1433312, 21.119478225708008], [1436994690.962803, 1443468, 21.161104202270508], [1436995170.972952, 1453657, 21.11492919921875], [1436995650.976233, 1463820, 21.194231033325195], [1436996130.990524, 1473980, 21.169816970825195], [1436996610.97302, 1484152, 21.18223762512207], [1436997090.958457, 1494308, 21.1954402923584], [1436997570.980333, 1504463, 21.140769958496094], [1436998050.969869, 1514618, 21.162744522094727], [1436998530.99688, 1524770, 21.139591217041016], [1436999010.970375, 1534905, 21.107114791870117], [1436999490.960775, 1545070, 21.233396530151367], [1436999970.965087, 1555223, 21.201074600219727], [1437000450.969008, 1565370, 21.147083282470703], [1437000931.007425, 1575517, 21.108510971069336], [1437001410.962798, 1585666, 21.11674690246582], [1437001890.966192, 1595826, 21.17819595336914], [1437002370.961814, 1605980, 21.157669067382812], [1437002850.962206, 1616145, 21.212690353393555], [1437003330.994816, 1626291, 21.177446365356445], [1437003810.966017, 1636448, 21.17884063720703], [1437004290.959479, 1646599, 21.150310516357422], [1437004770.965083, 1656754, 21.21011734008789], [1437005250.958234, 1666902, 21.14912986755371], [1437005731.003528, 1677043, 21.125459671020508], [1437006210.961371, 1687192, 21.124374389648438], [1437006690.962663, 1697338, 21.150362014770508], [1437007170.961639, 1707484, 21.16637420654297], [1437007650.972242, 1717625, 21.163259506225586], [1437008131.003191, 1727767, 21.167280197143555], [1437008610.962644, 1737913, 21.174945831298828], [1437009090.964129, 1748068, 21.17894172668457], [1437009570.962582, 1758219, 21.116622924804688], [1437010050.984863, 1768384, 21.23469352722168], [1437010531.002295, 1778534, 21.143510818481445], [1437011010.961803, 1788677, 21.159791946411133], [1437011490.974074, 1798822, 21.119792938232422], [1437011970.959982, 1808958, 21.10943603515625], [1437012450.95932, 1819091, 21.123899459838867], [1437012931.004909, 1829227, 21.094532012939453], [1437013410.957751, 1839374, 21.200057983398438], [1437013890.960506, 1849509, 21.10895538330078], [1437014370.96113, 1859653, 21.108680725097656], [1437014850.962876, 1869791, 21.141136169433594], [1437015331.009875, 1879944, 21.160165786743164], [1437015810.960671, 1890090, 21.158742904663086], [1437016290.970743, 1900242, 21.16562271118164], [1437016770.961673, 1910391, 21.141860961914062], [1437017250.96735, 1920551, 21.19420051574707], [1437017731.000324, 1930702, 21.16814422607422], [1437018210.967878, 1940856, 21.125978469848633], [1437018690.962742, 1951005, 21.15043067932129], [1437019170.975774, 1961158, 21.157419204711914], [1437019650.964573, 1971309, 21.150177001953125], [1437020130.999343, 1981461, 21.124492645263672], [1437020610.960696, 1991611, 21.109933853149414], [1437021090.958597, 2001766, 21.169754028320312], [1437021570.964477, 2011919, 21.13479995727539], [1437022050.966522, 2022063, 21.131561279296875], [1437022531.005607, 2032219, 21.135629653930664], [1437023010.970667, 2042380, 
21.207313537597656], [1437023490.964885, 2052534, 21.108623504638672], [1437023970.965596, 2062691, 21.14097023010254], [1437024450.962296, 2072837, 21.129037857055664], [1437024931.00395, 2082982, 21.077030181884766], [1437025410.96602, 2093128, 21.13152503967285], [1437025890.961753, 2103274, 21.117740631103516], [1437026370.962022, 2113424, 21.141584396362305], [1437026850.975475, 2123570, 21.143577575683594], [1437027331.009277, 2133721, 21.175586700439453], [1437027810.97206, 2143857, 21.099014282226562], [1437028290.961523, 2154015, 21.141523361206055], [1437028770.964366, 2164168, 21.141345977783203], [1437029250.962109, 2174320, 21.14827537536621], [1437029731.003068, 2184453, 21.086946487426758], [1437030210.960946, 2194602, 21.1590576171875], [1437030690.966681, 2204754, 21.17353057861328], [1437031170.961207, 2214899, 21.133989334106445], [1437031650.962809, 2225062, 21.14800453186035], [1437032130.997644, 2235215, 21.15397834777832], [1437032610.962999, 2245366, 21.15763282775879], [1437033090.962192, 2255521, 21.133577346801758], [1437033570.963341, 2265657, 21.058490753173828], [1437034050.979501, 2275787, 21.079614639282227], [1437034531.003514, 2285923, 21.12677574157715], [1437035010.960984, 2296058, 21.100793838500977], [1437035490.97325, 2306176, 21.10753059387207], [1437035970.969759, 2316297, 21.100393295288086], [1437036450.962305, 2326428, 21.041208267211914], [1437036931.001785, 2336571, 21.15167999267578], [1437037410.967681, 2346709, 21.09291648864746], [1437037890.963194, 2356854, 21.18524932861328], [1437038370.96445, 2366985, 21.116247177124023], [1437038850.960718, 2377124, 21.125469207763672], [1437039331.003148, 2387259, 21.132274627685547], [1437039810.974007, 2397400, 21.119945526123047], [1437040290.983415, 2407539, 21.154672622680664], [1437040770.961836, 2417667, 21.066741943359375], [1437041250.964281, 2427791, 21.126564025878906], [1437041731.0196, 2437923, 21.1062068939209], [1437042210.962927, 2448056, 21.124549865722656], [1437042690.964392, 2458193, 21.13232421875], [1437043170.972024, 2468318, 21.066423416137695], [1437043650.966111, 2478449, 21.123788833618164], [1437044131.030028, 2488576, 21.138349533081055], [1437044610.962532, 2498717, 21.11895179748535], [1437045090.965094, 2508839, 21.019609451293945], [1437045570.963352, 2518972, 21.079254150390625], [1437046050.96194, 2529106, 21.15033531188965], [1437046530.995016, 2539243, 21.11912727355957], [1437047010.963313, 2549369, 21.08464813232422], [1437047490.963943, 2559509, 21.133895874023438], [1437047970.958612, 2569646, 21.108659744262695], [1437048450.962392, 2579776, 21.084848403930664], [1437048931.005408, 2589906, 21.092708587646484], [1437049410.984115, 2600033, 21.130634307861328], [1437049890.964103, 2610162, 21.074010848999023], [1437050370.960886, 2620282, 21.086149215698242], [1437050850.959795, 2630402, 21.08969497680664], [1437051331.008292, 2640533, 21.134498596191406], [1437051810.96622, 2650643, 21.065444946289062], [1437052290.98584, 2660774, 21.120830535888672], [1437052770.967707, 2670900, 21.085134506225586], [1437053250.978851, 2681021, 21.037155151367188], [1437053731.021686, 2691151, 21.09203338623047], [1437054210.971744, 2701273, 21.048450469970703], [1437054690.966686, 2711425, 21.048809051513672], [1437055170.964463, 2721564, 21.13330078125], [1437055650.97301, 2731694, 21.097095489501953], [1437056130.997053, 2741810, 21.031536102294922], [1437056610.968681, 2751927, 21.04400634765625], [1437057090.976676, 2762049, 21.114444732666016], [1437057570.962334, 
2772169, 21.06243896484375], … ([wall_time, step, value] series elided) …, [1437073290.957895, 3104118, 21.118793487548828]]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars/alpha/d2.json b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars/alpha/d2.json
deleted file mode 100644
index 92bb4143482..00000000000
--- a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars/alpha/d2.json
+++ /dev/null
@@ -1 +0,0 @@
-[[1436925978.257845, 7, 0.04500000178813934], … (series elided) …, [1437073293.569178, 3104172, 0.004286185372620821]]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars/alpha/d3.json b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars/alpha/d3.json
deleted file mode 100644
index 69191b91544..00000000000
--- a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars/alpha/d3.json
+++ /dev/null
@@ -1 +0,0 @@
-[[1436925978.257845, 7, 0.0], … (series elided) …, [1437073293.569178, 3104172, 7862.0]]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars/alpha/d4.json b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars/alpha/d4.json
deleted file mode 100644
index caf1ae6e7f7..00000000000
--- a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars/alpha/d4.json
+++ /dev/null
@@ -1 +0,0 @@
-[[1436925978.257845, 7, 2.461352825164795], … (series elided) …, [1437073293.569178, 3104172, 3.8385396003723145]]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars/beta/d1.json b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars/beta/d1.json
deleted file mode 100644
index 27ff64e5ddb..00000000000
--- a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars/beta/d1.json
+++ /dev/null
@@ -1 +0,0 @@
-[[1436925978.257845, 7, 1.009283951483666897], … (series elided) …, [1437073293.569178, 3104172, 0.021365314722061157]]
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars/beta/d2.json b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars/beta/d2.json
deleted file mode 100644
index fb5a18d53a1..00000000000
--- a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars/beta/d2.json
+++ /dev/null
@@ -1 +0,0 @@
-[[1436925978.257845, 7, 0.01034154836088419], … (series elided) …, [1437073293.569178, 3104172, 0.026299258694052696]]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars/beta/d3.json b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars/beta/d3.json
deleted file mode 100644
index e489130ea77..00000000000
--- a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars/beta/d3.json
+++ /dev/null
@@ -1 +0,0 @@
-[[1436925978.257845, 7, 0.03425809368491173], … (series elided) …, [1436975373.638453, 1033759, 0.03219134360551834], [1436975853.524388, 1043955,
0.0324082113802433], [1436976333.625792, 1054148, 0.03221917897462845], [1436976813.610661, 1064342, 0.03200480341911316], [1436977293.601581, 1074539, 0.03198748826980591], [1436977773.575627, 1084733, 0.032064300030469894], [1436978253.564972, 1094914, 0.032298240810632706], [1436978733.673144, 1105109, 0.03248215466737747], [1436979213.540585, 1115293, 0.03262820467352867], [1436979693.699591, 1125483, 0.032745134085416794], [1436980173.613012, 1135670, 0.032681502401828766], [1436980653.575769, 1145862, 0.03240214288234711], [1436981133.719264, 1156045, 0.03237201273441315], [1436981613.563551, 1166236, 0.03202598914504051], [1436982093.553233, 1176436, 0.032163310796022415], [1436982573.577846, 1186636, 0.03232435882091522], [1436983053.605749, 1196837, 0.032410554587841034], [1436983533.684994, 1207025, 0.03245232254266739], [1436984013.561492, 1217233, 0.03224659338593483], [1436984493.629873, 1227437, 0.03204221650958061], [1436984973.606714, 1237643, 0.03186390548944473], [1436985453.690084, 1247835, 0.031911786645650864], [1436985933.711388, 1257951, 0.032286882400512695], [1436986413.598807, 1268125, 0.032266560941934586], [1436986893.631797, 1278290, 0.03252791985869408], [1436987373.596962, 1288473, 0.03241678699851036], [1436987853.555549, 1298650, 0.03210347890853882], [1436988333.722032, 1308841, 0.031904902309179306], [1436988813.55697, 1319018, 0.03179018944501877], [1436989293.756905, 1329221, 0.0316874124109745], [1436989773.665141, 1339417, 0.03160090371966362], [1436990253.768302, 1349610, 0.03161816671490669], [1436990733.708919, 1359759, 0.0317724235355854], [1436991213.663033, 1369914, 0.03175821527838707], [1436991693.730925, 1380074, 0.031629402190446854], [1436992173.751791, 1390224, 0.031547073274850845], [1436992653.758682, 1400383, 0.031528495252132416], [1436993133.835604, 1410542, 0.03169562667608261], [1436993613.674655, 1420684, 0.031826674938201904], [1436994093.747454, 1430832, 0.03185039013624191], [1436994573.768973, 1440986, 0.03199826925992966], [1436995053.666661, 1451174, 0.03156091645359993], [1436995533.83439, 1461345, 0.031506411731243134], [1436996013.556996, 1471495, 0.031495608389377594], [1436996493.635477, 1481663, 0.03134337440133095], [1436996973.668684, 1491822, 0.031145554035902023], [1436997453.59326, 1501979, 0.031068041920661926], [1436997933.774019, 1512139, 0.031432390213012695], [1436998413.575162, 1522290, 0.03142932057380676], [1436998893.640468, 1532431, 0.03132513165473938], [1436999373.551661, 1542579, 0.03125539794564247], [1436999853.57906, 1552734, 0.0309873279184103], [1437000333.680409, 1562888, 0.03088490664958954], [1437000813.602383, 1573037, 0.0308260228484869], [1437001293.610337, 1583190, 0.030793415382504463], [1437001773.618199, 1593341, 0.03087344579398632], [1437002253.572966, 1603497, 0.0308389812707901], [1437002733.67994, 1613657, 0.03070608340203762], [1437003213.583266, 1623809, 0.0307186096906662], [1437003693.639943, 1633966, 0.03048117645084858], [1437004173.568287, 1644113, 0.03030446544289589], [1437004653.610772, 1654268, 0.030324051156640053], [1437005133.663045, 1664424, 0.03043009154498577], [1437005613.580984, 1674567, 0.030361991375684738], [1437006093.601019, 1684715, 0.030566193163394928], [1437006573.625314, 1694857, 0.030430208891630173], [1437007053.584514, 1704999, 0.030224468559026718], [1437007533.719303, 1715150, 0.030241932719945908], [1437008013.604962, 1725282, 0.030097855255007744], [1437008493.655091, 1735432, 0.030217904597520828], [1437008973.640165, 1745584, 
0.030181601643562317], [1437009453.715067, 1755742, 0.030172593891620636], [1437009933.765712, 1765896, 0.030141659080982208], [1437010413.632128, 1776052, 0.030052196234464645], [1437010893.66766, 1786195, 0.03007938154041767], [1437011373.636164, 1796346, 0.02996920794248581], [1437011853.631224, 1806481, 0.029995175078511238], [1437012333.706205, 1816617, 0.03010040894150734], [1437012813.61987, 1826754, 0.030088385567069054], [1437013293.479904, 1836883, 0.029996229335665703], [1437013773.604574, 1847029, 0.029950618743896484], [1437014253.618884, 1857175, 0.029801754280924797], [1437014733.756419, 1867312, 0.029606210067868233], [1437015213.638607, 1877459, 0.029520301148295403], [1437015693.625763, 1887608, 0.02937021106481552], [1437016173.63194, 1897759, 0.02928493171930313], [1437016653.609074, 1907909, 0.029194936156272888], [1437017133.717601, 1918074, 0.029153617098927498], [1437017613.716011, 1928220, 0.029063349589705467], [1437018093.626005, 1938377, 0.02899051643908024], [1437018573.626522, 1948523, 0.02908063493669033], [1437019053.648174, 1958678, 0.029026903212070465], [1437019533.803011, 1968831, 0.029071694239974022], [1437020013.667751, 1978978, 0.029110101982951164], [1437020493.659028, 1989133, 0.02908976934850216], [1437020973.657346, 1999287, 0.028982611373066902], [1437021453.650634, 2009437, 0.028793690726161003], [1437021933.848661, 2019588, 0.02868787571787834], [1437022413.674963, 2029736, 0.028585631400346756], [1437022893.69086, 2039894, 0.02848806604743004], [1437023373.68883, 2050054, 0.028294002637267113], [1437023853.686116, 2060205, 0.02822807803750038], [1437024333.763876, 2070362, 0.02809525839984417], [1437024813.707845, 2080507, 0.027941878885030746], [1437025293.483294, 2090645, 0.02787884697318077], [1437025773.695712, 2100793, 0.027862509712576866], [1437026253.672994, 2110943, 0.027835993096232414], [1437026733.780775, 2121094, 0.027756690979003906], [1437027213.617849, 2131235, 0.027644263580441475], [1437027693.694451, 2141382, 0.02752007730305195], [1437028173.68596, 2151537, 0.027432529255747795], [1437028653.584833, 2161685, 0.027434471994638443], [1437029133.792483, 2171839, 0.027317894622683525], [1437029613.661672, 2181977, 0.027138294652104378], [1437030093.641009, 2192118, 0.027088705450296402], [1437030573.656274, 2202268, 0.027131302282214165], [1437031053.643631, 2212416, 0.02715957537293434], [1437031533.777478, 2222583, 0.027089620009064674], [1437032013.704008, 2232736, 0.026989320293068886], [1437032493.638393, 2242882, 0.026922713965177536], [1437032973.684986, 2253041, 0.02678370475769043], [1437033453.699562, 2263183, 0.0267350971698761], [1437033933.918074, 2273320, 0.026652036234736443], [1437034413.596351, 2283443, 0.0265977680683136], [1437034893.640496, 2293579, 0.02654072269797325], [1437035373.637761, 2303701, 0.026471523568034172], [1437035853.669947, 2313823, 0.026451298967003822], [1437036333.78905, 2323961, 0.026429779827594757], [1437036813.699727, 2334089, 0.026324886828660965], [1437037293.662592, 2344235, 0.026287589222192764], [1437037773.66716, 2354364, 0.026264755055308342], [1437038253.603687, 2364507, 0.026225194334983826], [1437038733.78864, 2374644, 0.02624845691025257], [1437039213.641799, 2384782, 0.02624380588531494], [1437039693.687078, 2394923, 0.026255516335368156], [1437040173.635717, 2405058, 0.026186630129814148], [1437040653.673331, 2415194, 0.026059549301862717], [1437041133.764768, 2425322, 0.02603207901120186], [1437041613.629279, 2435449, 0.025951188057661057], [1437042093.703985, 2445575, 
0.025885486975312233], [1437042573.496029, 2455712, 0.0258584376424551], [1437043053.686022, 2465844, 0.02576967515051365], [1437043533.731929, 2475974, 0.02574247308075428], [1437044013.636245, 2486095, 0.025741368532180786], [1437044493.69923, 2496238, 0.025613142177462578], [1437044973.652155, 2506373, 0.025502001866698265], [1437045453.691467, 2516497, 0.025422129780054092], [1437045933.935804, 2526637, 0.02530006691813469], [1437046413.635583, 2536770, 0.02533203549683094], [1437046893.626337, 2546896, 0.025261884555220604], [1437047373.67437, 2557029, 0.02517615258693695], [1437047853.652939, 2567169, 0.025054262951016426], [1437048333.778436, 2577306, 0.024978358298540115], [1437048813.654248, 2587433, 0.024952327832579613], [1437049293.610609, 2597552, 0.024846646934747696], [1437049773.646573, 2607690, 0.024763893336057663], [1437050253.667925, 2617808, 0.024688972160220146], [1437050733.735291, 2627933, 0.024599123746156693], [1437051213.620222, 2638053, 0.024585271254181862], [1437051693.601978, 2648171, 0.024474715813994408], [1437052173.634985, 2658299, 0.0243435837328434], [1437052653.687176, 2668425, 0.024294523522257805], [1437053133.762819, 2678556, 0.024163981899619102], [1437053613.643698, 2688671, 0.024034887552261353], [1437054093.673047, 2698804, 0.024000374600291252], [1437054573.667371, 2708956, 0.023914175108075142], [1437055053.650441, 2719087, 0.02395522966980934], [1437055533.778469, 2729219, 0.023859599605202675], [1437056013.694082, 2739343, 0.02375946193933487], [1437056493.674871, 2749458, 0.023677179589867592], [1437056973.700234, 2759575, 0.023645443841814995], [1437057453.666129, 2769697, 0.02356558106839657], [1437057933.848506, 2779821, 0.023514214903116226], [1437058413.643799, 2789941, 0.023489613085985184], [1437058893.715386, 2800076, 0.023429814726114273], [1437059373.62596, 2810207, 0.023343827575445175], [1437059853.650848, 2820334, 0.02322673238813877], [1437060333.792248, 2830465, 0.023134106770157814], [1437060813.682955, 2840600, 0.023055672645568848], [1437061293.681795, 2850745, 0.022985080257058144], [1437061773.691182, 2860880, 0.02291373908519745], [1437062253.662987, 2871013, 0.022864071652293205], [1437062733.760419, 2881153, 0.0227896086871624], [1437063213.651969, 2891278, 0.02276325598359108], [1437063693.723523, 2901406, 0.022676151245832443], [1437064173.68663, 2911533, 0.022622840479016304], [1437064653.547643, 2921667, 0.022551873698830605], [1437065133.62645, 2931813, 0.022431621327996254], [1437065613.566569, 2941947, 0.022368427366018295], [1437066093.537804, 2952102, 0.022323856130242348], [1437066573.529332, 2962243, 0.022268367931246758], [1437067053.520098, 2972400, 0.022210223600268364], [1437067533.605733, 2982561, 0.022118542343378067], [1437068013.535467, 2992698, 0.022013003006577492], [1437068493.559976, 3002839, 0.021971898153424263], [1437068973.558743, 3012983, 0.021911533549427986], [1437069453.562661, 3023116, 0.021851375699043274], [1437069933.627071, 3033256, 0.021762363612651825], [1437070413.574131, 3043386, 0.021733952686190605], [1437070893.658803, 3053528, 0.021669508889317513], [1437071373.638711, 3063659, 0.021594204008579254], [1437071853.621384, 3073794, 0.021531015634536743], [1437072333.665269, 3083926, 0.021499203518033028], [1437072813.584388, 3094040, 0.021456807851791382], [1437073293.569178, 3104172, 0.02136526256799698]] \ No newline at end of file diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars/beta/d4.json 
b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars/beta/d4.json deleted file mode 100644 index 434b78cd0f5..00000000000 --- a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars/beta/d4.json +++ /dev/null @@ -1 +0,0 @@ -[[1436925978.257845, 7, 0.5028539896011353], [1436926413.945391, 1476, 0.4976981580257416], [1436926893.945037, 6006, 0.5092837810516357], [1436927373.995472, 13786, 0.5118998885154724], [1436927853.989794, 23650, 0.5314905643463135], [1436928334.132361, 33755, 0.550969123840332], [1436928813.973288, 43941, 0.5487659573554993], [1436929293.975949, 54146, 0.5263530015945435], [1436929773.992781, 64316, 0.5077286958694458], [1436930253.997415, 74465, 0.5120566487312317], [1436930734.203004, 84611, 0.5140185952186584], [1436931214.03644, 94700, 0.5133042335510254], [1436931694.094564, 104766, 0.5233010053634644], [1436932174.114955, 114817, 0.5230671763420105], [1436932654.161382, 124880, 0.5250263810157776], [1436933133.960214, 134977, 0.5088120698928833], [1436933614.044337, 145062, 0.5097426176071167], [1436934094.166206, 155169, 0.5103482007980347], [1436934574.106036, 165284, 0.5021579265594482], [1436935054.150647, 175402, 0.49785494804382324], [1436935533.819562, 185538, 0.4970649182796478], [1436936013.710422, 195712, 0.5023221373558044], [1436936493.609025, 205906, 0.5063169002532959], [1436936973.683892, 216099, 0.50455641746521], [1436937454.138383, 226331, 0.5104150772094727], [1436937933.838475, 236532, 0.5066487193107605], [1436938413.89688, 246724, 0.5183079838752747], [1436938894.018652, 256925, 0.5163102746009827], [1436939373.69067, 267137, 0.5216323733329773], [1436939853.673692, 277369, 0.5153006315231323], [1436940333.651346, 287620, 0.5240126252174377], [1436940813.599579, 297848, 0.5263218879699707], [1436941293.596313, 308088, 0.5236956477165222], [1436941773.659172, 318362, 0.534295916557312], [1436942253.648479, 328621, 0.540306031703949], [1436942733.752284, 338892, 0.5359382033348083], [1436943213.621881, 349144, 0.540198564529419], [1436943693.698743, 359399, 0.5404431819915771], [1436944173.578463, 369649, 0.5429667234420776], [1436944653.692217, 379912, 0.5415231585502625], [1436945133.677298, 390180, 0.54068922996521], [1436945613.572411, 400445, 0.5396349430084229], [1436946093.56123, 410703, 0.5486253499984741], [1436946573.542364, 420958, 0.5451043248176575], [1436947053.616578, 431216, 0.5478819608688354], [1436947533.636973, 441483, 0.5503379106521606], [1436948013.541574, 451751, 0.5534676313400269], [1436948493.560223, 462015, 0.5574610829353333], [1436948973.512541, 472260, 0.5558810234069824], [1436949453.550055, 482483, 0.5529404878616333], [1436949933.828011, 492731, 0.5618430972099304], [1436950413.603177, 502957, 0.5641138553619385], [1436950893.563009, 513185, 0.5707159638404846], [1436951373.620887, 523410, 0.5676558613777161], [1436951853.61941, 533618, 0.5637813806533813], [1436952333.694447, 543828, 0.5682924389839172], [1436952813.621004, 554042, 0.5690237283706665], [1436953293.588156, 564251, 0.5655006766319275], [1436953773.599734, 574464, 0.553955614566803], [1436954253.621309, 584672, 0.5558924674987793], [1436954733.738119, 594882, 0.5603042840957642], [1436955213.56617, 605091, 0.5625290870666504], [1436955693.585366, 615296, 0.5668522715568542], [1436956173.626395, 625501, 0.5736584663391113], [1436956653.601937, 635705, 0.5693879723548889], [1436957133.665878, 645915, 0.576599657535553], [1436957613.584762, 656116, 0.5648065805435181], [1436958093.549783, 666331, 
0.5632508397102356], [1436958573.646778, 676543, 0.5660487413406372], [1436959053.585655, 686750, 0.568809449672699], [1436959533.679696, 696961, 0.5667826533317566], [1436960013.633292, 707173, 0.5637232661247253], [1436960493.578778, 717383, 0.5675314664840698], [1436960973.596715, 727598, 0.5714674592018127], [1436961453.625644, 737818, 0.564845085144043], [1436961933.740339, 748040, 0.5700833797454834], [1436962413.573845, 758252, 0.5702976584434509], [1436962893.610678, 768470, 0.5745863914489746], [1436963373.642878, 778674, 0.5763651728630066], [1436963853.558388, 788877, 0.5721960067749023], [1436964333.658419, 799099, 0.5714120864868164], [1436964813.573319, 809289, 0.5687000155448914], [1436965293.542098, 819484, 0.5728974938392639], [1436965773.545453, 829687, 0.5738612413406372], [1436966253.586517, 839901, 0.5702064037322998], [1436966733.639348, 850120, 0.5715107321739197], [1436967213.697288, 860330, 0.5695001482963562], [1436967693.617172, 870539, 0.5783872008323669], [1436968173.593885, 880748, 0.5758792161941528], [1436968653.560836, 890955, 0.572809636592865], [1436969133.676337, 901164, 0.5752230286598206], [1436969613.506638, 911358, 0.5861247181892395], [1436970093.595964, 921560, 0.5834078788757324], [1436970573.541227, 931756, 0.5814791321754456], [1436971053.624316, 941945, 0.5803619623184204], [1436971533.655543, 952138, 0.5765199065208435], [1436972013.604738, 962349, 0.5693190693855286], [1436972493.613199, 972551, 0.5720453262329102], [1436972973.501155, 982746, 0.5741620063781738], [1436973453.64842, 992945, 0.5705713629722595], [1436973933.689516, 1003147, 0.5657351613044739], [1436974413.577769, 1013350, 0.5685256123542786], [1436974893.542281, 1023545, 0.5698860287666321], [1436975373.638453, 1033759, 0.5801734328269958], [1436975853.524388, 1043955, 0.577880322933197], [1436976333.625792, 1054148, 0.5780594348907471], [1436976813.610661, 1064342, 0.5804633498191833], [1436977293.601581, 1074539, 0.5842364430427551], [1436977773.575627, 1084733, 0.5745837092399597], [1436978253.564972, 1094914, 0.5848771333694458], [1436978733.673144, 1105109, 0.5795935392379761], [1436979213.540585, 1115293, 0.583346426486969], [1436979693.699591, 1125483, 0.5840965509414673], [1436980173.613012, 1135670, 0.5807850360870361], [1436980653.575769, 1145862, 0.5843925476074219], [1436981133.719264, 1156045, 0.5828814506530762], [1436981613.563551, 1166236, 0.5873864889144897], [1436982093.553233, 1176436, 0.5896572470664978], [1436982573.577846, 1186636, 0.5887367725372314], [1436983053.605749, 1196837, 0.5841871500015259], [1436983533.684994, 1207025, 0.5867579579353333], [1436984013.561492, 1217233, 0.5940297842025757], [1436984493.629873, 1227437, 0.5925037860870361], [1436984973.606714, 1237643, 0.5981529951095581], [1436985453.690084, 1247835, 0.5954598188400269], [1436985933.711388, 1257951, 0.5903756022453308], [1436986413.598807, 1268125, 0.5837404131889343], [1436986893.631797, 1278290, 0.583182156085968], [1436987373.596962, 1288473, 0.5860618352890015], [1436987853.555549, 1298650, 0.5829544067382812], [1436988333.722032, 1308841, 0.5798720121383667], [1436988813.55697, 1319018, 0.589148998260498], [1436989293.756905, 1329221, 0.5905702710151672], [1436989773.665141, 1339417, 0.5900465250015259], [1436990253.768302, 1349610, 0.5893078446388245], [1436990733.708919, 1359759, 0.589722752571106], [1436991213.663033, 1369914, 0.5907371640205383], [1436991693.730925, 1380074, 0.5939858555793762], [1436992173.751791, 1390224, 0.5906378626823425], [1436992653.758682, 
1400383, 0.5876493453979492], [1436993133.835604, 1410542, 0.5912420153617859], [1436993613.674655, 1420684, 0.5887293219566345], [1436994093.747454, 1430832, 0.589107096195221], [1436994573.768973, 1440986, 0.5928497910499573], [1436995053.666661, 1451174, 0.5916265845298767], [1436995533.83439, 1461345, 0.5911784768104553], [1436996013.556996, 1471495, 0.5890726447105408], [1436996493.635477, 1481663, 0.5914839506149292], [1436996973.668684, 1491822, 0.5915400385856628], [1436997453.59326, 1501979, 0.591564416885376], [1436997933.774019, 1512139, 0.5926578640937805], [1436998413.575162, 1522290, 0.5942149758338928], [1436998893.640468, 1532431, 0.5931802988052368], [1436999373.551661, 1542579, 0.587592601776123], [1436999853.57906, 1552734, 0.5877953171730042], [1437000333.680409, 1562888, 0.590681791305542], [1437000813.602383, 1573037, 0.5924896001815796], [1437001293.610337, 1583190, 0.5913501381874084], [1437001773.618199, 1593341, 0.5952408909797668], [1437002253.572966, 1603497, 0.5953922271728516], [1437002733.67994, 1613657, 0.6002237200737], [1437003213.583266, 1623809, 0.6042569875717163], [1437003693.639943, 1633966, 0.6017740368843079], [1437004173.568287, 1644113, 0.6037994623184204], [1437004653.610772, 1654268, 0.6037947535514832], [1437005133.663045, 1664424, 0.6028310060501099], [1437005613.580984, 1674567, 0.603211522102356], [1437006093.601019, 1684715, 0.6052727699279785], [1437006573.625314, 1694857, 0.6032628417015076], [1437007053.584514, 1704999, 0.5978461503982544], [1437007533.719303, 1715150, 0.602828323841095], [1437008013.604962, 1725282, 0.6063790917396545], [1437008493.655091, 1735432, 0.6047347784042358], [1437008973.640165, 1745584, 0.6031648516654968], [1437009453.715067, 1755742, 0.6067507863044739], [1437009933.765712, 1765896, 0.6062817573547363], [1437010413.632128, 1776052, 0.609245240688324], [1437010893.66766, 1786195, 0.6066284775733948], [1437011373.636164, 1796346, 0.6102170944213867], [1437011853.631224, 1806481, 0.609173595905304], [1437012333.706205, 1816617, 0.6035751104354858], [1437012813.61987, 1826754, 0.604059636592865], [1437013293.479904, 1836883, 0.6039224863052368], [1437013773.604574, 1847029, 0.5974730849266052], [1437014253.618884, 1857175, 0.6040806174278259], [1437014733.756419, 1867312, 0.6017186045646667], [1437015213.638607, 1877459, 0.5987159609794617], [1437015693.625763, 1887608, 0.6047909259796143], [1437016173.63194, 1897759, 0.6033824682235718], [1437016653.609074, 1907909, 0.6038352847099304], [1437017133.717601, 1918074, 0.6083348989486694], [1437017613.716011, 1928220, 0.6044996380805969], [1437018093.626005, 1938377, 0.6009799242019653], [1437018573.626522, 1948523, 0.60047847032547], [1437019053.648174, 1958678, 0.6019382476806641], [1437019533.803011, 1968831, 0.6007305383682251], [1437020013.667751, 1978978, 0.6025127172470093], [1437020493.659028, 1989133, 0.6051828861236572], [1437020973.657346, 1999287, 0.6085876822471619], [1437021453.650634, 2009437, 0.6065122485160828], [1437021933.848661, 2019588, 0.6084572076797485], [1437022413.674963, 2029736, 0.6065473556518555], [1437022893.69086, 2039894, 0.6075063347816467], [1437023373.68883, 2050054, 0.6095973253250122], [1437023853.686116, 2060205, 0.6047213077545166], [1437024333.763876, 2070362, 0.6034210324287415], [1437024813.707845, 2080507, 0.6008927822113037], [1437025293.483294, 2090645, 0.604469895362854], [1437025773.695712, 2100793, 0.6068717837333679], [1437026253.672994, 2110943, 0.6099737882614136], [1437026733.780775, 2121094, 
0.6105009317398071], [1437027213.617849, 2131235, 0.611957311630249], [1437027693.694451, 2141382, 0.6141949892044067], [1437028173.68596, 2151537, 0.6135279536247253], [1437028653.584833, 2161685, 0.6111017465591431], [1437029133.792483, 2171839, 0.6135671138763428], [1437029613.661672, 2181977, 0.6112024188041687], [1437030093.641009, 2192118, 0.6097264289855957], [1437030573.656274, 2202268, 0.6097284555435181], [1437031053.643631, 2212416, 0.6121350526809692], [1437031533.777478, 2222583, 0.6147991418838501], [1437032013.704008, 2232736, 0.6118316054344177], [1437032493.638393, 2242882, 0.6191433072090149], [1437032973.684986, 2253041, 0.6188027262687683], [1437033453.699562, 2263183, 0.6163974404335022], [1437033933.918074, 2273320, 0.6144159436225891], [1437034413.596351, 2283443, 0.6123769879341125], [1437034893.640496, 2293579, 0.6139131188392639], [1437035373.637761, 2303701, 0.6150627136230469], [1437035853.669947, 2313823, 0.6149951219558716], [1437036333.78905, 2323961, 0.6155945658683777], [1437036813.699727, 2334089, 0.613308310508728], [1437037293.662592, 2344235, 0.6153736114501953], [1437037773.66716, 2354364, 0.6160987615585327], [1437038253.603687, 2364507, 0.611574113368988], [1437038733.78864, 2374644, 0.6145234107971191], [1437039213.641799, 2384782, 0.6117951273918152], [1437039693.687078, 2394923, 0.6129845380783081], [1437040173.635717, 2405058, 0.6095831394195557], [1437040653.673331, 2415194, 0.6110679507255554], [1437041133.764768, 2425322, 0.6099690198898315], [1437041613.629279, 2435449, 0.6105908155441284], [1437042093.703985, 2445575, 0.6124749779701233], [1437042573.496029, 2455712, 0.6118302345275879], [1437043053.686022, 2465844, 0.6094756722450256], [1437043533.731929, 2475974, 0.6094986796379089], [1437044013.636245, 2486095, 0.6114639639854431], [1437044493.69923, 2496238, 0.6101082563400269], [1437044973.652155, 2506373, 0.6105718612670898], [1437045453.691467, 2516497, 0.6115666627883911], [1437045933.935804, 2526637, 0.6128115653991699], [1437046413.635583, 2536770, 0.6122986078262329], [1437046893.626337, 2546896, 0.6142017245292664], [1437047373.67437, 2557029, 0.6111341714859009], [1437047853.652939, 2567169, 0.611350417137146], [1437048333.778436, 2577306, 0.6126709580421448], [1437048813.654248, 2587433, 0.6111524105072021], [1437049293.610609, 2597552, 0.6135894060134888], [1437049773.646573, 2607690, 0.6136029362678528], [1437050253.667925, 2617808, 0.6141685843467712], [1437050733.735291, 2627933, 0.6170881390571594], [1437051213.620222, 2638053, 0.6189730167388916], [1437051693.601978, 2648171, 0.6157540678977966], [1437052173.634985, 2658299, 0.6178646683692932], [1437052653.687176, 2668425, 0.6164441108703613], [1437053133.762819, 2678556, 0.6175132393836975], [1437053613.643698, 2688671, 0.6158696413040161], [1437054093.673047, 2698804, 0.6162974238395691], [1437054573.667371, 2708956, 0.6160892844200134], [1437055053.650441, 2719087, 0.6176281571388245], [1437055533.778469, 2729219, 0.6165231466293335], [1437056013.694082, 2739343, 0.6171510219573975], [1437056493.674871, 2749458, 0.6124134659767151], [1437056973.700234, 2759575, 0.6120688319206238], [1437057453.666129, 2769697, 0.6126770377159119], [1437057933.848506, 2779821, 0.6126595139503479], [1437058413.643799, 2789941, 0.616513729095459], [1437058893.715386, 2800076, 0.6130264401435852], [1437059373.62596, 2810207, 0.6114044785499573], [1437059853.650848, 2820334, 0.6077002882957458], [1437060333.792248, 2830465, 0.6086235046386719], [1437060813.682955, 2840600, 
0.6084680557250977], [1437061293.681795, 2850745, 0.6094310879707336], [1437061773.691182, 2860880, 0.6066345572471619], [1437062253.662987, 2871013, 0.6094250082969666], [1437062733.760419, 2881153, 0.609106719493866], [1437063213.651969, 2891278, 0.6080747246742249], [1437063693.723523, 2901406, 0.6081057786941528], [1437064173.68663, 2911533, 0.6066460609436035], [1437064653.547643, 2921667, 0.6057829856872559], [1437065133.62645, 2931813, 0.6092885136604309], [1437065613.566569, 2941947, 0.6089289784431458], [1437066093.537804, 2952102, 0.6070758700370789], [1437066573.529332, 2962243, 0.6096142530441284], [1437067053.520098, 2972400, 0.609714925289154], [1437067533.605733, 2982561, 0.6116167306900024], [1437068013.535467, 2992698, 0.6119107007980347], [1437068493.559976, 3002839, 0.6119140386581421], [1437068973.558743, 3012983, 0.6115538477897644], [1437069453.562661, 3023116, 0.6126777529716492], [1437069933.627071, 3033256, 0.6146017909049988], [1437070413.574131, 3043386, 0.6119789481163025], [1437070893.658803, 3053528, 0.6139205694198608], [1437071373.638711, 3063659, 0.612362802028656], [1437071853.621384, 3073794, 0.6109192371368408], [1437072333.665269, 3083926, 0.6141091585159302], [1437072813.584388, 3094040, 0.6132751703262329], [1437073293.569178, 3104172, 0.6132386922836304]] \ No newline at end of file diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run1_tag_bar_2Fsquare.json b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run1_tag_bar_2Fsquare.json new file mode 100644 index 00000000000..6d584fb4a9e --- /dev/null +++ b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run1_tag_bar_2Fsquare.json @@ -0,0 +1 @@ +[[0.0, 0, 0.0], [10.0, 1, 1.0], [20.0, 2, 4.0], [30.0, 3, 9.0], [40.0, 4, 16.0]] \ No newline at end of file diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run1_tag_foo_2Fcos.json b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run1_tag_foo_2Fcos.json new file mode 100644 index 00000000000..025eaa16e93 --- /dev/null +++ b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run1_tag_foo_2Fcos.json @@ -0,0 +1 @@ +[[0.0, 0, 1.0], [10.0, 1, 0.5403022766113281], [20.0, 2, -0.416146844625473], [30.0, 3, -0.9899924993515015], [40.0, 4, -0.6536436080932617]] \ No newline at end of file diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run1_tag_foo_2Fsin.json b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run1_tag_foo_2Fsin.json new file mode 100644 index 00000000000..eae69dd78f3 --- /dev/null +++ b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run1_tag_foo_2Fsin.json @@ -0,0 +1 @@ +[[0.0, 0, 0.0], [10.0, 1, 0.8414709568023682], [20.0, 2, 0.9092974066734314], [30.0, 3, 0.14112000167369843], [40.0, 4, -0.756802499294281]] \ No newline at end of file diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run1_tag_foo_2Fsquare.json b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run1_tag_foo_2Fsquare.json new file mode 100644 index 00000000000..6d584fb4a9e --- /dev/null +++ b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run1_tag_foo_2Fsquare.json @@ -0,0 +1 @@ +[[0.0, 0, 0.0], [10.0, 1, 1.0], [20.0, 2, 4.0], [30.0, 3, 9.0], [40.0, 4, 16.0]] \ No newline at end of file diff --git 
a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run2_tag_bar_2Fsquare.json b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run2_tag_bar_2Fsquare.json new file mode 100644 index 00000000000..6d584fb4a9e --- /dev/null +++ b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run2_tag_bar_2Fsquare.json @@ -0,0 +1 @@ +[[0.0, 0, 0.0], [10.0, 1, 1.0], [20.0, 2, 4.0], [30.0, 3, 9.0], [40.0, 4, 16.0]] \ No newline at end of file diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run2_tag_foo_2Fcos.json b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run2_tag_foo_2Fcos.json new file mode 100644 index 00000000000..dd3593f9d10 --- /dev/null +++ b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run2_tag_foo_2Fcos.json @@ -0,0 +1 @@ +[[0.0, 0, 2.0], [10.0, 1, 1.0806045532226562], [20.0, 2, -0.832293689250946], [30.0, 3, -1.979984998703003], [40.0, 4, -1.3072872161865234]] \ No newline at end of file diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run2_tag_foo_2Fsquare.json b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run2_tag_foo_2Fsquare.json new file mode 100644 index 00000000000..0ff9ef0551d --- /dev/null +++ b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run2_tag_foo_2Fsquare.json @@ -0,0 +1 @@ +[[0.0, 0, 0.0], [10.0, 1, 2.0], [20.0, 2, 8.0], [30.0, 3, 18.0], [40.0, 4, 32.0]] \ No newline at end of file diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/index.html b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/index.html index 02646c6c180..78f657b4104 100644 --- a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/index.html +++ b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/index.html @@ -16,33 +16,55 @@ See the License for the specific language governing permissions and limitations under the License. --> - - - - - - Event Dashboard Demo Demo - - - + + + + + + + +Scalar Dashboard Demo + + + + diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/tf-scalar-dashboard.html b/tensorflow/tensorboard/components/tf_scalar_dashboard/tf-scalar-dashboard.html index d4688bb7c48..848ed5292de 100644 --- a/tensorflow/tensorboard/components/tf_scalar_dashboard/tf-scalar-dashboard.html +++ b/tensorflow/tensorboard/components/tf_scalar_dashboard/tf-scalar-dashboard.html @@ -23,6 +23,7 @@ limitations under the License. + @@ -58,54 +59,64 @@ contains vz-line-charts embedded inside tf-panes-helper's. @@ -237,4 +237,5 @@ limitations under the License. 
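The demo scalar files added above all share TensorBoard's scalar-series layout: a JSON array of [wall_time, step, value] triples, with wall_time in seconds (note that each run2 foo/* series is exactly twice its run1 counterpart). As a rough illustration of consuming that layout, a small TypeScript reader might look like the sketch below; the `parseScalars` helper is hypothetical and not part of this change.

```ts
// Each point in the demo scalar files above is a [wall_time, step, value]
// triple, with wall_time in seconds since the epoch.
type ScalarPoint = [number, number, number];

// Hypothetical helper (not part of this diff): convert one demo file's
// contents into named fields suitable for charting.
function parseScalars(
    json: string): Array<{wallTime: Date, step: number, value: number}> {
  return (JSON.parse(json) as ScalarPoint[]).map(([wallTime, step, value]) => ({
    wallTime: new Date(wallTime * 1000),  // seconds -> milliseconds
    step,
    value,
  }));
}

// e.g. parseScalars('[[0.0, 0, 0.0], [10.0, 1, 1.0]]')[1].value === 1.0
```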
+ diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-inspector-panel.ts b/tensorflow/tensorboard/components/vz_projector/vz-projector-inspector-panel.ts index 20dc67167f9..3ee2c2165f2 100644 --- a/tensorflow/tensorboard/components/vz_projector/vz-projector-inspector-panel.ts +++ b/tensorflow/tensorboard/components/vz_projector/vz-projector-inspector-panel.ts @@ -17,6 +17,7 @@ import {DistanceFunction, SpriteAndMetadataInfo, State} from './data'; import * as knn from './knn'; import {ProjectorEventContext} from './projectorEventContext'; import * as adapter from './projectorScatterPlotAdapter'; +import * as util from './util'; import * as vector from './vector'; import {Projector} from './vz-projector'; import {ProjectorInput} from './vz-projector-input'; @@ -40,23 +41,24 @@ export class InspectorPanel extends PolymerClass { private selectedMetadataField: string; private metadataFields: string[]; - private dom: d3.Selection; private projector: Projector; private selectedPointIndices: number[]; private neighborsOfFirstPoint: knn.NearestEntry[]; private searchBox: ProjectorInput; - private resetFilterButton: d3.Selection; - private setFilterButton: d3.Selection; - private clearSelectionButton: d3.Selection; - private limitMessage: d3.Selection; + private resetFilterButton: HTMLButtonElement; + private setFilterButton: HTMLButtonElement; + private clearSelectionButton: HTMLButtonElement; + private limitMessage: HTMLDivElement; ready() { - this.dom = d3.select(this); - this.resetFilterButton = this.dom.select('.reset-filter'); - this.setFilterButton = this.dom.select('.set-filter'); - this.clearSelectionButton = this.dom.select('.clear-selection'); - this.limitMessage = this.dom.select('.limit-msg'); + this.resetFilterButton = + this.querySelector('.reset-filter') as HTMLButtonElement; + this.setFilterButton = + this.querySelector('.set-filter') as HTMLButtonElement; + this.clearSelectionButton = + this.querySelector('.clear-selection') as HTMLButtonElement; + this.limitMessage = this.querySelector('.limit-msg') as HTMLDivElement; this.searchBox = this.querySelector('#search-box') as ProjectorInput; // https://www.polymer-project.org/1.0/docs/devguide/styling#scope-subtree this.scopeSubtree(this, true); @@ -88,7 +90,7 @@ export class InspectorPanel extends PolymerClass { } private enableResetFilterButton(enabled: boolean) { - this.resetFilterButton.attr('disabled', enabled ? null : true); + this.resetFilterButton.disabled = !enabled; } restoreUIFromBookmark(bookmark: State) { @@ -113,143 +115,178 @@ export class InspectorPanel extends PolymerClass { } private updateSearchResults(indices: number[]) { - let container = this.dom.select('.matches-list'); - container.style('display', indices.length ? null : 'none'); - let list = container.select('.list'); - list.html(''); + const container = this.querySelector('.matches-list') as HTMLDivElement; + container.style.display = indices.length ? null : 'none'; + const list = container.querySelector('.list') as HTMLDivElement; + list.innerHTML = ''; if (indices.length === 0) { return; } - this.limitMessage.style( - 'display', indices.length <= LIMIT_RESULTS ? 'none' : null); + + this.limitMessage.style.display = + indices.length <= LIMIT_RESULTS ? 
'none' : null; indices = indices.slice(0, LIMIT_RESULTS); - let rows = list.selectAll('.row').data(indices).enter().append('div').attr( - 'class', 'row'); - rows.append('a') - .attr('class', 'label') - .attr('title', index => this.getLabelFromIndex(index)) - .text(index => this.getLabelFromIndex(index)); - rows.on('mouseenter', index => { - this.projectorEventContext.notifyHoverOverPoint(index); - }); - rows.on('mouseleave', () => { - this.projectorEventContext.notifyHoverOverPoint(null); - }); - rows.on('click', index => { - this.projectorEventContext.notifySelectionChanged([index]); - }); + + for (let i = 0; i < indices.length; i++) { + const index = indices[i]; + + const row = document.createElement('div'); + row.className = 'row'; + + const label = this.getLabelFromIndex(index); + const rowLink = document.createElement('a'); + rowLink.className = 'label'; + rowLink.title = label; + rowLink.innerText = label; + + rowLink.onmouseenter = () => { + this.projectorEventContext.notifyHoverOverPoint(index); + }; + rowLink.onmouseleave = () => { + this.projectorEventContext.notifyHoverOverPoint(null); + }; + rowLink.onclick = () => { + this.projectorEventContext.notifySelectionChanged([index]); + }; + + row.appendChild(rowLink); + list.appendChild(row); + } } private getLabelFromIndex(pointIndex: number): string { - let point = this.projector.dataSet.points[pointIndex]; + const point = this.projector.dataSet.points[pointIndex]; return point.metadata[this.selectedMetadataField].toString(); } private updateNeighborsList(neighbors: knn.NearestEntry[]) { - let nnlist = this.dom.select('.nn-list'); - nnlist.html(''); - this.dom.select('.nn').style('display', neighbors.length ? null : 'none'); + const nnlist = this.querySelector('.nn-list') as HTMLDivElement; + nnlist.innerHTML = ''; + + (this.querySelector('.nn') as HTMLDivElement).style.display = + neighbors.length ? null : 'none'; if (neighbors.length === 0) { return; } this.searchBox.message = ''; - let minDist = neighbors.length > 0 ? neighbors[0].dist : 0; - let n = nnlist.selectAll('.neighbor') - .data(neighbors) - .enter() - .append('div') - .attr('class', 'neighbor') - .append('a') - .attr('class', 'neighbor-link') - .attr('title', d => this.getLabelFromIndex(d.index)); + const minDist = neighbors.length > 0 ? 
neighbors[0].dist : 0; + for (let i = 0; i < neighbors.length; i++) { + const neighbor = neighbors[i]; - let labelValue = n.append('div').attr('class', 'label-and-value'); - labelValue.append('div') - .attr('class', 'label') - .style('color', d => adapter.dist2color(this.distFunc, d.dist, minDist)) - .text(d => this.getLabelFromIndex(d.index)); + const neighborElement = document.createElement('div'); + neighborElement.className = 'neighbor'; - labelValue.append('div') - .attr('class', 'value') - .text(d => d.dist.toFixed(3)); + const neighborElementLink = document.createElement('a'); + neighborElementLink.className = 'neighbor-link'; + neighborElementLink.title = this.getLabelFromIndex(neighbor.index); - let bar = n.append('div').attr('class', 'bar'); + const labelValueElement = document.createElement('div'); + labelValueElement.className = 'label-and-value'; - bar.append('div') - .attr('class', 'fill') - .style( - 'border-top-color', - d => { - return adapter.dist2color(this.distFunc, d.dist, minDist); - }) - .style( - 'width', - d => adapter.normalizeDist(this.distFunc, d.dist, minDist) * 100 + - '%'); + const labelElement = document.createElement('div'); + labelElement.className = 'label'; + labelElement.style.color = + adapter.dist2color(this.distFunc, neighbor.dist, minDist); + labelElement.innerText = this.getLabelFromIndex(neighbor.index); - bar.selectAll('.tick') - .data(d3.range(1, 4)) - .enter() - .append('div') - .attr('class', 'tick') - .style('left', d => d * 100 / 4 + '%'); - n.on('mouseenter', d => { - this.projectorEventContext.notifyHoverOverPoint(d.index); - }); - n.on('mouseleave', () => { - this.projectorEventContext.notifyHoverOverPoint(null); - }); - n.on('click', d => { - this.projectorEventContext.notifySelectionChanged([d.index]); - }); + const valueElement = document.createElement('div'); + valueElement.className = 'value'; + valueElement.innerText = neighbor.dist.toFixed(3); + + labelValueElement.appendChild(labelElement); + labelValueElement.appendChild(valueElement); + + const barElement = document.createElement('div'); + barElement.className = 'bar'; + + const barFillElement = document.createElement('div'); + barFillElement.className = 'fill'; + barFillElement.style.borderTopColor = + adapter.dist2color(this.distFunc, neighbor.dist, minDist); + barFillElement.style.width = + adapter.normalizeDist(this.distFunc, neighbor.dist, minDist) * 100 + + '%'; + barElement.appendChild(barFillElement); + + for (let j = 1; j < 4; j++) { + const tickElement = document.createElement('div'); + tickElement.className = 'tick'; + tickElement.style.left = j * 100 / 4 + '%'; + barElement.appendChild(tickElement); + } + + neighborElementLink.appendChild(labelValueElement); + neighborElementLink.appendChild(barElement); + neighborElement.appendChild(neighborElementLink); + nnlist.appendChild(neighborElement); + + neighborElementLink.onmouseenter = () => { + this.projectorEventContext.notifyHoverOverPoint(neighbor.index); + }; + neighborElementLink.onmouseleave = () => { + this.projectorEventContext.notifyHoverOverPoint(null); + }; + neighborElementLink.onclick = () => { + this.projectorEventContext.notifySelectionChanged([neighbor.index]); + }; + } } private updateFilterButtons(numPoints: number) { if (numPoints > 1) { - this.setFilterButton.text(`Isolate ${numPoints} points`) - .attr('disabled', null); - this.clearSelectionButton.attr('disabled', null); + this.setFilterButton.innerText = `Isolate ${numPoints} points`; + this.setFilterButton.disabled = null; + 
this.clearSelectionButton.disabled = null; } else { - this.setFilterButton.attr('disabled', true); - this.clearSelectionButton.attr('disabled', true); + this.setFilterButton.disabled = true; + this.clearSelectionButton.disabled = true; } } private setupUI(projector: Projector) { this.distFunc = vector.cosDist; - let eucDist = this.dom.select('.distance a.euclidean'); - eucDist.on('click', () => { - this.dom.selectAll('.distance a').classed('selected', false); - eucDist.classed('selected', true); + const eucDist = + this.querySelector('.distance a.euclidean') as HTMLLinkElement; + eucDist.onclick = () => { + const links = this.querySelectorAll('.distance a'); + for (let i = 0; i < links.length; i++) { + util.classed(links[i] as HTMLElement, 'selected', false); + } + util.classed(eucDist as HTMLElement, 'selected', true); + this.distFunc = vector.dist; this.projectorEventContext.notifyDistanceMetricChanged(this.distFunc); - let neighbors = projector.dataSet.findNeighbors( + const neighbors = projector.dataSet.findNeighbors( this.selectedPointIndices[0], this.distFunc, this.numNN); this.updateNeighborsList(neighbors); - }); + }; + + const cosDist = this.querySelector('.distance a.cosine') as HTMLLinkElement; + cosDist.onclick = () => { + const links = this.querySelectorAll('.distance a'); + for (let i = 0; i < links.length; i++) { + util.classed(links[i] as HTMLElement, 'selected', false); + } + util.classed(cosDist, 'selected', true); - let cosDist = this.dom.select('.distance a.cosine'); - cosDist.on('click', () => { - this.dom.selectAll('.distance a').classed('selected', false); - cosDist.classed('selected', true); this.distFunc = vector.cosDist; this.projectorEventContext.notifyDistanceMetricChanged(this.distFunc); - let neighbors = projector.dataSet.findNeighbors( + const neighbors = projector.dataSet.findNeighbors( this.selectedPointIndices[0], this.distFunc, this.numNN); this.updateNeighborsList(neighbors); - }); + }; // Called whenever the search text input changes. - let updateInput = (value: string, inRegexMode: boolean) => { + const updateInput = (value: string, inRegexMode: boolean) => { if (value == null || value.trim() === '') { this.searchBox.message = ''; this.projectorEventContext.notifySelectionChanged([]); return; } - let indices = projector.dataSet.query( + const indices = projector.dataSet.query( value, inRegexMode, this.selectedMetadataField); if (indices.length === 0) { this.searchBox.message = '0 matches.'; @@ -263,10 +300,11 @@ export class InspectorPanel extends PolymerClass { }); // Nearest neighbors controls. - let numNNInput = this.$$('#nn-slider') as HTMLInputElement; - let updateNumNN = () => { + const numNNInput = this.$$('#nn-slider') as HTMLInputElement; + const updateNumNN = () => { this.numNN = +numNNInput.value; - this.dom.select('.num-nn .nn-count').text(this.numNN); + (this.querySelector('.num-nn .nn-count') as HTMLSpanElement).innerText = + '' + this.numNN; if (this.selectedPointIndices != null) { this.projectorEventContext.notifySelectionChanged( [this.selectedPointIndices[0]]); @@ -276,22 +314,22 @@ export class InspectorPanel extends PolymerClass { updateNumNN(); // Filtering dataset. 
- this.setFilterButton.on('click', () => { + this.setFilterButton.onclick = () => { const indices = this.selectedPointIndices.concat( this.neighborsOfFirstPoint.map(n => n.index)); projector.filterDataset(indices); this.enableResetFilterButton(true); this.updateFilterButtons(0); - }); + }; - this.resetFilterButton.on('click', () => { + this.resetFilterButton.onclick = () => { projector.resetFilterDataset(); this.enableResetFilterButton(false); - }); + }; - this.clearSelectionButton.on('click', () => { + this.clearSelectionButton.onclick = () => { projector.adjustSelectionAndHover([]); - }); + }; this.enableResetFilterButton(false); } } diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-legend.html b/tensorflow/tensorboard/components/vz_projector/vz-projector-legend.html index 3fc5f4db158..4b98d8bded8 100644 --- a/tensorflow/tensorboard/components/vz_projector/vz-projector-legend.html +++ b/tensorflow/tensorboard/components/vz_projector/vz-projector-legend.html @@ -17,6 +17,7 @@ limitations under the License. + - \ No newline at end of file + +
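Several hunks in this diff (the inspector-panel changes above and the projections-panel changes below) drop d3's `selection.classed` in favor of a `util.classed` helper imported from `./util`. That helper's body is not shown in this excerpt; a minimal sketch consistent with its call sites, `classed(element, className, enabled)`, mirroring d3's semantics, could be:

```ts
// Sketch only; the real ./util.ts may differ. Adds `className` to the
// element when `enabled` is true and removes it otherwise, like d3's
// selection.classed(className, enabled).
export function classed(
    element: HTMLElement, className: string, enabled: boolean): void {
  // The two-argument form of classList.toggle adds or removes the class
  // based on the boolean `force` parameter.
  element.classList.toggle(className, enabled);
}
```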
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-legend.ts b/tensorflow/tensorboard/components/vz_projector/vz-projector-legend.ts
index d30a9554805..1c4ddf940dc 100644
--- a/tensorflow/tensorboard/components/vz_projector/vz-projector-legend.ts
+++ b/tensorflow/tensorboard/components/vz_projector/vz-projector-legend.ts
@@ -44,11 +44,6 @@ export interface ColorLegendThreshold {
 export class Legend extends LegendPolymer {
   renderInfo: ColorLegendRenderInfo;
-  dom: d3.Selection;
-
-  ready() {
-    this.dom = d3.select(this);
-  }
 
   _renderInfoChanged() {
     if (this.renderInfo == null) {
@@ -70,29 +65,32 @@
   }
 
   private getOffset(value: number): string {
-    let min = this.renderInfo.thresholds[0].value;
-    let max =
+    const min = this.renderInfo.thresholds[0].value;
+    const max =
         this.renderInfo.thresholds[this.renderInfo.thresholds.length - 1].value;
     return (100 * (value - min) / (max - min)).toFixed(2) + '%';
   }
 
   private setupLinearGradient() {
-    let linearGradient = this.dom.select('#gradient');
+    const linearGradient =
+        this.querySelector('#gradient') as SVGLinearGradientElement;
 
-    let width =
-        (this.dom.select('svg.gradient').node() as SVGElement).clientWidth;
+    const width =
+        (this.querySelector('svg.gradient') as SVGElement).clientWidth;
     // Set the svg to be the width of its parent.
-    this.dom.select('svg.gradient rect').attr('width', width);
+    (this.querySelector('svg.gradient rect') as SVGRectElement).style.width =
+        width + 'px';
 
     // Remove all children from before.
-    linearGradient.selectAll('*').remove();
+    linearGradient.innerHTML = '';
 
     // Add a child in for each gradient threshold.
     this.renderInfo.thresholds.forEach(t => {
-      linearGradient.append('stop')
-          .attr('offset', this.getOffset(t.value))
-          .attr('stop-color', t.color);
+      const stopElement =
+          document.createElementNS('http://www.w3.org/2000/svg', 'stop');
+      stopElement.setAttribute('offset', this.getOffset(t.value));
+      stopElement.setAttribute('stop-color', t.color);
+      // Without this append the <stop> is created but never attached,
+      // leaving the gradient empty.
+      linearGradient.appendChild(stopElement);
     });
   }
 }
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-metadata-card.html b/tensorflow/tensorboard/components/vz_projector/vz-projector-metadata-card.html
index ebdcd72c77d..4231a61ff30 100644
--- a/tensorflow/tensorboard/components/vz_projector/vz-projector-metadata-card.html
+++ b/tensorflow/tensorboard/components/vz_projector/vz-projector-metadata-card.html
@@ -18,6 +18,7 @@ limitations under the License.
+ + diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-metadata-card.ts b/tensorflow/tensorboard/components/vz_projector/vz-projector-metadata-card.ts index 17a4700bb5c..939300f3878 100644 --- a/tensorflow/tensorboard/components/vz_projector/vz-projector-metadata-card.ts +++ b/tensorflow/tensorboard/components/vz_projector/vz-projector-metadata-card.ts @@ -28,8 +28,6 @@ export let MetadataCardPolymer = PolymerElement({ }); export class MetadataCard extends MetadataCardPolymer { - private dom: d3.Selection; - hasMetadata: boolean; metadata: Array<{key: string, value: string}>; label: string; @@ -37,22 +35,28 @@ export class MetadataCard extends MetadataCardPolymer { private labelOption: string; private pointMetadata: PointMetadata; - ready() { - this.dom = d3.select(this); - } + private expandLessButton: HTMLButtonElement; + private expandMoreButton: HTMLButtonElement; + ready() { + this.expandLessButton = + this.querySelector('#expand-less') as HTMLButtonElement; + this.expandMoreButton = + this.querySelector('#expand-more') as HTMLButtonElement; + } /** Handles a click on the expand more icon. */ _expandMore() { (this.$$('#metadata-container') as any).toggle(); - this.dom.select('#expand-more').style('display', 'none'); - this.dom.select('#expand-less').style('display', ''); + + this.expandMoreButton.style.display = 'none'; + this.expandLessButton.style.display = ''; } /** Handles a click on the expand less icon. */ _expandLess() { (this.$$('#metadata-container') as any).toggle(); - this.dom.select('#expand-more').style('display', ''); - this.dom.select('#expand-less').style('display', 'none'); + this.expandMoreButton.style.display = ''; + this.expandLessButton.style.display = 'none'; } updateMetadata(pointMetadata?: PointMetadata) { diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-projections-panel.html b/tensorflow/tensorboard/components/vz_projector/vz-projector-projections-panel.html index cddcb2b7d08..b82f3f520b5 100644 --- a/tensorflow/tensorboard/components/vz_projector/vz-projector-projections-panel.html +++ b/tensorflow/tensorboard/components/vz_projector/vz-projector-projections-panel.html @@ -30,6 +30,7 @@ limitations under the License. + + diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-projections-panel.ts b/tensorflow/tensorboard/components/vz_projector/vz-projector-projections-panel.ts index 9df182ed489..377c6c11ad5 100644 --- a/tensorflow/tensorboard/components/vz_projector/vz-projector-projections-panel.ts +++ b/tensorflow/tensorboard/components/vz_projector/vz-projector-projections-panel.ts @@ -15,6 +15,7 @@ limitations under the License. import * as data from './data'; import {DataSet, Projection, ProjectionType, SpriteAndMetadataInfo, State} from './data'; +import * as util from './util'; import * as vector from './vector'; import {Vector} from './vector'; import {Projector} from './vz-projector'; @@ -92,13 +93,12 @@ export class ProjectionsPanel extends ProjectionsPanelPolymer { public customSelectedSearchByMetadataOption: string; /** Polymer elements. 
*/ - private dom: d3.Selection; - private runTsneButton: d3.Selection; - private stopTsneButton: d3.Selection; + private runTsneButton: HTMLButtonElement; + private stopTsneButton: HTMLButtonElement; private perplexitySlider: HTMLInputElement; private learningRateInput: HTMLInputElement; - private zDropdown: d3.Selection; - private iterationLabel: d3.Selection; + private zDropdown: HTMLElement; + private iterationLabel: HTMLElement; private customProjectionXLeftInput: ProjectorInput; private customProjectionXRightInput: ProjectorInput; @@ -121,14 +121,14 @@ export class ProjectionsPanel extends ProjectionsPanelPolymer { } ready() { - this.dom = d3.select(this); - this.zDropdown = this.dom.select('#z-dropdown'); - this.runTsneButton = this.dom.select('.run-tsne'); - this.stopTsneButton = this.dom.select('.stop-tsne'); - this.perplexitySlider = this.$$('#perplexity-slider') as HTMLInputElement; + this.zDropdown = this.querySelector('#z-dropdown') as HTMLElement; + this.runTsneButton = this.querySelector('.run-tsne') as HTMLButtonElement; + this.stopTsneButton = this.querySelector('.stop-tsne') as HTMLButtonElement; + this.perplexitySlider = + this.querySelector('#perplexity-slider') as HTMLInputElement; this.learningRateInput = - this.$$('#learning-rate-slider') as HTMLInputElement; - this.iterationLabel = this.dom.select('.run-tsne-iter'); + this.querySelector('#learning-rate-slider') as HTMLInputElement; + this.iterationLabel = this.querySelector('.run-tsne-iter') as HTMLElement; } disablePolymerChangesTriggerReprojection() { @@ -143,27 +143,33 @@ export class ProjectionsPanel extends ProjectionsPanelPolymer { if (this.perplexitySlider) { this.perplexity = +this.perplexitySlider.value; } - this.dom.select('.tsne-perplexity span').text(this.perplexity); + (this.querySelector('.tsne-perplexity span') as HTMLSpanElement).innerText = + '' + this.perplexity; } private updateTSNELearningRateFromUIChange() { if (this.learningRateInput) { this.learningRate = Math.pow(10, +this.learningRateInput.value); } - this.dom.select('.tsne-learning-rate span').text(this.learningRate); + (this.querySelector('.tsne-learning-rate span') as HTMLSpanElement) + .innerText = '' + this.learningRate; } private setupUIControls() { { const self = this; - this.dom.selectAll('.ink-tab').on('click', function() { - let id = this.getAttribute('data-tab'); - self.showTab(id); - }); + const inkTabs = this.querySelectorAll('.ink-tab'); + for (let i = 0; i < inkTabs.length; i++) { + inkTabs[i].addEventListener('click', function() { + let id = this.getAttribute('data-tab'); + self.showTab(id); + }); + } } - this.runTsneButton.on('click', () => this.runTSNE()); - this.stopTsneButton.on('click', () => this.dataSet.stopTSNE()); + this.runTsneButton.addEventListener('click', () => this.runTSNE()); + this.stopTsneButton.addEventListener( + 'click', () => this.dataSet.stopTSNE()); this.perplexitySlider.value = this.perplexity.toString(); this.perplexitySlider.addEventListener( @@ -177,8 +183,11 @@ export class ProjectionsPanel extends ProjectionsPanelPolymer { this.setupCustomProjectionInputFields(); // TODO: figure out why `--paper-input-container-input` css mixin didn't // work. 
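A note on the iteration style in the setupUIControls hunk above: the ts_web_library rule introduced later in this diff compiles with target es5, and under that target a NodeList returned by querySelectorAll is not iterable, so for...of over it would not compile; the code therefore walks results with index loops. A sketch of that idiom as a reusable helper (the helper itself is hypothetical, not in the diff):

```ts
// Hypothetical helper capturing the NodeList idiom used in this diff:
// under an ES5 target, NodeList offers length and numeric indexing but
// no iterator and no array methods, so it is walked by index.
function forEachElement(
    nodes: NodeListOf<Element>, fn: (el: HTMLElement) => void) {
  for (let i = 0; i < nodes.length; i++) {
    fn(nodes[i] as HTMLElement);
  }
}
```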
- this.dom.selectAll('paper-dropdown-menu paper-input input') - .style('font-size', '14px'); + const inputs = + this.querySelectorAll('paper-dropdown-menu paper-input input'); + for (let i = 0; i < inputs.length; i++) { + (inputs[i] as HTMLElement).style.fontSize = '14px'; + } } restoreUIFromBookmark(bookmark: State) { @@ -226,9 +235,11 @@ export class ProjectionsPanel extends ProjectionsPanelPolymer { this.updateTSNEPerplexityFromSliderChange(); this.updateTSNELearningRateFromUIChange(); if (this.iterationLabel) { - this.iterationLabel.text(bookmark.tSNEIteration.toString()); + this.iterationLabel.innerText = bookmark.tSNEIteration.toString(); + } + if (bookmark.selectedProjection != null) { + this.showTab(bookmark.selectedProjection); } - this.showTab(bookmark.selectedProjection); this.enablePolymerChangesTriggerReprojection(); } @@ -282,7 +293,11 @@ export class ProjectionsPanel extends ProjectionsPanelPolymer { // and the DOM. setZDropdownEnabled(enabled: boolean) { if (this.zDropdown) { - this.zDropdown.attr('disabled', enabled ? null : true); + if (enabled) { + this.zDropdown.removeAttribute('disabled'); + } else { + this.zDropdown.setAttribute('disabled', 'true'); + } } } @@ -296,13 +311,13 @@ export class ProjectionsPanel extends ProjectionsPanelPolymer { this.updateTSNEPerplexityFromSliderChange(); this.clearCentroids(); - this.dom.select('#tsne-sampling') - .style('display', pointCount > data.TSNE_SAMPLE_SIZE ? null : 'none'); + (this.querySelector('#tsne-sampling') as HTMLElement).style.display = + pointCount > data.TSNE_SAMPLE_SIZE ? null : 'none'; const wasSampled = (dataSet == null) ? false : (dataSet.dim[0] > data.PCA_SAMPLE_DIM || dataSet.dim[1] > data.PCA_SAMPLE_DIM); - this.dom.select('#pca-sampling') - .style('display', wasSampled ? null : 'none'); + (this.querySelector('#pca-sampling') as HTMLElement).style.display = + wasSampled ? null : 'none'; this.showTab('pca'); } @@ -332,12 +347,24 @@ export class ProjectionsPanel extends ProjectionsPanelPolymer { public showTab(id: ProjectionType) { this.currentProjection = id; - let tab = this.dom.select('.ink-tab[data-tab="' + id + '"]'); - this.dom.selectAll('.ink-tab').classed('active', false); - tab.classed('active', true); - this.dom.selectAll('.ink-panel-content').classed('active', false); - this.dom.select('.ink-panel-content[data-panel="' + id + '"]') - .classed('active', true); + const tab = + this.querySelector('.ink-tab[data-tab="' + id + '"]') as HTMLElement; + const allTabs = this.querySelectorAll('.ink-tab'); + for (let i = 0; i < allTabs.length; i++) { + util.classed(allTabs[i] as HTMLElement, 'active', false); + } + + util.classed(tab, 'active', true); + + const allTabContent = this.querySelectorAll('.ink-panel-content'); + for (let i = 0; i < allTabContent.length; i++) { + util.classed(allTabContent[i] as HTMLElement, 'active', false); + } + + util.classed( + this.querySelector('.ink-panel-content[data-panel="' + id + '"]') as + HTMLElement, + 'active', true); // guard for unit tests, where polymer isn't attached and $ doesn't exist. if (this.$ != null) { @@ -392,17 +419,17 @@ export class ProjectionsPanel extends ProjectionsPanelPolymer { } private runTSNE() { - this.runTsneButton.attr('disabled', true); - this.stopTsneButton.attr('disabled', null); + this.runTsneButton.disabled = true; + this.stopTsneButton.disabled = null; this.dataSet.projectTSNE( this.perplexity, this.learningRate, this.tSNEis3d ? 
3 : 2, (iteration: number) => { if (iteration != null) { - this.iterationLabel.text(iteration); + this.iterationLabel.innerText = '' + iteration; this.projector.notifyProjectionPositionsUpdated(); } else { - this.runTsneButton.attr('disabled', null); - this.stopTsneButton.attr('disabled', true); + this.runTsneButton.disabled = null; + this.stopTsneButton.disabled = true; } }); } @@ -422,7 +449,7 @@ export class ProjectionsPanel extends ProjectionsPanelPolymer { totalVariance += variances[this.pcaZ]; } msg += (totalVariance * 100).toFixed(1) + '%.'; - this.dom.select('#total-variance').html(msg); + (this.querySelector('#total-variance') as HTMLElement).innerHTML = msg; } private showPCA() { @@ -440,7 +467,7 @@ export class ProjectionsPanel extends ProjectionsPanelPolymer { this.projector.setProjection(projection); let numComponents = Math.min(NUM_PCA_COMPONENTS, this.dataSet.dim[1]); this.updateTotalVarianceMessage(); - this.pcaComponents = d3.range(0, numComponents).map(i => { + this.pcaComponents = util.range(numComponents).map(i => { let fracVariance = this.dataSet.fracVariancesExplained[i]; return { id: i, diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-projections-panel_test.ts b/tensorflow/tensorboard/components/vz_projector/vz-projector-projections-panel_test.ts deleted file mode 100644 index 3ce35afb743..00000000000 --- a/tensorflow/tensorboard/components/vz_projector/vz-projector-projections-panel_test.ts +++ /dev/null @@ -1,82 +0,0 @@ -/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
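The projections panel now leans on util.classed and util.range from the newly imported './util', whose implementation is not part of this diff. Judging only from the call sites, the helpers plausibly look like the sketch below; the signatures are inferred assumptions, not copies of the actual util.ts.

```ts
// Assumed shapes for the './util' helpers referenced above, inferred
// from their call sites in this diff rather than from the real util.ts.

// Mirrors d3's selection.classed(name, value) for a single element.
export function classed(
    el: HTMLElement, className: string, enabled: boolean) {
  if (enabled) {
    el.classList.add(className);
  } else {
    el.classList.remove(className);
  }
}

// Stand-in for d3.range(0, n): returns [0, 1, ..., n - 1].
export function range(count: number): number[] {
  const result: number[] = [];
  for (let i = 0; i < count; i++) {
    result.push(i);
  }
  return result;
}
```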
-==============================================================================*/ -import {State} from './data'; -import {ProjectionsPanel} from './vz-projector-projections-panel'; - -const assert = chai.assert; - -describe('restoreUIFromBookmark', () => { - it('sets the pcaX/Y properties when setting 2D component values', () => { - let projectionsPanel = document.createElement( - ProjectionsPanel.prototype.is) as ProjectionsPanel; - - spyOn(projectionsPanel, 'setZDropdownEnabled'); - - const s = new State(); - s.pcaComponentDimensions = [0, 1]; - projectionsPanel.restoreUIFromBookmark(s); - - assert.equal(0, projectionsPanel.pcaX); - assert.equal(1, projectionsPanel.pcaY); - - expect(projectionsPanel.setZDropdownEnabled).toHaveBeenCalledWith(false); - }); - - it('sets the pcaX/Y properties when setting 3D component values', () => { - let projectionsPanel = document.createElement( - ProjectionsPanel.prototype.is) as ProjectionsPanel; - - spyOn(projectionsPanel, 'setZDropdownEnabled'); - - const s = new State(); - s.pcaComponentDimensions = [0, 1, 2]; - projectionsPanel.restoreUIFromBookmark(s); - - assert.equal(0, projectionsPanel.pcaX); - assert.equal(1, projectionsPanel.pcaY); - assert.equal(2, projectionsPanel.pcaZ); - - expect(projectionsPanel.setZDropdownEnabled).toHaveBeenCalledWith(true); - }); -}); - -describe('populateBookmarkFromUI', () => { - it('gets the PCA component UI values from a 2D PCA projection', () => { - let projectionsPanel = document.createElement( - ProjectionsPanel.prototype.is) as ProjectionsPanel; - - projectionsPanel.pcaX = 0; - projectionsPanel.pcaY = 1; - projectionsPanel.pcaIs3d = false; - - const s = new State(); - projectionsPanel.populateBookmarkFromUI(s); - assert.deepEqual([0, 1], s.pcaComponentDimensions); - }); - - it('gets the PCA component UI values from a 3D PCA projection', () => { - let projectionsPanel = document.createElement( - ProjectionsPanel.prototype.is) as ProjectionsPanel; - - projectionsPanel.pcaX = 0; - projectionsPanel.pcaY = 1; - projectionsPanel.pcaZ = 2; - projectionsPanel.pcaIs3d = true; - - const s = new State(); - projectionsPanel.populateBookmarkFromUI(s); - assert.deepEqual([0, 1, 2], s.pcaComponentDimensions); - }); -}); diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector.html b/tensorflow/tensorboard/components/vz_projector/vz-projector.html index d4be2f26a5d..438ea9f4e97 100644 --- a/tensorflow/tensorboard/components/vz_projector/vz-projector.html +++ b/tensorflow/tensorboard/components/vz_projector/vz-projector.html @@ -32,6 +32,7 @@ limitations under the License. + @@ -40,6 +41,7 @@ limitations under the License. 
+ + diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector.ts b/tensorflow/tensorboard/components/vz_projector/vz-projector.ts index ba0f669e56f..bf98a4d4785 100644 --- a/tensorflow/tensorboard/components/vz_projector/vz-projector.ts +++ b/tensorflow/tensorboard/components/vz_projector/vz-projector.ts @@ -70,7 +70,6 @@ export class Projector extends ProjectorPolymer implements private originalDataSet: DataSet; private dataSetBeforeFilter: DataSet; - private dom: d3.Selection; private projectorScatterPlotAdapter: ProjectorScatterPlotAdapter; private dim: number; @@ -94,13 +93,12 @@ export class Projector extends ProjectorPolymer implements private projectionsPanel: ProjectionsPanel; private metadataCard: MetadataCard; - private statusBar: d3.Selection; + private statusBar: HTMLDivElement; private analyticsLogger: AnalyticsLogger; private eventLogging: boolean; private pageViewLogging: boolean; ready() { - this.dom = d3.select(this); logging.setDomContainer(this); this.analyticsLogger = @@ -130,7 +128,7 @@ export class Projector extends ProjectorPolymer implements this.bookmarkPanel = this.$['bookmark-panel'] as BookmarkPanel; this.bookmarkPanel.initialize(this, this as ProjectorEventContext); this.metadataCard = this.$['metadata-card'] as MetadataCard; - this.statusBar = this.dom.select('#status-bar'); + this.statusBar = this.querySelector('#status-bar') as HTMLDivElement; this.scopeSubtree(this.$$('#notification-dialog'), true); this.setupUIControls(); this.initializeDataProvider(); @@ -199,8 +197,8 @@ export class Projector extends ProjectorPolymer implements this.dataPanel.metadataChanged(spriteAndMetadata, metadataFile); // Set the container to a fixed height, otherwise in Colab the // height can grow indefinitely. - let container = this.dom.select('#container'); - container.style('height', container.property('clientHeight') + 'px'); + const container = this.querySelector('#container') as HTMLDivElement; + container.style.height = container.clientHeight + 'px'; } else { this.setCurrentDataSet(null); } @@ -226,7 +224,7 @@ export class Projector extends ProjectorPolymer implements this.dataSetFilterIndices = pointIndices; this.projectorScatterPlotAdapter.updateScatterPlotPositions(); this.projectorScatterPlotAdapter.updateScatterPlotAttributes(); - this.adjustSelectionAndHover(d3.range(selectionSize)); + this.adjustSelectionAndHover(util.range(selectionSize)); } resetFilterDataset() { @@ -387,8 +385,10 @@ export class Projector extends ProjectorPolymer implements ds.normalize(); } this.dim = (ds == null) ? 0 : ds.dim[1]; - this.dom.select('span.numDataPoints').text((ds == null) ? '0' : ds.dim[0]); - this.dom.select('span.dim').text((ds == null) ? '0' : ds.dim[1]); + (this.querySelector('span.numDataPoints') as HTMLSpanElement).innerText = + (ds == null) ? '0' : '' + ds.dim[0]; + (this.querySelector('span.dim') as HTMLSpanElement).innerText = + (ds == null) ? 
'0' : '' + ds.dim[1]; this.dataSet = ds; @@ -425,10 +425,9 @@ export class Projector extends ProjectorPolymer implements }); window.addEventListener('resize', () => { - let container = this.dom.select('#container'); - let parentHeight = - (container.node().parentNode as HTMLElement).clientHeight; - container.style('height', parentHeight + 'px'); + const container = this.querySelector('#container') as HTMLDivElement; + const parentHeight = (container.parentNode as HTMLElement).clientHeight; + container.style.height = parentHeight + 'px'; this.projectorScatterPlotAdapter.resize(); }); @@ -463,13 +462,13 @@ export class Projector extends ProjectorPolymer implements } } if (this.selectedPointIndices.length === 0) { - this.statusBar.style('display', hoverText ? null : 'none'); - this.statusBar.text(hoverText); + this.statusBar.style.display = hoverText ? null : 'none'; + this.statusBar.innerText = hoverText; } } - private getScatterContainer(): d3.Selection { - return this.dom.select('#scatter'); + private getScatterContainer(): HTMLDivElement { + return this.querySelector('#scatter') as HTMLDivElement; } private onSelectionChanged( @@ -479,8 +478,8 @@ export class Projector extends ProjectorPolymer implements this.neighborsOfFirstPoint = neighborsOfFirstPoint; let totalNumPoints = this.selectedPointIndices.length + neighborsOfFirstPoint.length; - this.statusBar.text(`Selected ${totalNumPoints} points`) - .style('display', totalNumPoints > 0 ? null : 'none'); + this.statusBar.innerText = `Selected ${totalNumPoints} points`; + this.statusBar.style.display = totalNumPoints > 0 ? null : 'none'; } setProjection(projection: Projection) { diff --git a/tensorflow/tensorboard/components/vz_sorting/BUILD b/tensorflow/tensorboard/components/vz_sorting/BUILD index ae3f6e27774..e06b8ae1979 100644 --- a/tensorflow/tensorboard/components/vz_sorting/BUILD +++ b/tensorflow/tensorboard/components/vz_sorting/BUILD @@ -1,25 +1,24 @@ -package(default_visibility = ["//tensorflow:internal"]) +package(default_visibility = ["//tensorflow/tensorboard:internal"]) -load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles") -load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library") -load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule") -load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library") +load("//tensorflow/tensorboard/defs:defs.bzl", "tensorboard_webcomponent_library") +load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library") licenses(["notice"]) # Apache 2.0 -webfiles( +ts_web_library( name = "vz_sorting", srcs = [ + "sorting.ts", "vz-sorting.html", - ":ts", ], path = "/vz-sorting", visibility = ["//visibility:public"], ) -tensorboard_typescript_genrule( - name = "ts", - srcs = ["sorting.ts"], +tensorboard_webcomponent_library( + name = "legacy", + srcs = [":vz_sorting"], + destdir = "vz-sorting", ) filegroup( @@ -27,22 +26,3 @@ filegroup( srcs = glob(["**"]), tags = ["notsan"], ) - -################################################################################ -# MARKED FOR DELETION - -tensorboard_webcomponent_library( - name = "legacy", - srcs = [ - "vz-sorting.html", - ":legacy_ts", - ], - visibility = ["//visibility:public"], - destdir = "vz-sorting", -) - -tensorboard_ts_library( - name = "legacy_ts", - srcs = ["sorting.ts"], - deps = ["//tensorflow/tensorboard/components:common_deps"], -) diff --git a/tensorflow/tensorboard/components/vz_sorting/sorting.ts b/tensorflow/tensorboard/components/vz_sorting/sorting.ts index c1a656c34b8..061184d24bf 
100644 --- a/tensorflow/tensorboard/components/vz_sorting/sorting.ts +++ b/tensorflow/tensorboard/components/vz_sorting/sorting.ts @@ -13,95 +13,95 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -module VZ.Sorting { - /** - * Compares tag names asciinumerically broken into components. - * - *
<p>
This is the comparison function used for sorting most string values in - * TensorBoard. Unlike the standard asciibetical comparator, this function - * knows that 'a10b' > 'a2b'. Fixed point and engineering notation are - * supported. This function also splits the input by slash and underscore to - * perform array comparison. Therefore it knows that 'a/a' < 'a+/a' even - * though '+' < '/' in the ASCII table. - */ - export function compareTagNames(a, b: string): number { - let ai = 0; - let bi = 0; - while (true) { - if (ai === a.length) { - return bi === b.length ? 0 : -1; - } - if (bi === b.length) { - return 1; - } - if (isDigit(a[ai]) && isDigit(b[bi])) { - const ais = ai; - const bis = bi; - ai = consumeNumber(a, ai + 1); - bi = consumeNumber(b, bi + 1); - const an = parseFloat(a.slice(ais, ai)); - const bn = parseFloat(b.slice(bis, bi)); - if (an < bn) { - return -1; - } - if (an > bn) { - return 1; - } - continue; - } - if (isBreak(a[ai])) { - if (!isBreak(b[bi])) { - return -1; - } - } else if (isBreak(b[bi])) { - return 1; - } else if (a[ai] < b[bi]) { +/** + * Compares tag names asciinumerically broken into components. + * + *
<p>
This is the comparison function used for sorting most string values in + * TensorBoard. Unlike the standard asciibetical comparator, this function + * knows that 'a10b' > 'a2b'. Fixed point and engineering notation are + * supported. This function also splits the input by slash and underscore to + * perform array comparison. Therefore it knows that 'a/a' < 'a+/a' even + * though '+' < '/' in the ASCII table. + */ +export function compareTagNames(a, b: string): number { + let ai = 0; + let bi = 0; + while (true) { + if (ai === a.length) { + return bi === b.length ? 0 : -1; + } + if (bi === b.length) { + return 1; + } + if (isDigit(a[ai]) && isDigit(b[bi])) { + const ais = ai; + const bis = bi; + ai = consumeNumber(a, ai + 1); + bi = consumeNumber(b, bi + 1); + const an = parseFloat(a.slice(ais, ai)); + const bn = parseFloat(b.slice(bis, bi)); + if (an < bn) { return -1; - } else if (a[ai] > b[bi]) { + } + if (an > bn) { return 1; } - ai++; - bi++; + continue; } - } - - function consumeNumber(s: string, i: number): number { - enum State { NATURAL, REAL, EXPONENT_SIGN, EXPONENT } - let state = State.NATURAL; - for (; i < s.length; i++) { - if (state === State.NATURAL) { - if (s[i] === '.') { - state = State.REAL; - } else if (s[i] === 'e' || s[i] === 'E') { - state = State.EXPONENT_SIGN; - } else if (!isDigit(s[i])) { - break; - } - } else if (state === State.REAL) { - if (s[i] === 'e' || s[i] === 'E') { - state = State.EXPONENT_SIGN; - } else if (!isDigit(s[i])) { - break; - } - } else if (state === State.EXPONENT_SIGN) { - if (isDigit(s[i]) || s[i] === '+' || s[i] === '-') { - state = State.EXPONENT; - } else { - break; - } - } else if (state === State.EXPONENT) { - if (!isDigit(s[i])) { - break; - } + if (isBreak(a[ai])) { + if (!isBreak(b[bi])) { + return -1; } + } else if (isBreak(b[bi])) { + return 1; + } else if (a[ai] < b[bi]) { + return -1; + } else if (a[ai] > b[bi]) { + return 1; } - return i; - } - - function isDigit(c: string): boolean { return '0' <= c && c <= '9'; } - - function isBreak(c: string): boolean { - // TODO(jart): Remove underscore when people stop using it like a slash. - return c === '/' || c === '_' || isDigit(c); + ai++; + bi++; } } + +function consumeNumber(s: string, i: number): number { + enum State { NATURAL, REAL, EXPONENT_SIGN, EXPONENT } + let state = State.NATURAL; + for (; i < s.length; i++) { + if (state === State.NATURAL) { + if (s[i] === '.') { + state = State.REAL; + } else if (s[i] === 'e' || s[i] === 'E') { + state = State.EXPONENT_SIGN; + } else if (!isDigit(s[i])) { + break; + } + } else if (state === State.REAL) { + if (s[i] === 'e' || s[i] === 'E') { + state = State.EXPONENT_SIGN; + } else if (!isDigit(s[i])) { + break; + } + } else if (state === State.EXPONENT_SIGN) { + if (isDigit(s[i]) || s[i] === '+' || s[i] === '-') { + state = State.EXPONENT; + } else { + break; + } + } else if (state === State.EXPONENT) { + if (!isDigit(s[i])) { + break; + } + } + } + return i; +} + +function isDigit(c: string): boolean { + return '0' <= c && c <= '9'; +} + +function isBreak(c: string): boolean { + // TODO(jart): Remove underscore when people stop using it like a slash. 
+ return c === '/' || c === '_' || isDigit(c); +} diff --git a/tensorflow/tensorboard/components/vz_sorting/test/BUILD b/tensorflow/tensorboard/components/vz_sorting/test/BUILD index f8b01b61f29..929e80d3728 100644 --- a/tensorflow/tensorboard/components/vz_sorting/test/BUILD +++ b/tensorflow/tensorboard/components/vz_sorting/test/BUILD @@ -1,35 +1,37 @@ -package(default_visibility = ["//tensorflow:internal"]) +package( + default_testonly = True, + default_visibility = ["//tensorflow/tensorboard:internal"], +) -load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library") -load("//tensorflow/tensorboard:defs.bzl", "tensorboard_wct_test_suite") +load("//tensorflow/tensorboard/defs:vulcanize.bzl", "tensorboard_html_binary") +load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library") licenses(["notice"]) # Apache 2.0 +ts_web_library( + name = "test", + srcs = [ + "sortingTests.ts", + "tests.html", + ], + path = "/vz-sorting/test", + deps = [ + "//tensorflow/tensorboard/components/tf_imports:web_component_tester", + "//tensorflow/tensorboard/components/vz_sorting", + ], +) + +tensorboard_html_binary( + name = "devserver", + compilation_level = "WHITESPACE_ONLY", + input_path = "/vz-sorting/test/tests.html", + output_path = "/vz-sorting/test/tests.html", + deps = [":test"], +) + filegroup( name = "all_files", + testonly = 0, srcs = glob(["**"]), tags = ["notsan"], ) - -################################################################################ -# MARKED FOR DELETION - -tensorboard_wct_test_suite( - name = "legacy_test", - size = "medium", - srcs = ["index.html"], - deps = [ - "//tensorflow/tensorboard/components/vz_sorting:legacy", - "//third_party/javascript/polymer/v1/webcomponentsjs:lib", - ], -) - -tensorboard_ts_library( - name = "legacy_ts", - testonly = 1, - srcs = ["sortingTests.ts"], - deps = [ - "//tensorflow/tensorboard/components:common_deps", - "//tensorflow/tensorboard/components/vz_sorting:legacy_ts", - ], -) diff --git a/tensorflow/tensorboard/components/vz_sorting/test/index.html b/tensorflow/tensorboard/components/vz_sorting/test/index.html deleted file mode 100644 index 7148bfb4181..00000000000 --- a/tensorflow/tensorboard/components/vz_sorting/test/index.html +++ /dev/null @@ -1,23 +0,0 @@ - - - - - - - - diff --git a/tensorflow/tensorboard/components/vz_sorting/test/sortingTests.ts b/tensorflow/tensorboard/components/vz_sorting/test/sortingTests.ts index 4dba3e35b9b..510685cb4b5 100644 --- a/tensorflow/tensorboard/components/vz_sorting/test/sortingTests.ts +++ b/tensorflow/tensorboard/components/vz_sorting/test/sortingTests.ts @@ -13,69 +13,65 @@ See the License for the specific language governing permissions and limitations under the License. 
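With sorting.ts unwrapped from the VZ.Sorting namespace into plain ES6 module exports, consumers import the comparator directly instead of reading a global, which is exactly what the rewritten tests below do. A consumer-side sketch with made-up tag names:

```ts
// Before: a global namespace populated by script concatenation.
// const cmp = VZ.Sorting.compareTagNames;

// After: an explicit module import, resolvable by ts_web_library.
import {compareTagNames} from './sorting';

const tags = ['run_2/loss', 'run_10/loss', 'run_1/loss'];
tags.sort(compareTagNames);
// Numeric-aware result: ['run_1/loss', 'run_2/loss', 'run_10/loss'];
// a plain lexicographic sort would put 'run_10' before 'run_2'.
```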
==============================================================================*/ -module VZ.Sorting { +import {compareTagNames} from '../sorting'; + +describe('compareTagNames', () => { const assert = chai.assert; + const sortTagNames = (a) => a.sort(compareTagNames); - describe('compareTagNames', () => { - - const sortTagNames = (a) => a.sort(compareTagNames); - - it('is asciibetical', () => { - assert.deepEqual(sortTagNames(['a', 'b']), ['a', 'b']); - assert.deepEqual(sortTagNames(['a', 'B']), ['B', 'a']); - }); - - it('sorts integer portions', () => { - assert.deepEqual(['03', '1'].sort(), ['03', '1']); - assert.deepEqual(sortTagNames(['03', '1']), ['1', '03']); - assert.deepEqual(sortTagNames(['a03', 'a1']), ['a1', 'a03']); - assert.deepEqual(sortTagNames(['a03', 'b1']), ['a03', 'b1']); - assert.deepEqual(sortTagNames(['x0a03', 'x0a1']), ['x0a1', 'x0a03']); - assert.deepEqual(sortTagNames(['a/b/03', 'a/b/1']), ['a/b/1', 'a/b/03']); - }); - - it('sorts fixed point numbers', () => { - assert.deepEqual(sortTagNames(['a0.1', 'a0.01']), ['a0.01', 'a0.1']); - }); - - it('sorts engineering notation', () => { - assert.deepEqual(sortTagNames(['a1e9', 'a9e8']), ['a9e8', 'a1e9']); - assert.deepEqual(sortTagNames(['a1e+9', 'a9e+8']), ['a9e+8', 'a1e+9']); - assert.deepEqual(sortTagNames(['a1e+5', 'a9e-6']), ['a9e-6', 'a1e+5']); - assert.deepEqual( - sortTagNames(['a1.0e9', 'a9.0e8']), ['a9.0e8', 'a1.0e9']); - assert.deepEqual( - sortTagNames(['a1.0e+9', 'a9.0e+8']), ['a9.0e+8', 'a1.0e+9']); - }); - - it('is componentized by slash', () => { - assert.deepEqual(['a+/a', 'a/a', 'ab/a'].sort(), ['a+/a', 'a/a', 'ab/a']); - assert.deepEqual( - sortTagNames(['a+/a', 'a/a', 'ab/a']), ['a/a', 'a+/a', 'ab/a']); - }); - - it('is componentized by underscore', () => { - assert.deepEqual( - sortTagNames(['a+_a', 'a_a', 'ab_a']), ['a_a', 'a+_a', 'ab_a']); - assert.deepEqual( - sortTagNames(['a+/a', 'a_a', 'ab_a']), ['a_a', 'a+/a', 'ab_a']); - }); - - it('is componentized by number boundaries', () => { - assert.deepEqual( - sortTagNames(['a+0a', 'a0a', 'ab0a']), ['a0a', 'a+0a', 'ab0a']); - }); - - it('empty comes first', () => { - assert.deepEqual( - sortTagNames(['a', '//', '/', '']), ['', '/', '//', 'a']); - }); - - it('decimal parsed correctly', () => { - assert.deepEqual(sortTagNames(['0.2', '0.03']), ['0.03', '0.2']); - assert.deepEqual(sortTagNames(['0..2', '0..03']), ['0..2', '0..03']); - assert.deepEqual(sortTagNames(['.2', '.03']), ['.2', '.03']); - }); + it('is asciibetical', () => { + assert.deepEqual(sortTagNames(['a', 'b']), ['a', 'b']); + assert.deepEqual(sortTagNames(['a', 'B']), ['B', 'a']); }); -} + + it('sorts integer portions', () => { + assert.deepEqual(['03', '1'].sort(), ['03', '1']); + assert.deepEqual(sortTagNames(['03', '1']), ['1', '03']); + assert.deepEqual(sortTagNames(['a03', 'a1']), ['a1', 'a03']); + assert.deepEqual(sortTagNames(['a03', 'b1']), ['a03', 'b1']); + assert.deepEqual(sortTagNames(['x0a03', 'x0a1']), ['x0a1', 'x0a03']); + assert.deepEqual(sortTagNames(['a/b/03', 'a/b/1']), ['a/b/1', 'a/b/03']); + }); + + it('sorts fixed point numbers', () => { + assert.deepEqual(sortTagNames(['a0.1', 'a0.01']), ['a0.01', 'a0.1']); + }); + + it('sorts engineering notation', () => { + assert.deepEqual(sortTagNames(['a1e9', 'a9e8']), ['a9e8', 'a1e9']); + assert.deepEqual(sortTagNames(['a1e+9', 'a9e+8']), ['a9e+8', 'a1e+9']); + assert.deepEqual(sortTagNames(['a1e+5', 'a9e-6']), ['a9e-6', 'a1e+5']); + assert.deepEqual(sortTagNames(['a1.0e9', 'a9.0e8']), ['a9.0e8', 'a1.0e9']); + assert.deepEqual( 
+ sortTagNames(['a1.0e+9', 'a9.0e+8']), ['a9.0e+8', 'a1.0e+9']); + }); + + it('is componentized by slash', () => { + assert.deepEqual(['a+/a', 'a/a', 'ab/a'].sort(), ['a+/a', 'a/a', 'ab/a']); + assert.deepEqual( + sortTagNames(['a+/a', 'a/a', 'ab/a']), ['a/a', 'a+/a', 'ab/a']); + }); + + it('is componentized by underscore', () => { + assert.deepEqual( + sortTagNames(['a+_a', 'a_a', 'ab_a']), ['a_a', 'a+_a', 'ab_a']); + assert.deepEqual( + sortTagNames(['a+/a', 'a_a', 'ab_a']), ['a_a', 'a+/a', 'ab_a']); + }); + + it('is componentized by number boundaries', () => { + assert.deepEqual( + sortTagNames(['a+0a', 'a0a', 'ab0a']), ['a0a', 'a+0a', 'ab0a']); + }); + + it('empty comes first', () => { + assert.deepEqual(sortTagNames(['a', '//', '/', '']), ['', '/', '//', 'a']); + }); + + it('decimal parsed correctly', () => { + assert.deepEqual(sortTagNames(['0.2', '0.03']), ['0.03', '0.2']); + assert.deepEqual(sortTagNames(['0..2', '0..03']), ['0..2', '0..03']); + assert.deepEqual(sortTagNames(['.2', '.03']), ['.2', '.03']); + }); +}); diff --git a/tensorflow/tensorboard/components/vz_sorting/test/tests.html b/tensorflow/tensorboard/components/vz_sorting/test/tests.html new file mode 100644 index 00000000000..f92c608cdb1 --- /dev/null +++ b/tensorflow/tensorboard/components/vz_sorting/test/tests.html @@ -0,0 +1,23 @@ + + + + + + + + diff --git a/tensorflow/tensorboard/defs.bzl b/tensorflow/tensorboard/defs.bzl deleted file mode 100644 index 3488978ab2d..00000000000 --- a/tensorflow/tensorboard/defs.bzl +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright 2016 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -_DEFAULT_TYPINGS = [ - "@com_microsoft_typescript//:lib.es6.d.ts", -] - -def tensorboard_typescript_genrule(name, srcs, typings=[], **kwargs): - """Filegroup of compiled TypeScript sources. - - This is a very unsophisticated TypeScript rule where the user is responsible - for passing all typings and sources via srcs. It's meant as a stopgap because - TypeScript rules currently don't exist for Bazel. The definition of this rule - will need to evolve as more ts_library rules are migrated. 
- """ - for src in srcs: - if (src.startswith("/") or - src.endswith(".d.ts") or - not src.endswith(".ts")): - fail("srcs must be typescript sources in same package") - native.genrule( - name = name, - srcs = _DEFAULT_TYPINGS + typings + srcs, - outs = [src[:-3] + ".js" for src in srcs], - cmd = "$(location @com_microsoft_typescript//:tsc)" + - " --inlineSourceMap" + - " --inlineSources" + - " --outDir $(@D)" + - " $(SRCS)", - tools = ["@com_microsoft_typescript//:tsc"], - **kwargs - ) - -def tensorboard_ts_library(**kwargs): - """Rules referencing this will be deleted from the codebase soon.""" - pass - -def tensorboard_webcomponent_library(**kwargs): - """Rules referencing this will be deleted from the codebase soon.""" - pass - -def tensorboard_wct_test_suite(**kwargs): - """Rules referencing this will be deleted from the codebase soon.""" - pass diff --git a/tensorflow/tensorboard/defs/BUILD b/tensorflow/tensorboard/defs/BUILD new file mode 100644 index 00000000000..92a2af34048 --- /dev/null +++ b/tensorflow/tensorboard/defs/BUILD @@ -0,0 +1,14 @@ +package(default_visibility = ["//tensorflow/tensorboard:internal"]) + +licenses(["notice"])  # Apache 2.0 + +filegroup( + name = "ts_web_library_default_typings", + srcs = [ + # Ordering probably matters. + "@com_microsoft_typescript//:lib.es6.d.ts", + "@io_angular_clutz//:src/resources/closure.lib.d.ts", + "clutz.d.ts", + ], + visibility = ["//visibility:public"], +) diff --git a/tensorflow/tensorboard/defs/clutz.d.ts b/tensorflow/tensorboard/defs/clutz.d.ts new file mode 100644 index 00000000000..47cf307d261 --- /dev/null +++ b/tensorflow/tensorboard/defs/clutz.d.ts @@ -0,0 +1,19 @@ +// Copyright 2017 The TensorFlow Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// tslint:disable +declare namespace ಠ_ಠ.clutz { + interface IteratorIterable<T> extends Iterator<T>, Iterable<T> {} + interface IIterableResult<T> extends IteratorResult<T> {} +} diff --git a/tensorflow/tensorboard/defs/defs.bzl b/tensorflow/tensorboard/defs/defs.bzl new file mode 100644 index 00000000000..94e2d7c540f --- /dev/null +++ b/tensorflow/tensorboard/defs/defs.bzl @@ -0,0 +1,24 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
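The clutz.d.ts shim above declares, inside the ಠ_ಠ.clutz namespace, iteration interfaces that Clutz-generated typings for Closure libraries can refer to; they mirror shapes TypeScript already ships in lib.es6.d.ts, which the filegroup lists first since ordering probably matters. An interpretive sketch, not taken from the diff, of the built-in counterpart at work:

```ts
// Interpretive sketch: TypeScript's built-in IterableIterator<T> plays
// roughly the role ಠ_ಠ.clutz.IteratorIterable<T> plays for
// Clutz-translated Closure code; a generator satisfies it directly.
function* naturals(limit: number): IterableIterator<number> {
  for (let i = 0; i < limit; i++) {
    yield i;
  }
}
```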
+ +def tensorboard_webcomponent_library(**kwargs): + """Rules referencing this will be deleted from the codebase soon.""" + pass + +def _legacy_js_impl(target, ctx): + return struct() + +legacy_js = aspect( + implementation=_legacy_js_impl, + attr_aspects=["exports"]) diff --git a/tensorflow/tensorboard/defs/hacks.bzl b/tensorflow/tensorboard/defs/hacks.bzl new file mode 100644 index 00000000000..f1d4be79061 --- /dev/null +++ b/tensorflow/tensorboard/defs/hacks.bzl @@ -0,0 +1,80 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# TODO(jart): Merge this file into defs.bzl once that file is sync unified. + +def tensorboard_typescript_bundle( + name, + out, + namespace_srcs, + namespace_symbol_aliases={}, + namespace_symbol_aliases_public={}, + **kwargs): + """Rolls TypeScript ES6 modules into one vanilla source file without imports. + + This is a genrule wrapper that concatenates TypeScripts sources inside + namespace blocks while removing ^import lines. Because the sources themselves + are not parsed, the structure of the modules must be passed to this macro as + a Skylark data structure. + + Args: + name: Name of this build rule target. + out: Path of outputted TypeScript source file. + namespace_srcs: Multimap of namespace strings to build file targets. The + ordering of the dictionary and nested lists does not matter when + generating a typings file, but *does* matter when generating a source + file. + namespace_symbol_aliases: Map of namespace strings where each value is a + map of symbol names to fully qualified symbol names. + namespace_symbol_aliases_public: Same as namespace_symbol_aliases but the + symbol will be visible to other namespaces. + """ + cmd = ["(", "echo // GENERATED BY TENSORBOARD_TYPESCRIPT_BUNDLE"] + inputs = set() + for namespace, srcs in namespace_srcs.items(): + cmd.append("echo") + if out[-5:] == ".d.ts": + cmd.append("echo 'declare namespace %s {'" % namespace) + elif out[-3:] == ".ts": + cmd.append("echo 'module %s {'" % namespace) + else: + fail("'out' must end with .ts or .d.ts: " + out) + for symbol, canon in namespace_symbol_aliases.get(namespace, {}).items(): + cmd.append("echo 'import %s = %s;'" % (symbol, canon)) + for symbol, canon in namespace_symbol_aliases_public.get(namespace, + {}).items(): + cmd.append("echo 'export import %s = %s;'" % (symbol, canon)) + inputs += srcs + for src in srcs: + cmd.append("for f in $(locations %s); do" % src) + cmd.append(" echo") + cmd.append(" echo /////////////////////////////////////////////////////") + cmd.append(" echo // " + namespace) + cmd.append(" echo // $$f") + cmd.append(" echo /////////////////////////////////////////////////////") + cmd.append(" echo") + cmd.append(" sed 's!^import !// import !' $$f \\") + cmd.append(" | sed 's!^export declare !export !' 
\\") + cmd.append(" | sed '/^export .* from /d' \\") + cmd.append(" | sed '/^export {.*};$$/d'") + cmd.append("done") + cmd.append("echo '}'") + cmd.append(") >$@") + native.genrule( + name = name, + srcs = list(inputs), + outs = [out], + cmd = "\n".join(cmd), + **kwargs + ) diff --git a/tensorflow/tensorboard/defs/protos.bzl b/tensorflow/tensorboard/defs/protos.bzl new file mode 100644 index 00000000000..6d1982e098d --- /dev/null +++ b/tensorflow/tensorboard/defs/protos.bzl @@ -0,0 +1,27 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +load("@protobuf//:protobuf.bzl", "py_proto_library") + +def tb_proto_library(name, srcs = [], visibility = []): + py_proto_library( + name = name + "_py", + srcs = srcs, + srcs_version = "PY2AND3", + deps = ["@protobuf//:protobuf_python"], + protoc = "@protobuf//:protoc", + visibility = visibility, + default_runtime = "@protobuf//:protobuf_python", + testonly = 0, + ) \ No newline at end of file diff --git a/tensorflow/tensorboard/defs/vulcanize.bzl b/tensorflow/tensorboard/defs/vulcanize.bzl new file mode 100644 index 00000000000..6ff49a35ed7 --- /dev/null +++ b/tensorflow/tensorboard/defs/vulcanize.bzl @@ -0,0 +1,125 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +load("//tensorflow/tensorboard/defs:defs.bzl", "legacy_js") +load("@io_bazel_rules_closure//closure/private:defs.bzl", "collect_js", "unfurl", "long_path") +load("//tensorflow/tensorboard/defs:web.bzl", "web_aspect") + +def _tensorboard_html_binary(ctx): + deps = unfurl(ctx.attr.deps, provider="webfiles") + manifests = set(order="topological") + files = set() + webpaths = set() + for dep in deps: + manifests += dep.webfiles.manifests + webpaths += dep.webfiles.webpaths + files += dep.data_runfiles.files + webpaths += [ctx.attr.output_path] + closure_js_library=collect_js( + ctx, unfurl(ctx.attr.deps, provider="closure_js_library")) + + # vulcanize + jslibs = depset(ctx.files._jslibs) + closure_js_library.srcs + ctx.action( + inputs=list(manifests | files | jslibs), + outputs=[ctx.outputs.html], + executable=ctx.executable._Vulcanize, + arguments=([ctx.attr.compilation_level, + "true" if ctx.attr.testonly else "false", + ctx.attr.input_path, + ctx.attr.output_path, + ctx.outputs.html.path] + + [f.path for f in jslibs] + + [f.path for f in manifests]), + progress_message="Vulcanizing %s" % ctx.attr.input_path) + + # webfiles manifest + manifest_srcs = [struct(path=ctx.outputs.html.path, + longpath=long_path(ctx, ctx.outputs.html), + webpath=ctx.attr.output_path)] + manifest = ctx.new_file(ctx.configuration.bin_dir, + "%s.pbtxt" % ctx.label.name) + ctx.file_action( + output=manifest, + content=struct( + label=str(ctx.label), + src=manifest_srcs).to_proto()) + manifests += [manifest] + + # webfiles server + params = struct( + label=str(ctx.label), + bind="[::]:6006", + manifest=[long_path(ctx, man) for man in manifests], + external_asset=[struct(webpath=k, path=v) + for k, v in ctx.attr.external_assets.items()]) + params_file = ctx.new_file(ctx.configuration.bin_dir, + "%s_server_params.pbtxt" % ctx.label.name) + ctx.file_action(output=params_file, content=params.to_proto()) + ctx.file_action( + executable=True, + output=ctx.outputs.executable, + content="#!/bin/sh\nexec %s %s" % ( + ctx.executable._WebfilesServer.short_path, + long_path(ctx, params_file))) + + transitive_runfiles = depset() + transitive_runfiles += ctx.attr._WebfilesServer.data_runfiles.files + for dep in deps: + transitive_runfiles += dep.data_runfiles.files + return struct( + files=depset([ctx.outputs.html]), + webfiles=struct( + manifest=manifest, + manifests=manifests, + webpaths=webpaths, + dummy=ctx.outputs.html), + runfiles=ctx.runfiles( + files=ctx.files.data + [manifest, + params_file, + ctx.outputs.html, + ctx.outputs.executable], + transitive_files=transitive_runfiles)) + +tensorboard_html_binary = rule( + implementation=_tensorboard_html_binary, + executable=True, + attrs={ + "compilation_level": attr.string(default="ADVANCED"), + "input_path": attr.string(mandatory=True), + "output_path": attr.string(mandatory=True), + "data": attr.label_list(cfg="data", allow_files=True), + "deps": attr.label_list( + aspects=[ + web_aspect, + legacy_js, + ], + mandatory=True), + "external_assets": attr.string_dict(default={"/_/runfiles": "."}), + "_jslibs": attr.label( + default=Label("//tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize:jslibs"), + allow_files=True), + "_Vulcanize": attr.label( + default=Label("//tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize:Vulcanize"), + executable=True, + cfg="host"), + "_WebfilesServer": attr.label( + default=Label( + "@io_bazel_rules_closure//java/io/bazel/rules/closure/webfiles/server:WebfilesServer"), + executable=True, + cfg="host"), + }, + outputs={ 
+ "html": "%{name}.html", + }) diff --git a/tensorflow/tensorboard/defs/web.bzl b/tensorflow/tensorboard/defs/web.bzl new file mode 100644 index 00000000000..103942b0a25 --- /dev/null +++ b/tensorflow/tensorboard/defs/web.bzl @@ -0,0 +1,419 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Same as web_library but supports TypeScript.""" + +load("//tensorflow/tensorboard/defs:defs.bzl", "legacy_js") + +load("//third_party:clutz.bzl", + "CLUTZ_ATTRIBUTES", + "CLUTZ_OUTPUTS", + "clutz_aspect", + "extract_dts_from_closure_libraries") + +load("@io_bazel_rules_closure//closure/private:defs.bzl", + "CLOSURE_LIBRARY_BASE_ATTR", + "CLOSURE_LIBRARY_DEPS_ATTR", + "collect_js", + "collect_runfiles", + "convert_path_to_es6_module_name", + "create_argfile", + "difference", + "long_path", + "unfurl") + +_ASPECT_SLURP_FILE_TYPE = FileType([ + ".html", ".js", ".css", ".gss", ".png", ".jpg", ".gif", ".ico", ".svg"]) + +_CLOSURE_WORKER = attr.label( + default=Label("@io_bazel_rules_closure//java/io/bazel/rules/closure:ClosureWorker"), + executable=True, + cfg="host") + +def _ts_web_library(ctx): + if not ctx.attr.srcs: + if ctx.attr.deps: + fail("deps can not be set when srcs is not") + if not ctx.attr.exports: + fail("exports must be set if srcs is not") + if ctx.attr.path: + if not ctx.attr.path.startswith("/"): + fail("webpath must start with /") + if ctx.attr.path != "/" and ctx.attr.path.endswith("/"): + fail("webpath must not end with / unless it is /") + if "//" in ctx.attr.path: + fail("webpath must not have //") + elif ctx.attr.srcs: + fail("path must be set when srcs is set") + if "*" in ctx.attr.suppress and len(ctx.attr.suppress) != 1: + fail("when \"*\" is suppressed no other items should be present") + + # process what came before + deps = unfurl(ctx.attr.deps, provider="webfiles") + webpaths = depset() + ts_typings = depset(ctx.files._default_typings) + ts_typings_paths = depset( + [long_path(ctx, f) for f in ctx.files._default_typings]) + ts_typings_execroots = depset() + aspect_runfiles = depset() + for dep in deps: + webpaths += dep.webfiles.webpaths + if hasattr(dep.webfiles, "ts_typings"): + ts_typings += dep.webfiles.ts_typings + if hasattr(dep.webfiles, "ts_typings_paths"): + ts_typings_paths += dep.webfiles.ts_typings_paths + if hasattr(dep.webfiles, "ts_typings_execroots"): + ts_typings_execroots += dep.webfiles.ts_typings_execroots + if hasattr(dep.webfiles, "aspect_runfiles"): + aspect_runfiles += dep.webfiles.aspect_runfiles + + # process what comes now + manifest_srcs = [] + new_webpaths = [] + ts_inputs = depset() + ts_outputs = [] + ts_files = list(ts_typings_paths) + new_typings = [] + new_typings_paths = [] + new_typings_execroot = struct(inputs=[]) + execroot = struct( + inputs=[(long_path(ctx, f), f.path) for f in ctx.files._default_typings], + outputs=[], + program=[ctx.executable._tsc.path, "-p"]) + web_srcs = [] + path = ctx.attr.path + strip = _get_strip(ctx) + for src in ctx.files.srcs: + suffix = 
_get_path_relative_to_package(src) + if strip: + if not suffix.startswith(strip): + fail("Relative src path not start with '%s': %s" % (strip, suffix)) + suffix = suffix[len(strip):] + webpath = "%s/%s" % ("" if path == "/" else path, suffix) + _add_webpath(ctx, src, webpath, webpaths, new_webpaths, manifest_srcs) + if suffix.endswith(".d.ts"): + web_srcs.append(src) + entry = (webpath[1:], src.path) + new_typings.append(src) + new_typings_paths.append(entry[0]) + new_typings_execroot.inputs.append(entry) + ts_inputs += [src] + ts_files.append(entry[0]) + execroot.inputs.append(entry) + elif suffix.endswith(".ts"): + noext = suffix[:-3] + js = ctx.new_file(ctx.bin_dir, "%s.js" % noext) + dts = ctx.new_file(ctx.bin_dir, "%s.d.ts" % noext) + webpath_js = webpath[:-3] + ".js" + webpath_dts = webpath[:-3] + ".d.ts" + _add_webpath(ctx, js, webpath_js, webpaths, new_webpaths, manifest_srcs) + _add_webpath(ctx, dts, webpath_dts, webpaths, new_webpaths, manifest_srcs) + ts_inputs += [src] + ts_outputs.append(js) + ts_outputs.append(dts) + web_srcs.append(dts) + web_srcs.append(js) + ts_files.append(webpath[1:]) + execroot.inputs.append((webpath[1:], src.path)) + execroot.outputs.append((webpath_js[1:], js.path)) + execroot.outputs.append((webpath_dts[1:], dts.path)) + new_typings.append(dts) + new_typings_paths.append(webpath_dts[1:]) + new_typings_execroot.inputs.append((webpath_dts[1:], dts.path)) + else: + web_srcs.append(src) + + # get typings for closure code + clutz_dts = extract_dts_from_closure_libraries(ctx) + if clutz_dts: + entry = (long_path(ctx, clutz_dts), clutz_dts.path) + ts_inputs += [clutz_dts] + ts_files.append(entry[0]) + execroot.inputs.append(entry) + + # compile typescript + workspace = "" + if ctx.label.workspace_root: + workspace = "/" + ctx.label.workspace_root + if execroot.outputs: + ts_config = _new_file(ctx, "-tsc.json") + execroot.inputs.append(("tsconfig.json", ts_config.path)) + ctx.file_action( + output=ts_config, + content=struct( + compilerOptions=struct( + baseUrl=".", + declaration=True, + inlineSourceMap=True, + inlineSources=True, + module="es6", + moduleResolution="node", + noResolve=True, + target="es5", + ), + files=ts_files, + ).to_json()) + er_config = _new_file(ctx, "-tsc-execroot.json") + ctx.file_action(output=er_config, content=execroot.to_json()) + ts_inputs += collect_runfiles([ctx.attr._tsc]) + ts_inputs += ctx.files._tsc + ts_inputs += ts_typings + ts_inputs += ts_typings_execroots + ts_inputs += [ts_config, er_config] + ctx.action( + inputs=list(ts_inputs), + outputs=ts_outputs, + executable=ctx.executable._execrooter, + arguments=[er_config.path] + [f.path for f in ts_typings_execroots], + progress_message="Compiling %d TypeScript files %s" % ( + len(ts_files), ctx.label)) + + # perform strict dependency checking + manifest = _make_manifest(ctx, manifest_srcs) + webpaths += new_webpaths + dummy, manifests = _run_webfiles_validator(ctx, web_srcs, deps, manifest) + web_srcs.append(dummy) + + # define development web server that only applies to this transitive closure + params = struct( + label=str(ctx.label), + bind="[::]:6006", + manifest=[long_path(ctx, man) for man in manifests], + external_asset=[struct(webpath=k, path=v) + for k, v in ctx.attr.external_assets.items()]) + params_file = _new_file(ctx, "-params.pbtxt") + ctx.file_action(output=params_file, content=params.to_proto()) + ctx.file_action( + executable=True, + output=ctx.outputs.executable, + content="#!/bin/sh\nexec %s %s" % ( + ctx.executable._WebfilesServer.short_path, + 
long_path(ctx, params_file))) + + if new_typings: + er_config = _new_file(ctx, "-typings-execroot.json") + ctx.file_action(output=er_config, content=new_typings_execroot.to_json()) + ts_typings += new_typings + ts_typings_paths += new_typings_paths + ts_typings_execroots += [er_config] + else: + ts_typings = depset() + ts_typings_paths = depset() + ts_typings_execroots = depset() + + # export data to parent rules + return struct( + files=depset(web_srcs + [dummy]), + exports=unfurl(ctx.attr.exports), + webfiles=struct( + manifest=manifest, + manifests=manifests, + webpaths=webpaths, + dummy=dummy, + ts_typings=ts_typings, + ts_typings_paths=ts_typings_paths, + ts_typings_execroots=ts_typings_execroots), + closure_js_library=collect_js( + ctx, unfurl(ctx.attr.deps, provider="closure_js_library")), + runfiles=ctx.runfiles( + files=ctx.files.srcs + ctx.files.data + ts_outputs + [ + manifest, + params_file, + ctx.outputs.executable, + dummy], + transitive_files=(collect_runfiles([ctx.attr._WebfilesServer]) | + collect_runfiles(deps) | + collect_runfiles(ctx.attr.data) | + aspect_runfiles))) + +def _web_aspect_impl(target, ctx): + if hasattr(target, "webfiles"): + return struct() + srcs = [] + deps = [] + if hasattr(ctx.rule.files, "srcs"): + srcs.extend(_ASPECT_SLURP_FILE_TYPE.filter(ctx.rule.files.srcs)) + for attr in ("deps", "sticky_deps", "module_deps"): + value = getattr(ctx.rule.attr, attr, None) + if value: + deps.extend(value) + deps = unfurl(deps, provider="webfiles") + webpaths = depset() + aspect_runfiles = depset(srcs) + for dep in deps: + webpaths += dep.webfiles.webpaths + if hasattr(dep.webfiles, "aspect_runfiles"): + aspect_runfiles += dep.webfiles.aspect_runfiles + manifest_srcs = [] + new_webpaths = [] + for src in srcs: + webpath = "/" + long_path(ctx, src) + _add_webpath(ctx, src, webpath, webpaths, new_webpaths, manifest_srcs) + webpaths += new_webpaths + manifest = _make_manifest(ctx, manifest_srcs) + dummy, manifests = _run_webfiles_validator(ctx, srcs, deps, manifest) + aspect_runfiles += [dummy, manifest] + return struct( + webfiles=struct( + manifest=manifest, + manifests=manifests, + webpaths=webpaths, + dummy=dummy, + aspect_runfiles=aspect_runfiles)) + +def _make_manifest(ctx, src_list): + manifest = _new_file(ctx, "-webfiles.pbtxt") + ctx.file_action( + output=manifest, + content=struct( + label=str(ctx.label), + src=src_list).to_proto()) + return manifest + +def _run_webfiles_validator(ctx, srcs, deps, manifest): + dummy = _new_file(ctx, "-webfiles.ignoreme") + manifests = depset(order="topological") + for dep in deps: + manifests += dep.webfiles.manifests + if srcs: + args = ["WebfilesValidator", + "--dummy", dummy.path, + "--target", manifest.path] + if hasattr(ctx, "attr") and hasattr(ctx.attr, "suppress"): + for category in ctx.attr.suppress: + args.append("--suppress") + args.append(category) + inputs = [manifest] + inputs.extend(srcs) + direct_manifests = depset() + for dep in deps: + inputs.append(dep.webfiles.dummy) + for f in dep.files: + inputs.append(f) + direct_manifests += [dep.webfiles.manifest] + inputs.append(dep.webfiles.manifest) + args.append("--direct_dep") + args.append(dep.webfiles.manifest.path) + for man in difference(manifests, direct_manifests): + inputs.append(man) + args.append("--transitive_dep") + args.append(man.path) + argfile = _new_file(ctx, "-webfiles-checker-args.txt") + ctx.file_action(output=argfile, content="\n".join(args)) + inputs.append(argfile) + ctx.action( + inputs=inputs, + outputs=[dummy], + 
executable=(getattr(ctx.executable, "_ClosureWorker", None) or + getattr(ctx.executable, "_ClosureWorkerAspect", None)), + arguments=["@@" + argfile.path], + mnemonic="Closure", + execution_requirements={"supports-workers": "1"}, + progress_message="Checking webfiles %s" % ctx.label) + else: + ctx.file_action(output=dummy, content="BOO!") + manifests += [manifest] + return dummy, manifests + +def _new_file(ctx, suffix): + return ctx.new_file(ctx.bin_dir, "%s%s" % (ctx.label.name, suffix)) + +def _add_webpath(ctx, src, webpath, webpaths, new_webpaths, manifest_srcs): + if webpath in new_webpaths: + _fail(ctx, "multiple srcs within %s define the webpath %s " % ( + ctx.label, webpath)) + if webpath in webpaths: + _fail(ctx, "webpath %s was defined by %s when already defined by deps" % ( + webpath, ctx.label)) + new_webpaths.append(webpath) + manifest_srcs.append(struct( + path=src.path, + longpath=long_path(ctx, src), + webpath=webpath)) + +def _fail(ctx, message): + if ctx.attr.suppress == ["*"]: + print(message) + else: + fail(message) + +def _get_path_relative_to_package(artifact): + """Returns file path relative to the package that declared it.""" + path = artifact.path + for prefix in (artifact.root.path, + artifact.owner.workspace_root if artifact.owner else '', + artifact.owner.package if artifact.owner else ''): + if prefix: + prefix = prefix + "/" + if not path.startswith(prefix): + fail("Path %s doesn't start with %s" % (path, prefix)) + path = path[len(prefix):] + return path + +def _get_strip(ctx): + strip = ctx.attr.strip_prefix + if strip: + if strip.startswith("/"): + _fail(ctx, "strip_prefix should not start with /") + strip = strip[1:] + if strip.endswith("/"): + _fail(ctx, "strip_prefix should not end with /") + else: + strip += "/" + return strip + +web_aspect = aspect( + implementation=_web_aspect_impl, + attr_aspects=["deps", "sticky_deps", "module_deps"], + attrs={"_ClosureWorkerAspect": _CLOSURE_WORKER}) + +ts_web_library = rule( + implementation=_ts_web_library, + executable=True, + attrs=CLUTZ_ATTRIBUTES + { + "path": attr.string(), + "srcs": attr.label_list(allow_files=True), + "deps": attr.label_list( + aspects=[ + web_aspect, + clutz_aspect, + legacy_js, + ]), + "exports": attr.label_list(), + "data": attr.label_list(cfg="data", allow_files=True), + "suppress": attr.string_list(), + "strip_prefix": attr.string(), + "external_assets": attr.string_dict(default={"/_/runfiles": "."}), + "clutz_entry_points": attr.string_list(), + "_execrooter": attr.label( + default=Label("//tensorflow/tensorboard/scripts:execrooter"), + executable=True, + cfg="host"), + "_tsc": attr.label( + default=Label("@com_microsoft_typescript//:tsc"), + allow_files=True, + executable=True, + cfg="host"), + "_default_typings": attr.label( + default=Label("//tensorflow/tensorboard:ts_web_library_default_typings"), + allow_files=True), + "_WebfilesServer": attr.label( + default=Label("@io_bazel_rules_closure//java/io/bazel/rules/closure/webfiles/server:WebfilesServer"), + executable=True, + cfg="host"), + "_ClosureWorker": _CLOSURE_WORKER, + "_closure_library_base": CLOSURE_LIBRARY_BASE_ATTR, + "_closure_library_deps": CLOSURE_LIBRARY_DEPS_ATTR, + }, + outputs=CLUTZ_OUTPUTS) diff --git a/tensorflow/tensorboard/defs/zipper.bzl b/tensorflow/tensorboard/defs/zipper.bzl new file mode 100644 index 00000000000..e98309ec9a5 --- /dev/null +++ b/tensorflow/tensorboard/defs/zipper.bzl @@ -0,0 +1,54 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
diff --git a/tensorflow/tensorboard/defs/zipper.bzl b/tensorflow/tensorboard/defs/zipper.bzl
new file mode 100644
index 00000000000..e98309ec9a5
--- /dev/null
+++ b/tensorflow/tensorboard/defs/zipper.bzl
@@ -0,0 +1,54 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+load("@io_bazel_rules_closure//closure/private:defs.bzl", "unfurl", "long_path")
+
+def _tensorboard_zip_file(ctx):
+  deps = unfurl(ctx.attr.deps, provider="webfiles")
+  manifests = set(order="link")
+  files = set()
+  webpaths = set()
+  for dep in deps:
+    manifests += dep.webfiles.manifests
+    webpaths += dep.webfiles.webpaths
+    files += dep.data_runfiles.files
+  ctx.action(
+      inputs=list(manifests + files),
+      outputs=[ctx.outputs.zip],
+      executable=ctx.executable._Zipper,
+      arguments=([ctx.outputs.zip.path] +
+                 [m.path for m in manifests]),
+      progress_message="Zipping %d files" % len(webpaths))
+  transitive_runfiles = set()
+  for dep in deps:
+    transitive_runfiles += dep.data_runfiles.files
+  return struct(
+      files=set([ctx.outputs.zip]),
+      runfiles=ctx.runfiles(
+          files=ctx.files.data + [ctx.outputs.zip],
+          transitive_files=transitive_runfiles))
+
+tensorboard_zip_file = rule(
+    implementation=_tensorboard_zip_file,
+    attrs={
+        "data": attr.label_list(cfg="data", allow_files=True),
+        "deps": attr.label_list(providers=["webfiles"], mandatory=True),
+        "_Zipper": attr.label(
+            default=Label("//tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize:Zipper"),
+            executable=True,
+            cfg="host"),
+    },
+    outputs={
+        "zip": "%{name}.zip",
+    })
diff --git a/tensorflow/tensorboard/demo/BUILD b/tensorflow/tensorboard/demo/BUILD
new file mode 100644
index 00000000000..b253572ec55
--- /dev/null
+++ b/tensorflow/tensorboard/demo/BUILD
@@ -0,0 +1,20 @@
+package(default_visibility = ["//tensorflow/tensorboard:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+# THIS PACKAGE HAS MOVED
+# See tensorflow/tensorboard/components/tf_tensorboard:demo
+
+web_library(
+    name = "demo_data",
+    srcs = glob(["data/**"]),
+    path = "/",
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/demo/data/logdir b/tensorflow/tensorboard/demo/data/logdir
new file mode 100644
index 00000000000..b6362b45d77
--- /dev/null
+++ b/tensorflow/tensorboard/demo/data/logdir
@@ -0,0 +1 @@
+{"logdir": "/foo/some/fake/logdir"}
\ No newline at end of file
diff --git a/tensorflow/tensorboard/demo/index.html b/tensorflow/tensorboard/demo/index.html
deleted file mode 100644
index 581f8a27235..00000000000
--- a/tensorflow/tensorboard/demo/index.html
+++ /dev/null
@@ -1,31 +0,0 @@
[31 lines of deleted HTML omitted]
diff --git a/tensorflow/tensorboard/dist/bazel-html-imports.html b/tensorflow/tensorboard/dist/bazel-html-imports.html
deleted file mode 100644
index 2268e6d7d4c..00000000000
--- a/tensorflow/tensorboard/dist/bazel-html-imports.html
+++ /dev/null
@@ -1,23 +0,0 @@
[23 lines of deleted HTML import markup omitted]
diff --git a/tensorflow/tensorboard/dist/index.html b/tensorflow/tensorboard/dist/index.html
deleted file mode 100644
index 66fce9fe9af..00000000000
--- a/tensorflow/tensorboard/dist/index.html
+++ /dev/null
@@ -1,32 +0,0 @@
[32 lines of deleted HTML omitted; the page title was "TensorBoard"]
diff --git a/tensorflow/tensorboard/dist/tf-tensorboard.html b/tensorflow/tensorboard/dist/tf-tensorboard.html
deleted file mode 100644
index 3e077d1a73a..00000000000
--- a/tensorflow/tensorboard/dist/tf-tensorboard.html
+++ /dev/null
@@ -1,24940 +0,0 @@
[24,940 lines of deleted vulcanized HTML omitted]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/gulp_tasks/bower.js b/tensorflow/tensorboard/gulp_tasks/bower.js
deleted file mode 100644
index 7c0e515c6c9..00000000000
--- a/tensorflow/tensorboard/gulp_tasks/bower.js
+++ /dev/null
@@ -1,23 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-var gulp = require('gulp');
-var bower = require('gulp-bower');
-
-module.exports = function() {
-  return function() {
-    return bower();
-  }
-}
diff --git a/tensorflow/tensorboard/gulp_tasks/compile.js b/tensorflow/tensorboard/gulp_tasks/compile.js
deleted file mode 100644
index 3d0d725cfb2..00000000000
--- a/tensorflow/tensorboard/gulp_tasks/compile.js
+++ /dev/null
@@ -1,95 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-var gulp = require('gulp');
-var ts = require('gulp-typescript');
-var typescript = require('typescript');
-var gutil = require('gulp-util');
-var filter = require('gulp-filter');
-var merge = require('merge2');
-var browserify = require('browserify');
-var tsify = require('tsify');
-var source = require('vinyl-source-stream');
-var glob = require('glob').sync;
-var concat = require('gulp-concat');
-
-var tsProject = ts.createProject('./tsconfig.json', {
-  typescript: typescript,
-  noExternalResolve: true, // opt-in for faster compilation!
-});
-
-/** List of components (and their external deps) that are using es6 modules. */
-var ES6_COMPONENTS = [{
-  name: 'vz_projector',
-  deps: [
-    'd3/d3.min.js', 'weblas/dist/weblas.js', 'three.js/build/three.min.js',
-    'three.js/examples/js/controls/OrbitControls.js',
-    'numericjs/lib/numeric-1.2.6.js'
-  ]
-}];
-
-module.exports = function(includeDeps) {
-  return function() {
-    // Compile all components that are using ES6 modules into a bundle.js
-    // using browserify.
-    var entries = ['typings/index.d.ts'];
-    var deps = {};
-    ES6_COMPONENTS.forEach(function(component) {
-      // Collect all the typescript files across the components.
-      entries = entries.concat(glob(
-          'components/' + component.name + '/**/*.ts',
-          // Do not include tests or IDE-purposed files.
-          {ignore: ['**/*_test.ts', '**/deps.d.ts']}));
-      // Collect the unique external deps across all components using es6
-      // modules.
- component.deps.forEach(function(dep) { - deps['components/' + dep] = true; - }); - }); - deps = Object.keys(deps); - - // Compile, bundle all the typescript files and prepend their deps. - browserify(entries) - .plugin(tsify) - .bundle() - .on('error', function(error) { console.error(error.toString()); }) - .pipe(source('bundle.js')) - .pipe(gulp.dest('components')) - .on('end', function() { - // Typescript was compiled and bundled. Now we need to prepend - // the external dependencies. - if (includeDeps) { - gulp.src(deps.concat(['components/bundle.js'])) - .pipe(concat('bundle.js')) - .pipe(gulp.dest('components')); - } - }); - - // Compile components that are using global namespaces producing 1 js file - // for each ts file. - var isComponent = filter([ - 'components/tf_*/**/*.ts', 'components/vz_*/**/*.ts', 'typings/**/*.ts', - 'components/plottable/plottable.d.ts' - // Ignore components that use es6 modules. - ].concat(ES6_COMPONENTS.map(function(component) { - return '!components/' + component.name + '/**/*.ts'; - }))); - - return tsProject.src() - .pipe(isComponent) - .pipe(ts(tsProject)) - .js.pipe(gulp.dest('.')); - }; -}; diff --git a/tensorflow/tensorboard/gulp_tasks/test.js b/tensorflow/tensorboard/gulp_tasks/test.js deleted file mode 100644 index ffa8122c7b5..00000000000 --- a/tensorflow/tensorboard/gulp_tasks/test.js +++ /dev/null @@ -1,28 +0,0 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -var gulp = require('gulp'); -var tester = require('web-component-tester').test; - -module.exports = function(done) { - tester({}, function(error) { - if (error) { - // Pretty error for gulp. - error = new Error(error.message || error); - error.showStack = false; - } - done(error); - }); -} diff --git a/tensorflow/tensorboard/gulp_tasks/util.js b/tensorflow/tensorboard/gulp_tasks/util.js deleted file mode 100644 index 7a1d2a58ab6..00000000000 --- a/tensorflow/tensorboard/gulp_tasks/util.js +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -var fs = require('fs'); -var path = require('path'); - -/** - * Returns a list of web components inside the components directory for which - * the name predicate is true. 
- */ -exports.getComponents = function(namePredicate) { - return fs.readdirSync('components') - .filter(function(file) { - return fs.statSync(path.join('components', file)).isDirectory() && - namePredicate(file); - }) - .map(function(dir) { return '/' + dir + '/'; }); -}; - -/** - * Returns a list of tensorboard web components that are inside the components - * directory. - */ -exports.tbComponents = exports.getComponents(function(name) { - var prefix = name.slice(0, 3); - return prefix == 'tf_' || prefix == 'vz_'; -}); diff --git a/tensorflow/tensorboard/gulp_tasks/vulcanize.js b/tensorflow/tensorboard/gulp_tasks/vulcanize.js deleted file mode 100644 index b8cdd80af02..00000000000 --- a/tensorflow/tensorboard/gulp_tasks/vulcanize.js +++ /dev/null @@ -1,72 +0,0 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -var gulp = require('gulp'); -var path = require('path'); -var util = require('./util'); -var vulcanize = require('gulp-vulcanize'); -var replace = require('gulp-replace'); -var rename = require('gulp-rename'); -var header = require('gulp-header'); - -var HEADER_STR = '\n\n' - -var base = path.join(__dirname, '../components'); -// List of redirects of the form path1|path2 for every tensorboard component -// in order to replace dashes with underscores. -// E.g. .../tf-tensorboard|.../tf_tensorboard -var redirects = util.tbComponents.map(function(dir) { - return path.join(base, dir.replace(/_/g, '-')) + '|' + path.join(base, dir); -}); - -var nonTBComponents = util.getComponents(function(name) { - var prefix = name.slice(0, 3); - return prefix !== 'tf_' && prefix !== 'vz_'; -}); - -module.exports = function(overwrite) { - return function() { - var suffix = overwrite ? '' : '.OPENSOURCE'; - // Vulcanize TensorBoard without external libraries. - gulp.src('components/tf_tensorboard/tf-tensorboard.html') - .pipe(vulcanize({ - inlineScripts: true, - inlineCss: true, - stripComments: true, - excludes: nonTBComponents, - redirects: redirects - })) - .pipe(header(HEADER_STR)) - .pipe(rename('tf-tensorboard.html' + suffix)) - .pipe(gulp.dest('./dist')); - } -} diff --git a/tensorflow/tensorboard/gulpfile.js b/tensorflow/tensorboard/gulpfile.js deleted file mode 100644 index 257ee0ab83d..00000000000 --- a/tensorflow/tensorboard/gulpfile.js +++ /dev/null @@ -1,91 +0,0 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -var gulp = require('gulp'); -var server = require('gulp-server-livereload'); -var minimist = require('minimist'); -var util = require('./gulp_tasks/util'); - -var options = minimist(process.argv.slice(2), { - default: { - p: 8000, // port for gulp server - h: '0.0.0.0', // host to serve on - } -}); - -function getTask(task) { - return require('./gulp_tasks/' + task); -} - - -gulp.task('compile', getTask('compile')(true)); -gulp.task('first-compile', getTask('compile')(true)); -gulp.task('compile-without-deps', getTask('compile')(false)); -gulp.task('test.onlytest', getTask('test')); -gulp.task('test', ['compile'], getTask('test')); - -gulp.task('watch', [], function() { - // Avoid watching generated .d.ts in the build (aka output) directory. - return gulp.watch( - ['components/tf_*/**/*.ts', 'components/vz_*/**/*.ts'], - {ignoreInitial: true}, ['compile']); -}); - -var httpPrefix = 'http://' + options.h + ':' + options.p + '/components'; -var proxies = util.tbComponents.map(function(component) { - return { - source: '/components' + component.replace(/_/g, '-'), - target: httpPrefix + component - }; -}); - -// Do first-compile before turning on server, to avoid spamming -// livereload info -// TODO(danmane): Disconnect this once we can get livereload to -// no longer spam. -gulp.task('server', ['first-compile'], function() { - gulp.src('.').pipe(server({ - host: options.h, - port: options.p, - livereload: { - enable: true, - // Don't livereload on .ts changes, since they aren't loaded by browser. - filter: function(filePath, cb) { cb(!(/\.ts$/.test(filePath))); }, - port: 27729 + options.p - }, - proxies: proxies, - directoryListing: true, - })); -}); - -// TODO(danmane): When testing is nicer, integrate into vulcanize task -// gulp vulcanize: Regenerate the tf-tensorboard.html.OPENSOURCE file for pre-release -gulp.task( - 'vulcanize', ['compile-without-deps'], - getTask('vulcanize')(false)); -// gulp regenerate: Regenerate the tf-tensorboard.html for interactive bazel development -gulp.task( - 'regenerate', ['compile-without-deps'], - getTask('vulcanize')(true)); - -// TODO(danmane): consider making bower install part of default task -gulp.task('default', ['watch', 'server']); - -// Clean all compiled JS files. -var cleanCompiledTypeScript = require('gulp-clean-compiled-typescript'); -gulp.task('clean', function () { - return gulp.src(['./components/**/*.ts', '!./components/**/deps.d.ts']) - .pipe(cleanCompiledTypeScript()); -}); diff --git a/tensorflow/tensorboard/http_api.md b/tensorflow/tensorboard/http_api.md index 16c2f95ae1c..c2885daf93c 100644 --- a/tensorflow/tensorboard/http_api.md +++ b/tensorflow/tensorboard/http_api.md @@ -36,42 +36,43 @@ Returns a JSON object with a key "logdir" that maps to the `logdir` argument The `logdir` argument is the path of the directory that contains events files. +## `data/plugins_listing` + +Returns a dict mapping from plugin name to a boolean indicating whether the +plugin is active. A plugin might be inactive, for instance, if it lacks relevant +data. Every plugin has a key. This route helps the frontend avoid issuing +requests to an inactive plugin - the routes of an inactive plugin do not work. + ## `data/runs` -Returns a dictionary mapping from `run name` (quoted string) to dictionaries -mapping from all available tagTypes to a list of tags of that type available for -the run. 
Think of this as a comprehensive index of all of the data available -from the TensorBoard server. Here is an example: +Returns an array containing the names of all the runs known to the +TensorBoard backend at this time. Each entry is a string corresponding +to a single run. + +We guarantee that as new runs are created in the log directory, they +will always appear at the end of the list returned by this route. That +is, the order of runs is persistent, and the result of this route is an +“append-only” list. + +Example response: + + ["train_run", "eval"] + +## `/data/plugin/scalars/tags` + +Returns a dictionary mapping from `run_name` (quoted string) to arrays of +`tag_name` (quoted string), where each array contains the names of all +scalar tags present in the corresponding run. Here is an example: { - "train_run": { - "histograms": ["foo_histogram", "bar_histogram"], - "compressedHistograms": ["foo_histogram", "bar_histogram"], - "scalars": ["xent", "loss", "learning_rate"], - "images": ["input"], - "audio": ["input_audio"], - "graph": true, - "firstEventTimestamp": 123456.789 - "run_metadata": ["forward prop", "inference"] - }, - "eval": { - "histograms": ["foo_histogram", "bar_histogram"], - "compressedHistograms": ["foo_histogram", "bar_histogram"], - "scalars": ["precision", "recall"], - "images": ["input"], - "audio": ["input_audio"], - "graph": false, - "run_metadata": [] - } - } + "train_run": ["xent", "loss", "learning_rate"], + "eval": ["precision", "recall"] + } -The `firstEventTimestamp` value is in seconds since the epoch. +Note that runs without any scalar tags are included as keys with value the +empty array. -Note that the same tag may be present for many runs. It is not guaranteed that -they will have the same meaning across runs. It is also not guaranteed that they -will have the same tag type across different runs. - -## '/data/scalars?run=foo&tag=bar' +## `/data/plugin/scalars/scalars?run=foo&tag=bar` Returns an array of event_accumulator.SimpleValueEvents ([wall_time, step, value]) for the given run and tag. wall_time is seconds since epoch. @@ -93,28 +94,21 @@ format: 1443857105.704628,3438,0.5427092909812927 1443857225.705133,5417,0.5457325577735901 -## '/data/scalars?[sample_count=10]' +## `/data/plugin/histograms/tags` -Without any parameters, returns a dictionary mapping from run name to a -dictionary mapping from tag name to a sampled list of scalars from that run and -tag. The values are given in the same format as when the run and tag are -specified. For example: +Returns a dictionary mapping from `run_name` (quoted string) to arrays of +`tag_name` (quoted string), where each array contains the names of all +histogram tags present in the corresponding run. Here is an example: { - "train_run": { - "my_tag": [ - [1443856985.705543, 1448, 0.7461960315704346], - [1443857105.704628, 3438, 0.5427092909812927], - [1443857225.705133, 5417, 0.5457325577735901] - ] - } + "train_run": ["foo_histogram", "bar_histogram"], + "eval": ["foo_histogram", "bar_histogram"] } -The samples are distributed uniformly over the list of values. The sample_count -parameter is optional and defaults to 10; it must be at least 2. The first and -the last value will always be sampled. +Note that runs without any histogram tags are included as keys with +value the empty array. -## '/data/histograms?run=foo&tag=bar' +## `/data/plugin/histograms/histograms?run=foo&tag=bar` Returns an array of event_accumulator.HistogramEvents ([wall_time, step, HistogramValue]) for the given run and tag. 
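For example, a client could discover histogram tags and then fetch one series through these routes, as sketched below; the host, port, run name, and tag are assumptions for illustration, not part of the API contract.

```python
# Illustrative client for the plugin routes above; host, run and tag
# are assumptions, not part of the API contract.
import json
from urllib.request import urlopen

BASE = 'http://localhost:6006/data/plugin/histograms'

tags = json.load(urlopen(BASE + '/tags'))  # e.g. {"train_run": [...], ...}
events = json.load(urlopen(BASE + '/histograms?run=train_run&tag=loss'))
for wall_time, step, histogram_value in events:
  print(step, histogram_value)
```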
A HistogramValue is [min, max, num, @@ -141,7 +135,21 @@ Annotated Example: (note - real data is higher precision) ] ] -## '/data/compressedHistograms?run=foo&tag=bar' +## `/data/plugin/distributions/tags` + +Returns a dictionary mapping from `run_name` (quoted string) to arrays of +`tag_name` (quoted string), where each array contains the names of all +distribution tags present in the corresponding run. Here is an example: + + { + "train_run": ["foo_histogram", "bar_histogram"], + "eval": ["foo_histogram", "bar_histogram"] + } + +Note that runs without any distribution tags are included as keys with +value the empty array. + +## `/data/plugin/distributions/distributions?run=foo&tag=bar` Returns an array of event_accumulator.CompressedHistogramEvents ([wall_time, step, CompressedHistogramValues]) for the given run and tag. @@ -161,8 +169,8 @@ Annotated Example: (note - real data is higher precision) [ 1441154832.580509, # wall_time 5, # step - [ [0, -3.67], # CompressedHistogramValue for 0th percentile - [2500, -4.19], # CompressedHistogramValue for 25th percentile + [ [0, -3.67], # CompressedHistogramValue for 0th percentile + [2500, -4.19], # CompressedHistogramValue for 25th percentile [5000, 6.29], [7500, 1.64], [10000, 3.67] @@ -171,13 +179,13 @@ Annotated Example: (note - real data is higher precision) ... ] -## `/data/images?run=foo&tag=bar` +## `/data/plugin/images/images?run=foo&tag=bar` Gets a sample of ImageMetadatas for the given run and tag. Returns an array of objects containing information about available images, crucially including the query parameter that may be used to retrieve that image. -(See /individualImage for details.) +(See /data/plugin/images/individualImage for details.) For example: @@ -190,7 +198,7 @@ For example: # param for /individualImage } -## `/data/individualImage?{{query}}` +## `/data/plugin/images/individualImage?{{query}}` Retrieves an individual image. The image query should not be generated by the frontend, but instead acquired from calling the /images route (the image @@ -202,15 +210,29 @@ within a single run, as images may be removed from the sampling reservoir and replaced with other images. (See Notes for details on the reservoir sampling.) An example call to this route would look like this: -/individualImage?index=0&tagname=input%2Fimage%2F2&run=train +/data/plugin/images/individualImage?index=0&tagname=input%2Fimage%2F2&run=train -## `/audio?run=foo&tag=bar` +## `/data/plugin/images/tags` + +Returns a dictionary mapping from `run_name` (quoted string) to arrays of +`tag_name` (quoted string), where each array contains the names of all image +tags present in the corresponding run. Here is an example: + + { + "train": ["foo_image", "bar_image"], + "eval": ["foo_image", "bar_image"] + } + +Note that runs without any image tags are included as keys with value the empty +array. + +## `/data/plugin/audio/audio?run=foo&tag=bar` Gets a sample of AudioMetadatas for the given run and tag. Returns an array of objects containing information about available audio, crucially including the query parameter that may be used to retrieve that audio. -(See /individualAudio for details.) +(See /data/plugin/audio/individualAudio for details.) For example: @@ -222,7 +244,7 @@ For example: # param for /individualAudio } -## `/individualAudio?{{query}}` +## `/data/plugin/audio/individualAudio?{{query}}` Retrieves an individual audio clip. 
The audio query should not be generated by the frontend, but instead acquired from calling the /audio route (the audio @@ -236,11 +258,33 @@ replaced with other clips. (See Notes for details on the reservoir sampling.) An example call to this route would look like this: /individualAudio?index=0&tagname=input%2Faudio%2F2&run=train -## `/data/graph?run=foo&limit_attr_size=1024&large_attrs_key=key` +## `/data/plugin/audio/tags` -Returns the graph definition for the given run in gzipped pbtxt format. The -graph is composed of a list of nodes, where each node is a specific TensorFlow -operation which takes as inputs other nodes (operations). +Returns a dictionary mapping from `run_name` (quoted string) to arrays of +`tag_name` (quoted string), where each array contains the names of all audio +tags present in the corresponding run. Here is an example: + + { + "train": ["foo_audio", "bar_audio"], + "eval": ["foo_audio", "bar_audio"], + } + +Note that runs without any audio tags are included as keys with value the empty +array. + +## `/data/plugin/graphs/runs` + +Returns a list of runs that have associated graphs. + +For example: + + ["train"] + +## `/data/plugin/graphs/graph?run=foo&limit_attr_size=1024&large_attrs_key=key` + +Returns the graph definition for the given run in pbtxt format. The +graph is composed of a list of nodes, where each node is a specific +TensorFlow operation which takes as inputs other nodes (operations). The query parameters `limit_attr_size` and `large_attrs_key` are optional. @@ -253,7 +297,10 @@ attributes that are too large. The value of this key (list of strings) should be used by the client in order to determine which attributes have been filtered. Must be specified if `limit_attr_size` is specified. -For the query `/graph?run=foo&limit_attr_size=1024&large_attrs_key=_too_large`, +For the query + + /data/plugin/graphs/graph?run=foo&limit_attr_size=1024&large_attrs_key=_too_large, + here is an example pbtxt response of a graph with 3 nodes, where the second node had two large attributes "a" and "b" that were filtered out (size > 1024): diff --git a/tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize/BUILD b/tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize/BUILD new file mode 100644 index 00000000000..f1f7746ff84 --- /dev/null +++ b/tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize/BUILD @@ -0,0 +1,56 @@ +package(default_visibility = ["//tensorflow/tensorboard:internal"]) + +licenses(["notice"]) # Apache 2.0 + +java_binary( + name = "Vulcanize", + srcs = ["Vulcanize.java"], + jvm_flags = [ + "-Xss20m", # JSCompiler needs big stacks for recursive parsing + "-XX:+UseParallelGC", # Best GC when app isn't latency sensitive + "-Djava.util.logging.SimpleFormatter.format='%1$$tY-%1$$tm-%1$$td %1$$tH:%1$$tM:%1$$tS.%1$$tL %4$$-6s %5$$s%6$$s%n'", # Less log spam + ], + visibility = ["//visibility:public"], + deps = [ + "@com_google_guava", + "@com_google_protobuf_java", + "@io_bazel_rules_closure//closure/compiler", + "@io_bazel_rules_closure//java/io/bazel/rules/closure:webpath", + "@io_bazel_rules_closure//java/io/bazel/rules/closure/webfiles:build_info_java_proto", + "@io_bazel_rules_closure//java/org/jsoup/nodes", + "@org_jsoup", + ], +) + +java_binary( + name = "Zipper", + srcs = ["Zipper.java"], + visibility = ["//visibility:public"], + deps = [ + "@com_google_guava", + "@com_google_protobuf_java", + "@io_bazel_rules_closure//java/io/bazel/rules/closure/webfiles", + 
"@io_bazel_rules_closure//java/io/bazel/rules/closure/webfiles:build_info_java_proto", + ], +) + +# These JS files are always taken into consideration by the Closure Compiler +# when vulcanizing, per vulcanize.bzl. +filegroup( + name = "jslibs", + srcs = [ + # Ordering probably matters + "@com_google_javascript_closure_compiler_externs", + "@com_google_javascript_closure_compiler_externs_polymer", + "externs.js", + "@com_google_javascript_closure_library//:closure/goog/base.js", + "@com_google_javascript_closure_library//:closure/goog/deps.js", + ], + visibility = ["//visibility:public"], +) + +filegroup( + name = "all_files", + srcs = glob(["**"]), + tags = ["notsan"], +) diff --git a/tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize/Vulcanize.java b/tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize/Vulcanize.java new file mode 100644 index 00000000000..533907dd64d --- /dev/null +++ b/tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize/Vulcanize.java @@ -0,0 +1,546 @@ +// Copyright 2017 The TensorFlow Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package org.tensorflow.tensorboard.vulcanize; + +import static com.google.common.base.Preconditions.checkNotNull; +import static com.google.common.base.Verify.verify; +import static com.google.common.base.Verify.verifyNotNull; +import static java.nio.charset.StandardCharsets.UTF_8; + +import com.google.common.base.CharMatcher; +import com.google.common.base.Joiner; +import com.google.common.base.Optional; +import com.google.common.base.Splitter; +import com.google.common.collect.HashMultimap; +import com.google.common.collect.ImmutableMultimap; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Iterables; +import com.google.common.collect.Lists; +import com.google.common.collect.Multimap; +import com.google.javascript.jscomp.CheckLevel; +import com.google.javascript.jscomp.CompilationLevel; +import com.google.javascript.jscomp.Compiler; +import com.google.javascript.jscomp.CompilerOptions; +import com.google.javascript.jscomp.DiagnosticGroup; +import com.google.javascript.jscomp.DiagnosticGroups; +import com.google.javascript.jscomp.DiagnosticType; +import com.google.javascript.jscomp.JSError; +import com.google.javascript.jscomp.ModuleIdentifier; +import com.google.javascript.jscomp.PropertyRenamingPolicy; +import com.google.javascript.jscomp.Result; +import com.google.javascript.jscomp.SourceFile; +import com.google.javascript.jscomp.WarningsGuard; +import com.google.protobuf.TextFormat; +import io.bazel.rules.closure.Webpath; +import io.bazel.rules.closure.webfiles.BuildInfo.Webfiles; +import io.bazel.rules.closure.webfiles.BuildInfo.WebfilesSource; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.StandardOpenOption; +import java.util.ArrayDeque; +import java.util.ArrayList; 
+import java.util.Collection; +import java.util.Deque; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.stream.Collectors; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Attribute; +import org.jsoup.nodes.Comment; +import org.jsoup.nodes.DataNode; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Html5Printer; +import org.jsoup.nodes.Node; +import org.jsoup.nodes.TextNode; +import org.jsoup.parser.Parser; +import org.jsoup.parser.Tag; + +/** Simple one-off solution for TensorBoard vulcanization. */ +public final class Vulcanize { + + private static final Pattern IGNORE_PATHS_PATTERN = + Pattern.compile("/(?:polymer|marked-element)/.*"); + + private static final ImmutableSet EXTRA_JSDOC_TAGS = + ImmutableSet.of("attribute", "hero", "group", "required"); + + private static final Pattern WEBPATH_PATTERN = Pattern.compile("//~~WEBPATH~~([^\n]+)"); + + private static final Parser parser = Parser.htmlParser(); + private static final Map webfiles = new HashMap<>(); + private static final Set alreadyInlined = new HashSet<>(); + private static final Set legalese = new HashSet<>(); + private static final List licenses = new ArrayList<>(); + private static final List stack = new ArrayList<>(); + private static final List externs = new ArrayList<>(); + private static final List sourcesFromJsLibraries = new ArrayList<>(); + private static final Map sourcesFromScriptTags = new LinkedHashMap<>(); + private static final Map sourceTags = new LinkedHashMap<>(); + private static final Multimap suppressions = HashMultimap.create(); + private static CompilationLevel compilationLevel; + private static Webpath outputPath; + private static Node firstCompiledScript; + private static Node licenseComment; + private static int insideDemoSnippet; + private static boolean testOnly; + + public static void main(String[] args) throws IOException { + compilationLevel = CompilationLevel.fromString(args[0]); + testOnly = args[1].equals("true"); + Webpath inputPath = Webpath.get(args[2]); + outputPath = Webpath.get(args[3]); + Path output = Paths.get(args[4]); + for (int i = 5; i < args.length; i++) { + if (args[i].endsWith(".js")) { + String code = new String(Files.readAllBytes(Paths.get(args[i])), UTF_8); + SourceFile sourceFile = SourceFile.fromCode(args[i], code); + if (code.contains("@externs")) { + externs.add(sourceFile); + } else { + sourcesFromJsLibraries.add(sourceFile); + } + continue; + } + if (!args[i].endsWith(".pbtxt")) { + continue; + } + Webfiles manifest = loadWebfilesPbtxt(Paths.get(args[i])); + for (WebfilesSource src : manifest.getSrcList()) { + webfiles.put(Webpath.get(src.getWebpath()), Paths.get(src.getPath())); + } + } + stack.add(inputPath); + Document document = parse(Files.readAllBytes(webfiles.get(inputPath))); + transform(document); + compile(); + if (licenseComment != null) { + licenseComment.attr("comment", String.format("\n%s\n", Joiner.on("\n\n").join(licenses))); + } + Files.write( + output, + Html5Printer.stringify(document).getBytes(UTF_8), + StandardOpenOption.WRITE, + StandardOpenOption.CREATE, + StandardOpenOption.TRUNCATE_EXISTING); + } + + private static void transform(Node root) throws IOException { + Node node = checkNotNull(root); + Node newNode; + while (true) { + newNode = enterNode(node); + if (node.equals(root)) { + root = newNode; + } + node = 
newNode; + if (node.childNodeSize() > 0) { + node = node.childNode(0); + } else { + while (true) { + newNode = leaveNode(node); + if (node.equals(root)) { + root = newNode; + } + node = newNode; + if (node.equals(root)) { + return; + } + Node next = node.nextSibling(); + if (next == null) { + if (node.parentNode() == null) { + return; + } + node = verifyNotNull(node.parentNode(), "unexpected root: %s", node); + } else { + node = next; + break; + } + } + } + } + } + + private static Node enterNode(Node node) throws IOException { + if (node.nodeName().equals("demo-snippet")) { + insideDemoSnippet++; + } + if (insideDemoSnippet > 0) { + return node; + } + if (node instanceof Element) { + if (!getAttrTransitive(node, "vulcanize-noinline").isPresent()) { + if (node.nodeName().equals("link") && node.attr("rel").equals("import")) { + // Inline HTML. + node = visitHtmlImport(node); + } else if (node.nodeName().equals("script") + && !shouldIgnoreUri(node.attr("src")) + && !node.hasAttr("jscomp-ignore")) { + node = visitScript(node); + } else if (node.nodeName().equals("link") + && node.attr("rel").equals("stylesheet") + && !node.attr("href").isEmpty() + && !shouldIgnoreUri(node.attr("href"))) { + node = visitStylesheet(node); + } + } + rootifyAttribute(node, "href"); + rootifyAttribute(node, "src"); + rootifyAttribute(node, "action"); + rootifyAttribute(node, "assetpath"); + } else if (node instanceof Comment) { + String text = ((Comment) node).getData(); + if (text.contains("@license")) { + handleLicense(text); + if (licenseComment == null) { + licenseComment = node; + } else { + node = replaceNode(node, new TextNode("", node.baseUri())); + } + } else { + node = replaceNode(node, new TextNode("", node.baseUri())); + } + } + return node; + } + + private static Node leaveNode(Node node) { + if (node instanceof Document) { + stack.remove(stack.size() - 1); + } else if (node.nodeName().equals("demo-snippet")) { + insideDemoSnippet--; + } + return node; + } + + private static Node visitHtmlImport(Node node) throws IOException { + Webpath href = me().lookup(Webpath.get(node.attr("href"))); + if (alreadyInlined.add(href)) { + stack.add(href); + Document subdocument = parse(Files.readAllBytes(getWebfile(href))); + for (Attribute attr : node.attributes()) { + subdocument.attr(attr.getKey(), attr.getValue()); + } + return replaceNode(node, subdocument); + } else { + return replaceNode(node, new TextNode("", node.baseUri())); + } + } + + private static Node visitScript(Node node) throws IOException { + Webpath path; + String script; + if (node.attr("src").isEmpty()) { + path = makeSyntheticName(".js"); + script = getInlineScriptFromNode(node); + } else { + path = me().lookup(Webpath.get(node.attr("src"))); + script = new String(Files.readAllBytes(getWebfile(path)), UTF_8); + } + if (node.attr("src").endsWith(".min.js") + || getAttrTransitive(node, "jscomp-nocompile").isPresent()) { + Node newScript = + new Element(Tag.valueOf("script"), node.baseUri(), node.attributes()) + .appendChild(new DataNode(script, node.baseUri())) + .removeAttr("src") + .removeAttr("jscomp-nocompile"); + if (firstCompiledScript != null) { + firstCompiledScript.before(newScript); + return replaceNode(node, new TextNode("", node.baseUri())); + } else { + return replaceNode(node, newScript); + } + } else { + if (firstCompiledScript == null) { + firstCompiledScript = node; + } + sourcesFromScriptTags.put(path, script); + sourceTags.put(path, node); + Optional suppress = getAttrTransitive(node, "jscomp-suppress"); + if 
(suppress.isPresent()) { + if (suppress.get().isEmpty()) { + suppressions.put(path, "*"); + } else { + suppressions.putAll(path, Splitter.on(' ').split(suppress.get())); + } + } + return node; + } + } + + private static Node visitStylesheet(Node node) throws IOException { + Webpath href = me().lookup(Webpath.get(node.attr("href"))); + return replaceNode( + node, + new Element(Tag.valueOf("style"), node.baseUri(), node.attributes()) + .appendChild( + new DataNode( + new String(Files.readAllBytes(getWebfile(href)), UTF_8), node.baseUri())) + .removeAttr("rel") + .removeAttr("href")); + } + + private static Optional getAttrTransitive(Node node, String attr) { + while (node != null) { + if (node.hasAttr(attr)) { + return Optional.of(node.attr(attr)); + } + node = node.parent(); + } + return Optional.absent(); + } + + private static Node replaceNode(Node oldNode, Node newNode) { + oldNode.replaceWith(newNode); + return newNode; + } + + private static Path getWebfile(Webpath path) { + return verifyNotNull(webfiles.get(path), "Bad ref: %s -> %s", me(), path); + } + + private static void compile() { + if (sourcesFromScriptTags.isEmpty()) { + return; + } + + CompilerOptions options = new CompilerOptions(); + compilationLevel.setOptionsForCompilationLevel(options); + + // Nice options. + options.setColorizeErrorOutput(true); + options.setContinueAfterErrors(true); + options.setLanguageIn(CompilerOptions.LanguageMode.ECMASCRIPT_2016); + options.setLanguageOut(CompilerOptions.LanguageMode.ECMASCRIPT5); + options.setGenerateExports(true); + options.setStrictModeInput(false); + options.setExtraAnnotationNames(EXTRA_JSDOC_TAGS); + + // So we can chop JS binary back up into the original script tags. + options.setPrintInputDelimiter(true); + options.setInputDelimiter("//~~WEBPATH~~%name%"); + + // Optimizations that are too advanced for us right now. + options.setPropertyRenaming(PropertyRenamingPolicy.OFF); + options.setCheckGlobalThisLevel(CheckLevel.OFF); + options.setRemoveUnusedPrototypeProperties(false); + options.setRemoveUnusedPrototypePropertiesInExterns(false); + options.setRemoveUnusedClassProperties(false); + + // Dependency management. + options.setClosurePass(true); + options.setManageClosureDependencies(true); + options.getDependencyOptions().setDependencyPruning(true); + options.getDependencyOptions().setDependencySorting(true); + options.getDependencyOptions().setMoocherDropping(false); + options.getDependencyOptions() + .setEntryPoints( + sourceTags + .keySet() + .stream() + .map(Webpath::toString) + .map(ModuleIdentifier::forFile) + .collect(Collectors.toList())); + + // Polymer pass. + options.setPolymerVersion(1); + + // Debug flags. + if (testOnly) { + options.setPrettyPrint(true); + options.setGeneratePseudoNames(true); + options.setExportTestFunctions(true); + } + + // Don't print warnings from " + sanitized = "<script>alert('xss')</script>" + self.assertEqual(text_plugin.markdown_and_sanitize(dangerous), sanitized) + + dangerous = textwrap.dedent("""\ + hello *you*""") + sanitized = '

<p>hello <em>you</em></p>'
+    self.assertEqual(text_plugin.markdown_and_sanitize(dangerous), sanitized)
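The pair of strings above pins down both halves of the conversion under test: markdown rendering plus HTML sanitization. A minimal sketch of such a pipeline, assuming the third-party `markdown` and `bleach` packages; the helper name and tag whitelist are illustrative, not the plugin's actual code:

```python
# Sketch only: the plugin's real whitelist and implementation may differ.
import bleach
import markdown

ALLOWED_TAGS = ['p', 'em', 'strong', 'a', 'ul', 'ol', 'li',
                'table', 'thead', 'tbody', 'tr', 'th', 'td', 'code', 'pre']

def markdown_and_sanitize_sketch(text):
  html = markdown.markdown(text)  # 'hello *you*' -> '<p>hello <em>you</em></p>'
  # bleach escapes tags outside the whitelist, so a <script> payload comes
  # back HTML-escaped rather than executable.
  return bleach.clean(html, tags=ALLOWED_TAGS)
```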
+
+  def testTableGeneration(self):
+    array2d = np.array([['one', 'two'], ['three', 'four']])
+    expected_table = textwrap.dedent("""\
+        <table>
+        <tbody>
+        <tr>
+        <td>one</td>
+        <td>two</td>
+        </tr>
+        <tr>
+        <td>three</td>
+        <td>four</td>
+        </tr>
+        </tbody>
+        </table>""")
+    self.assertEqual(text_plugin.make_table(array2d), expected_table)
+
+    expected_table_with_headers = textwrap.dedent("""\
+        <table>
+        <thead>
+        <tr>
+        <th>c1</th>
+        <th>c2</th>
+        </tr>
+        </thead>
+        <tbody>
+        <tr>
+        <td>one</td>
+        <td>two</td>
+        </tr>
+        <tr>
+        <td>three</td>
+        <td>four</td>
+        </tr>
+        </tbody>
+        </table>""")
+
+    actual_with_headers = text_plugin.make_table(array2d, headers=['c1', 'c2'])
+    self.assertEqual(actual_with_headers, expected_table_with_headers)
+
+    array_1d = np.array(['one', 'two', 'three', 'four', 'five'])
+    expected_1d = textwrap.dedent("""\
+        <table>
+        <tbody>
+        <tr>
+        <td>one</td>
+        </tr>
+        <tr>
+        <td>two</td>
+        </tr>
+        <tr>
+        <td>three</td>
+        </tr>
+        <tr>
+        <td>four</td>
+        </tr>
+        <tr>
+        <td>five</td>
+        </tr>
+        </tbody>
+        </table>""")
+    self.assertEqual(text_plugin.make_table(array_1d), expected_1d)
+
+    expected_1d_with_headers = textwrap.dedent("""\
+        <table>
+        <thead>
+        <tr>
+        <th>X</th>
+        </tr>
+        </thead>
+        <tbody>
+        <tr>
+        <td>one</td>
+        </tr>
+        <tr>
+        <td>two</td>
+        </tr>
+        <tr>
+        <td>three</td>
+        </tr>
+        <tr>
+        <td>four</td>
+        </tr>
+        <tr>
+        <td>five</td>
+        </tr>
+        </tbody>
+        </table>""")
+    actual_1d_with_headers = text_plugin.make_table(array_1d, headers=['X'])
+    self.assertEqual(actual_1d_with_headers, expected_1d_with_headers)

+  def testMakeTableExceptions(self):
+    # Verify that contents is being type-checked and shape-checked.
+    with self.assertRaises(ValueError):
+      text_plugin.make_table([])
+
+    with self.assertRaises(ValueError):
+      text_plugin.make_table('foo')
+
+    with self.assertRaises(ValueError):
+      invalid_shape = np.full((3, 3, 3), 'nope', dtype=np.dtype('S3'))
+      text_plugin.make_table(invalid_shape)
+
+    # Test headers exceptions in 2d array case.
+    test_array = np.full((3, 3), 'foo', dtype=np.dtype('S3'))
+    with self.assertRaises(ValueError):
+      # Headers is wrong type.
+      text_plugin.make_table(test_array, headers='foo')
+    with self.assertRaises(ValueError):
+      # Too many headers.
+      text_plugin.make_table(test_array, headers=['foo', 'bar', 'zod', 'zoink'])
+    with self.assertRaises(ValueError):
+      # Headers is 2d.
+      text_plugin.make_table(test_array, headers=test_array)
+
+    # Also make sure the column counting logic works in the 1d array case.
+    test_array = np.array(['foo', 'bar', 'zod'])
+    with self.assertRaises(ValueError):
+      # Too many headers.
+      text_plugin.make_table(test_array, headers=test_array)
+
+  def test_reduce_to_2d(self):
+
+    def make_range_array(dim):
+      """Produce an incrementally increasing multidimensional array.
+
+      Args:
+        dim: the number of dimensions for the array
+
+      Returns:
+        An array of increasing integer elements, with dim dimensions and size
+        two in each dimension.
+
+      Example: make_range_array(2) results in [[0,1],[2,3]].
+      """
+      return np.array(range(2**dim)).reshape([2] * dim)
+
+    for i in range(2, 5):
+      actual = text_plugin.reduce_to_2d(make_range_array(i))
+      expected = make_range_array(2)
+      np.testing.assert_array_equal(actual, expected)
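Concretely, for `dim=3` the loop above asserts that the reduction keeps the last two dimensions and collapses every leading axis at index 0; a small worked example of that reading:

```python
import numpy as np

arr = np.array(range(2**3)).reshape([2, 2, 2])  # make_range_array(3)
expected = np.array([[0, 1], [2, 3]])           # make_range_array(2)
# The last two dimensions survive; leading axes are taken at index 0.
assert (arr[0] == expected).all()
```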

+  def test_text_array_to_html(self):
+
+    convert = text_plugin.text_array_to_html
+    scalar = np.array('foo')
+    scalar_expected = '<p>foo</p>'
+    self.assertEqual(convert(scalar), scalar_expected)
+
+    vector = np.array(['foo', 'bar'])
+    vector_expected = textwrap.dedent("""\
+        <table>
+        <tbody>
+        <tr>
+        <td><p>foo</p></td>
+        </tr>
+        <tr>
+        <td><p>bar</p></td>
+        </tr>
+        </tbody>
+        </table>""")
+    self.assertEqual(convert(vector), vector_expected)
+
+    d2 = np.array([['foo', 'bar'], ['zoink', 'zod']])
+    d2_expected = textwrap.dedent("""\
+        <table>
+        <tbody>
+        <tr>
+        <td><p>foo</p></td>
+        <td><p>bar</p></td>
+        </tr>
+        <tr>
+        <td><p>zoink</p></td>
+        <td><p>zod</p></td>
+        </tr>
+        </tbody>
+        </table>""")
+    self.assertEqual(convert(d2), d2_expected)
+
+    d3 = np.array([[['foo', 'bar'], ['zoink', 'zod']], [['FOO', 'BAR'],
+                                                        ['ZOINK', 'ZOD']]])
+
+    warning = text_plugin.markdown_and_sanitize(text_plugin.WARNING_TEMPLATE %
+                                                3)
+    d3_expected = warning + textwrap.dedent("""\
+        <table>
+        <tbody>
+        <tr>
+        <td><p>foo</p></td>
+        <td><p>bar</p></td>
+        </tr>
+        <tr>
+        <td><p>zoink</p></td>
+        <td><p>zod</p></td>
+        </tr>
+        </tbody>
+        </table>""")
+    self.assertEqual(convert(d3), d3_expected)
+
+  def testPluginIsActive(self):
+    plugin = text_plugin.TextPlugin()
+    multiplexer = event_multiplexer.EventMultiplexer()
+    plugin.get_plugin_apps(multiplexer, None)
+
+    # The plugin is inactive because text summaries are not available.
+    self.assertFalse(plugin.is_active())
+
+    multiplexer.AddRunsFromDirectory(self.logdir)
+    multiplexer.Reload()
+
+    # The plugin is active because text summaries are available.
+    self.assertTrue(plugin.is_active())
+
+  def testUnicode(self):
+    self.assertConverted(u'<p>Iñtërnâtiônàlizætiøn⚡💩</p>
', + 'Iñtërnâtiônàlizætiøn⚡💩') + + +if __name__ == '__main__': + tf.test.main() diff --git a/tensorflow/tensorboard/scripts/BUILD b/tensorflow/tensorboard/scripts/BUILD index 710191b238a..05425ee61d0 100644 --- a/tensorflow/tensorboard/scripts/BUILD +++ b/tensorflow/tensorboard/scripts/BUILD @@ -1,7 +1,7 @@ # Description: # Some useful scripts that are bundled with TensorBoard. -package(default_visibility = ["//tensorflow:internal"]) +package(default_visibility = ["//tensorflow/tensorboard:internal"]) licenses(["notice"]) # Apache 2.0 @@ -12,18 +12,19 @@ py_binary( srcs = ["generate_testdata.py"], srcs_version = "PY2AND3", deps = [ - "//tensorflow/core:protos_all_py", - "//tensorflow/python", # TODO(b/34059704): remove when fixed - "//tensorflow/python:array_ops", - "//tensorflow/python:client", - "//tensorflow/python:logging_ops", - "//tensorflow/python:platform", - "//tensorflow/python:summary", + "//tensorflow:tensorflow_py", "//third_party/py/numpy", "@six_archive//:six", ], ) +py_binary( + name = "execrooter", + srcs = ["execrooter.py"], + srcs_version = "PY2AND3", + visibility = ["//visibility:public"], +) + filegroup( name = "all_files", srcs = glob(["*"]), diff --git a/tensorflow/tensorboard/scripts/execrooter.py b/tensorflow/tensorboard/scripts/execrooter.py new file mode 100644 index 00000000000..65569b91512 --- /dev/null +++ b/tensorflow/tensorboard/scripts/execrooter.py @@ -0,0 +1,85 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Utility for running programs in a symlinked execroot.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import json +import os +import shutil +import subprocess +import sys +import tempfile + + +def run(inputs, program, outputs): + """Creates temp symlink tree, runs program, and copies back outputs. + + Args: + inputs: List of fake paths to real paths, which are used for symlink tree. + program: List containing real path of program and its arguments. The + execroot directory will be appended as the last argument. + outputs: List of fake outputted paths to copy back to real paths. + Returns: + 0 if succeeded or nonzero if failed. + """ + root = tempfile.mkdtemp() + try: + cwd = os.getcwd() + for fake, real in inputs: + parent = os.path.join(root, os.path.dirname(fake)) + if not os.path.exists(parent): + os.makedirs(parent) + os.symlink(os.path.join(cwd, real), os.path.join(root, fake)) + if subprocess.call(program + [root]) != 0: + return 1 + for fake, real in outputs: + shutil.copyfile(os.path.join(root, fake), real) + return 0 + finally: + shutil.rmtree(root) + + +def main(args): + """Invokes run function using a JSON file config. + + Args: + args: CLI args, which can be a JSON file containing an object whose + attributes are the parameters to the run function. If multiple JSON + files are passed, their contents are concatenated. + Returns: + 0 if succeeded or nonzero if failed. 
+ Raises: + Exception: If input data is missing. + """ + if not args: + raise Exception('Please specify at least one JSON config path') + inputs = [] + program = [] + outputs = [] + for arg in args: + with open(arg) as fd: + config = json.load(fd) + inputs.extend(config.get('inputs', [])) + program.extend(config.get('program', [])) + outputs.extend(config.get('outputs', [])) + if not program: + raise Exception('Please specify a program') + return run(inputs, program, outputs) + + +if __name__ == '__main__': + sys.exit(main(sys.argv[1:])) diff --git a/tensorflow/tensorboard/scripts/generate_testdata.py b/tensorflow/tensorboard/scripts/generate_testdata.py index f89ab690ba3..f191d16a82d 100644 --- a/tensorflow/tensorboard/scripts/generate_testdata.py +++ b/tensorflow/tensorboard/scripts/generate_testdata.py @@ -28,20 +28,13 @@ import shutil import numpy as np from six.moves import xrange # pylint: disable=redefined-builtin -from tensorflow.core.framework import graph_pb2 -from tensorflow.core.framework import summary_pb2 -from tensorflow.core.util import event_pb2 -from tensorflow.python.client import session as session_lib -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import logging_ops -from tensorflow.python.platform import app -from tensorflow.python.platform import flags -from tensorflow.python.summary.writer import writer as writer_lib +import tensorflow as tf -tf.flags.DEFINE_string("target", None, """The directoy where serialized data + +tf.flags.DEFINE_string("target", None, """The directory where serialized data will be written""") -flags.DEFINE_boolean("overwrite", False, """Whether to remove and overwrite +tf.flags.DEFINE_boolean("overwrite", False, """Whether to remove and overwrite TARGET if it already exists.""") FLAGS = tf.flags.FLAGS @@ -76,7 +69,7 @@ def _MakeHistogram(values): bucket_limit = [lc[0] for lc in limit_counts] bucket = [lc[1] for lc in limit_counts] sum_sq = sum(v * v for v in values) - return summary_pb2.HistogramProto( + return tf.HistogramProto( min=min(values), max=max(values), num=len(values), @@ -92,9 +85,9 @@ def WriteScalarSeries(writer, tag, f, n=5): wall_time = _start_time for i in xrange(n): v = f(i) - value = summary_pb2.Summary.Value(tag=tag, simple_value=v) - summary = summary_pb2.Summary(value=[value]) - event = event_pb2.Event(wall_time=wall_time, step=step, summary=summary) + value = tf.Summary.Value(tag=tag, simple_value=v) + summary = tf.Summary(value=[value]) + event = tf.Event(wall_time=wall_time, step=step, summary=summary) writer.add_event(event) step += 1 wall_time += 10 @@ -107,10 +100,8 @@ def WriteHistogramSeries(writer, tag, mu_sigma_tuples, n=20): for [mean, stddev] in mu_sigma_tuples: data = [random.normalvariate(mean, stddev) for _ in xrange(n)] histo = _MakeHistogram(data) - summary = summary_pb2.Summary( - value=[summary_pb2.Summary.Value( - tag=tag, histo=histo)]) - event = event_pb2.Event(wall_time=wall_time, step=step, summary=summary) + summary = tf.Summary(value=[tf.Summary.Value(tag=tag, histo=histo)]) + event = tf.Event(wall_time=wall_time, step=step, summary=summary) writer.add_event(event) step += 10 wall_time += 100 @@ -119,9 +110,9 @@ def WriteHistogramSeries(writer, tag, mu_sigma_tuples, n=20): def WriteImageSeries(writer, tag, n_images=1): """Write a few dummy images to writer.""" step = 0 - session = session_lib.Session() - p = array_ops.placeholder("uint8", (1, 4, 4, 3)) - s = logging_ops.image_summary(tag, p) + session = tf.Session() + p = tf.placeholder("uint8", (1, 4, 4, 3)) + s = 
tf.summary.image(tag, p) for _ in xrange(n_images): im = np.random.random_integers(0, 255, (1, 4, 4, 3)) summ = session.run(s, feed_dict={p: im}) @@ -133,18 +124,18 @@ def WriteImageSeries(writer, tag, n_images=1): def WriteAudioSeries(writer, tag, n_audio=1): """Write a few dummy audio clips to writer.""" step = 0 - session = session_lib.Session() + session = tf.Session() min_frequency_hz = 440 max_frequency_hz = 880 sample_rate = 4000 - duration_frames = sample_rate * 0.5 # 0.5 seconds. + duration_frames = sample_rate // 2 # 0.5 seconds. frequencies_per_run = 1 num_channels = 2 - p = array_ops.placeholder("float32", (frequencies_per_run, duration_frames, - num_channels)) - s = logging_ops.audio_summary(tag, p, sample_rate) + p = tf.placeholder("float32", (frequencies_per_run, duration_frames, + num_channels)) + s = tf.summary.audio(tag, p, sample_rate) for _ in xrange(n_audio): # Generate a different frequency for each channel to show stereo works. @@ -170,7 +161,7 @@ def GenerateTestData(path): """Generates the test data directory.""" run1_path = os.path.join(path, "run1") os.makedirs(run1_path) - writer1 = writer_lib.FileWriter(run1_path) + writer1 = tf.summary.FileWriter(run1_path) WriteScalarSeries(writer1, "foo/square", lambda x: x * x) WriteScalarSeries(writer1, "bar/square", lambda x: x * x) WriteScalarSeries(writer1, "foo/sin", math.sin) @@ -183,7 +174,7 @@ def GenerateTestData(path): run2_path = os.path.join(path, "run2") os.makedirs(run2_path) - writer2 = writer_lib.FileWriter(run2_path) + writer2 = tf.summary.FileWriter(run2_path) WriteScalarSeries(writer2, "foo/square", lambda x: x * x * 2) WriteScalarSeries(writer2, "bar/square", lambda x: x * x * 3) WriteScalarSeries(writer2, "foo/cos", lambda x: math.cos(x) * 2) @@ -194,7 +185,7 @@ def GenerateTestData(path): WriteImageSeries(writer2, "im1") WriteAudioSeries(writer2, "au2") - graph_def = graph_pb2.GraphDef() + graph_def = tf.GraphDef() node1 = graph_def.node.add() node1.name = "a" node1.op = "matmul" @@ -231,4 +222,4 @@ def main(unused_argv=None): if __name__ == "__main__": - app.run() + tf.app.run() diff --git a/tensorflow/tensorboard/tensorboard.py b/tensorflow/tensorboard/tensorboard.py deleted file mode 100644 index 87f4f9fe511..00000000000 --- a/tensorflow/tensorboard/tensorboard.py +++ /dev/null @@ -1,188 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Serve TensorFlow summary data to a web frontend. - -This is a simple web server to proxy data from the event_loader to the web, and -serve static web files. 
-""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import socket -from werkzeug import serving - -from tensorflow.python.platform import app -from tensorflow.python.platform import flags -from tensorflow.python.platform import resource_loader -from tensorflow.python.platform import tf_logging as logging -from tensorflow.python.summary import event_file_inspector as efi -from tensorflow.python.summary import event_multiplexer -from tensorflow.tensorboard.backend import application -from tensorflow.tensorboard.plugins.debugger import plugin as debugger_plugin -from tensorflow.tensorboard.plugins.projector import plugin as projector_plugin - -flags.DEFINE_string('logdir', '', """logdir specifies the directory where -TensorBoard will look to find TensorFlow event files that it can display. -TensorBoard will recursively walk the directory structure rooted at logdir, -looking for .*tfevents.* files. - -You may also pass a comma separated list of log directories, and TensorBoard -will watch each directory. You can also assign names to individual log -directories by putting a colon between the name and the path, as in - -tensorboard --logdir=name1:/path/to/logs/1,name2:/path/to/logs/2 -""") - -flags.DEFINE_boolean( - 'insecure_debug_mode', False, 'Whether to run the app in debug mode. ' - 'This increases log verbosity, and enables debugging on server exceptions.') - -flags.DEFINE_string('host', '0.0.0.0', 'What host to listen to. Defaults to ' - 'serving on 0.0.0.0, set to 127.0.0.1 (localhost) to' - 'disable remote access (also quiets security warnings).') - -flags.DEFINE_boolean('inspect', False, """Use this flag to print out a digest -of your event files to the command line, when no data is shown on TensorBoard or -the data shown looks weird. - -Example usages: -tensorboard --inspect --event_file=myevents.out -tensorboard --inspect --event_file=myevents.out --tag=loss -tensorboard --inspect --logdir=mylogdir -tensorboard --inspect --logdir=mylogdir --tag=loss - -See tensorflow/python/summary/event_file_inspector.py for more info and -detailed usage. -""") -flags.DEFINE_string( - 'tag', '', - 'The particular tag to query for. Only used if --inspect is present') -flags.DEFINE_string( - 'event_file', '', - 'The particular event file to query for. Only used if --inspect is present ' - 'and --logdir is not specified.') - -flags.DEFINE_integer('port', 6006, 'What port to serve TensorBoard on.') - -flags.DEFINE_boolean('purge_orphaned_data', True, 'Whether to purge data that ' - 'may have been orphaned due to TensorBoard restarts. ' - 'Disabling purge_orphaned_data can be used to debug data ' - 'disappearance.') - -flags.DEFINE_integer('reload_interval', 60, 'How often the backend should load ' - 'more data.') - -FLAGS = flags.FLAGS - - -class Server(object): - """A simple WSGI-compliant http server that can serve TensorBoard.""" - - def get_tag(self): - """Read the TensorBoard TAG number, and return it or an empty string.""" - try: - tag = resource_loader.load_resource('tensorboard/TAG').strip() - logging.info('TensorBoard is tag: %s', tag) - return tag - except IOError: - logging.info('Unable to read TensorBoard tag') - return '' - - def create_app(self): - """Creates a WSGI-compliant app than can handle TensorBoard requests. - - Returns: - (function) A complete WSGI application that handles TensorBoard requests. - """ - - logdir = os.path.expanduser(FLAGS.logdir) - if not logdir: - msg = ('A logdir must be specified. 
Run `tensorboard --help` for ' - 'details and examples.') - logging.error(msg) - print(msg) - return -1 - - multiplexer = event_multiplexer.EventMultiplexer( - size_guidance=application.DEFAULT_SIZE_GUIDANCE, - purge_orphaned_data=FLAGS.purge_orphaned_data) - plugins = { - debugger_plugin.PLUGIN_PREFIX_ROUTE: - debugger_plugin.DebuggerPlugin(), - projector_plugin.PLUGIN_PREFIX_ROUTE: - projector_plugin.ProjectorPlugin(), - } - return application.TensorBoardWSGIApp( - logdir, - plugins, - multiplexer, - reload_interval=FLAGS.reload_interval) - - def serve(self): - """Starts a WSGI server that serves the TensorBoard app.""" - - tb_app = self.create_app() - logging.info('Starting TensorBoard in directory %s', os.getcwd()) - debug = FLAGS.insecure_debug_mode - if debug: - logging.set_verbosity(logging.DEBUG) - logging.warning('TensorBoard is in debug mode. This is NOT SECURE.') - - print('Starting TensorBoard %s on port %d' % (self.get_tag(), FLAGS.port)) - if FLAGS.host == '0.0.0.0': - try: - host = socket.gethostbyname(socket.gethostname()) - print('(You can navigate to http://%s:%d)' % (host, FLAGS.port)) - except socket.gaierror: - pass - else: - print('(You can navigate to http://%s:%d)' % (FLAGS.host, FLAGS.port)) - - try: - serving.run_simple( - FLAGS.host, - FLAGS.port, - tb_app, - threaded=True, - use_reloader=debug, - use_evalex=debug, - use_debugger=debug) - except socket.error: - if FLAGS.port == 0: - msg = 'Unable to find any open ports.' - logging.error(msg) - print(msg) - return -2 - else: - msg = 'Tried to connect to port %d, but address is in use.' % FLAGS.port - logging.error(msg) - print(msg) - return -3 - - -def main(unused_argv=None): - if FLAGS.inspect: - logging.info('Not bringing up TensorBoard, but inspecting event files.') - event_file = os.path.expanduser(FLAGS.event_file) - efi.inspect(FLAGS.logdir, event_file, FLAGS.tag) - return 0 - - Server().serve() - - -if __name__ == '__main__': - app.run() diff --git a/tensorflow/tensorboard/tsconfig.json b/tensorflow/tensorboard/tsconfig.json deleted file mode 100644 index ac69c30533f..00000000000 --- a/tensorflow/tensorboard/tsconfig.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "compilerOptions": { - "noImplicitAny": false, - "noEmitOnError": true, - "target": "ES5", - "module": "commonjs" - }, - "compileOnSave": false, - "exclude": [ - "node_modules", - "typings/main.d.ts", - "typings/main", - "lib", - "components/**/deps.d.ts" - ] -} diff --git a/tensorflow/tensorboard/tslint.json b/tensorflow/tensorboard/tslint.json deleted file mode 100644 index 2a5d995e710..00000000000 --- a/tensorflow/tensorboard/tslint.json +++ /dev/null @@ -1,64 +0,0 @@ -{ - "rules": { - "class-name": true, - "comment-format": [true, "check-space"], - "curly": true, - "eofline": true, - "forin": true, - "jsdoc-format": true, - "label-position": true, - "label-undefined": true, - "max-line-length": [true, 80], - "member-ordering": [false, "variables-before-functions"], - "no-arg": true, - "no-consecutive-blank-lines": true, - "no-console": [true, - "log", - "debug", - "info", - "time", - "timeEnd", - "trace", - "warn" - ], - "no-construct": true, - "no-constructor-vars": true, - "no-debugger": true, - "no-duplicate-key": true, - "no-duplicate-variable": true, - "no-empty": true, - "no-eval": true, - "no-trailing-whitespace": true, - "no-unreachable": true, - "no-unused-expression": true, - "no-unused-variable": false, - "no-use-before-declare": false, - "one-line": [true, - "check-catch", - "check-else", - "check-open-brace", - "check-whitespace" - ], - 
"quotemark": [true, - "single" - ], - "radix": true, - "semicolon": [true, "always"], - "triple-equals": [true, - "allow-null-check" - ], - "typedef-whitespace": [true, { - "call-signature": "nospace", - "index-signature": "nospace", - "parameter": "nospace", - "property-declaration": "nospace", - "variable-declaration": "nospace" - }], - "whitespace": [true, - "check-branch", - "check-decl", - "check-operator", - "check-type" - ] - } -} diff --git a/tensorflow/tensorboard/typings.json b/tensorflow/tensorboard/typings.json deleted file mode 100644 index c36aa2fb9cc..00000000000 --- a/tensorflow/tensorboard/typings.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "name": "tensorflow-vis", - "dependencies": {}, - "globalDependencies": { - "chai": "registry:dt/chai#3.4.0+20160317120654", - "d3": "registry:dt/d3#0.0.0+20160514171929", - "es6-promise": "registry:dt/es6-promise#0.0.0+20160423074304", - "lodash": "registry:dt/lodash#3.10.0+20160330154726", - "mocha": "registry:dt/mocha#2.2.5+20160317120654", - "polymer": "registry:dt/polymer#1.1.6+20160922133320", - "sinon": "registry:dt/sinon#1.16.0+20160517064723", - "three": "registry:dt/three#0.0.0+20160802154944", - "webcomponents.js": "registry:dt/webcomponents.js#0.6.0+20160728153134" - } -} diff --git a/tensorflow/tensorboard/wct.conf.json b/tensorflow/tensorboard/wct.conf.json deleted file mode 100644 index 519218ce418..00000000000 --- a/tensorflow/tensorboard/wct.conf.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "suites": [ - "components/tf_*/test", - "components/vz_*/test" - ], - "plugins": ["local"] -} diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index a82bcfee611..d01342827dc 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -1,136 +1,178 @@ # -*- Python -*- + # Given a source file, generate a test name. # i.e. "common_runtime/direct_session_test.cc" becomes # "common_runtime_direct_session_test" def src_to_test_name(src): return src.replace("/", "_").split(".")[0] + # Return the options to use for a C++ library or binary build. # Uses the ":optmode" config_setting to pick the options. load( "//tensorflow/core:platform/default/build_config_root.bzl", "tf_cuda_tests_tags", "tf_sycl_tests_tags", - "tf_additional_xla_deps_py", -) -load( - "@local_config_cuda//cuda:build_defs.bzl", - "if_cuda", - "cuda_default_copts" -) + "tf_additional_xla_deps_py",) +load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda", "cuda_default_copts") load( "//third_party/mkl:build_defs.bzl", - "if_mkl", -) + "if_mkl",) + + +def full_path(relative_paths): + return [PACKAGE_NAME + "/" + relative for relative in relative_paths] # List of proto files for android builds def tf_android_core_proto_sources(core_proto_sources_relative): - return ["//tensorflow/core:" + p - for p in core_proto_sources_relative] + return [ + "//tensorflow/core:" + p for p in core_proto_sources_relative + ] + # Returns the list of pb.h and proto.h headers that are generated for # tf_android_core_proto_sources(). 
def tf_android_core_proto_headers(core_proto_sources_relative): - return (["//tensorflow/core/" + p.replace(".proto", ".pb.h") - for p in core_proto_sources_relative] + - ["//tensorflow/core/" + p.replace(".proto", ".proto.h") - for p in core_proto_sources_relative]) + return ([ + "//tensorflow/core/" + p.replace(".proto", ".pb.h") + for p in core_proto_sources_relative + ] + [ + "//tensorflow/core/" + p.replace(".proto", ".proto.h") + for p in core_proto_sources_relative + ]) + + +# Sanitize a dependency so that it works correctly from code that includes +# TensorFlow as a submodule. +def clean_dep(dep): + return str(Label(dep)) + + +def if_android_x86(a): + return select({ + clean_dep("//tensorflow:android_x86"): a, + clean_dep("//tensorflow:android_x86_64"): a, + "//conditions:default": [], + }) + def if_android_arm(a): return select({ - "//tensorflow:android_arm": a, + clean_dep("//tensorflow:android_arm"): a, "//conditions:default": [], }) + def if_android_arm64(a): return select({ - "//tensorflow:android_arm64": a, + clean_dep("//tensorflow:android_arm64"): a, "//conditions:default": [], }) + def if_not_android(a): return select({ - "//tensorflow:android": [], + clean_dep("//tensorflow:android"): [], "//conditions:default": a, }) + def if_android(a): return select({ - "//tensorflow:android": a, + clean_dep("//tensorflow:android"): a, "//conditions:default": [], }) + def if_ios(a): return select({ - "//tensorflow:ios": a, + clean_dep("//tensorflow:ios"): a, "//conditions:default": [], }) + def if_mobile(a): return select({ - "//tensorflow:android": a, - "//tensorflow:ios": a, + clean_dep("//tensorflow:android"): a, + clean_dep("//tensorflow:ios"): a, "//conditions:default": [], }) + def if_not_mobile(a): return select({ - "//tensorflow:android": [], - "//tensorflow:ios": [], + clean_dep("//tensorflow:android"): [], + clean_dep("//tensorflow:ios"): [], "//conditions:default": a, }) + def if_not_windows(a): return select({ - "//tensorflow:windows": [], + clean_dep("//tensorflow:windows"): [], + clean_dep("//tensorflow:windows_msvc"): [], "//conditions:default": a, }) + def if_x86(a): return select({ - "//tensorflow:linux_x86_64": a, - "//tensorflow:windows": a, + clean_dep("//tensorflow:linux_x86_64"): a, + clean_dep("//tensorflow:windows"): a, + clean_dep("//tensorflow:windows_msvc"): a, "//conditions:default": [], }) +def if_darwin(a): + return select({ + clean_dep("//tensorflow:darwin"): a, + "//conditions:default": [], + }) + +WIN_COPTS = [ + "/DLANG_CXX11", + "/D__VERSION__=\\\"MSVC\\\"", + "/DPLATFORM_WINDOWS", + "/DTF_COMPILE_LIBRARY", + "/DEIGEN_HAS_C99_MATH", + "/DTENSORFLOW_USE_EIGEN_THREADPOOL", +] + # LINT.IfChange def tf_copts(): - return (["-DEIGEN_AVOID_STL_ARRAY", - "-Iexternal/gemmlowp", - "-Wno-sign-compare", - "-fno-exceptions",] + - if_cuda(["-DGOOGLE_CUDA=1"]) + - if_mkl(["-DINTEL_MKL=1"]) + - if_android_arm(["-mfpu=neon"]) + - if_x86(["-msse4.1"]) + - select({ - "//tensorflow:android": [ - "-std=c++11", - "-DTF_LEAN_BINARY", - "-O2", - ], - "//tensorflow:darwin": [], - "//tensorflow:windows": [ - "/DLANG_CXX11", - "/D__VERSION__=\\\"MSVC\\\"", - "/DPLATFORM_WINDOWS", - "/DEIGEN_HAS_C99_MATH", - "/DTENSORFLOW_USE_EIGEN_THREADPOOL", - "/DEIGEN_VECTORIZE_SSE3", # To flush denormals without __SSE3__ set. 
- ], - "//tensorflow:ios": ["-std=c++11"], - "//conditions:default": ["-pthread"]})) + return ([ + "-DEIGEN_AVOID_STL_ARRAY", + "-Iexternal/gemmlowp", + "-Wno-sign-compare", + "-fno-exceptions", + ] + if_cuda(["-DGOOGLE_CUDA=1"]) + if_mkl(["-DINTEL_MKL=1", "-fopenmp",]) + if_android_arm( + ["-mfpu=neon"]) + if_x86(["-msse3"]) + select({ + clean_dep("//tensorflow:android"): [ + "-std=c++11", + "-DTF_LEAN_BINARY", + "-O2", + ], + clean_dep("//tensorflow:darwin"): [], + clean_dep("//tensorflow:windows"): WIN_COPTS, + clean_dep("//tensorflow:windows_msvc"): WIN_COPTS, + clean_dep("//tensorflow:ios"): ["-std=c++11"], + "//conditions:default": ["-pthread"] + })) + def tf_opts_nortti_if_android(): return if_android([ "-fno-rtti", "-DGOOGLE_PROTOBUF_NO_RTTI", "-DGOOGLE_PROTOBUF_NO_STATIC_INITIALIZER", - ]) + ]) + if_android_x86(["-msse4.1"]) + + # LINT.ThenChange(//tensorflow/contrib/android/cmake/CMakeLists.txt) + # Given a list of "op_lib_names" (a list of files in the ops directory # without their .cc extensions), generate a library for that file. def tf_gen_op_libs(op_lib_names, deps=None): @@ -139,16 +181,20 @@ def tf_gen_op_libs(op_lib_names, deps=None): if not deps: deps = [] for n in op_lib_names: - native.cc_library(name=n + "_op_lib", - copts=tf_copts(), - srcs=["ops/" + n + ".cc"], - deps=deps + ["//tensorflow/core:framework"], - visibility=["//visibility:public"], - alwayslink=1, - linkstatic=1,) + native.cc_library( + name=n + "_op_lib", + copts=tf_copts(), + srcs=["ops/" + n + ".cc"], + deps=deps + [clean_dep("//tensorflow/core:framework")], + visibility=["//visibility:public"], + alwayslink=1, + linkstatic=1,) -def tf_gen_op_wrapper_cc(name, out_ops_file, pkg="", - op_gen="//tensorflow/cc:cc_op_gen_main", + +def tf_gen_op_wrapper_cc(name, + out_ops_file, + pkg="", + op_gen=clean_dep("//tensorflow/cc:cc_op_gen_main"), deps=None, override_file=None, include_internal_ops=0): @@ -157,12 +203,11 @@ def tf_gen_op_wrapper_cc(name, out_ops_file, pkg="", if deps == None: deps = [pkg + ":" + name + "_op_lib"] native.cc_binary( - name = tool, - copts = tf_copts(), - linkopts = ["-lm"], - linkstatic = 1, # Faster to link this one-time-use binary dynamically - deps = [op_gen] + deps - ) + name=tool, + copts=tf_copts(), + linkopts=["-lm"], + linkstatic=1, # Faster to link this one-time-use binary dynamically + deps=[op_gen] + deps) if override_file == None: srcs = [] @@ -172,14 +217,17 @@ def tf_gen_op_wrapper_cc(name, out_ops_file, pkg="", override_arg = "$(location " + override_file + ")" native.genrule( name=name + "_genrule", - outs=[out_ops_file + ".h", out_ops_file + ".cc", - out_ops_file + "_internal.h", out_ops_file + "_internal.cc"], + outs=[ + out_ops_file + ".h", out_ops_file + ".cc", + out_ops_file + "_internal.h", out_ops_file + "_internal.cc" + ], srcs=srcs, tools=[":" + tool], cmd=("$(location :" + tool + ") $(location :" + out_ops_file + ".h) " + "$(location :" + out_ops_file + ".cc) " + override_arg + " " + str(include_internal_ops))) + # Given a list of "op_lib_names" (a list of files in the ops directory # without their .cc extensions), generate individual C++ .cc and .h # files for each of the ops files mentioned, and then generate a @@ -206,18 +254,18 @@ def tf_gen_op_wrapper_cc(name, out_ops_file, pkg="", # hdrs = [ "ops/array_ops_internal.h", # "ops/math_ops_internal.h" ], # deps = [ ... ]) -# TODO(josh11b): Cleaner approach for hidden ops. +# TODO(joshl): Cleaner approach for hidden ops. 
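The usage comment above already sketches `tf_gen_op_wrappers_cc` (defined next); `tf_gen_op_libs`, reformatted earlier in this hunk, has no such sketch, so here is a minimal one. The `my_ops` target and its `ops/my_ops.cc` source are invented for illustration:

```python
# Hypothetical BUILD snippet; "my_ops" and ops/my_ops.cc are assumptions.
load("//tensorflow:tensorflow.bzl", "tf_gen_op_libs")

# Creates a cc_library named "my_ops_op_lib" from ops/my_ops.cc, built with
# alwayslink = 1 and linkstatic = 1 so REGISTER_OP registrations are not
# dropped by the linker; clean_dep("//tensorflow/core:framework") is
# appended to deps automatically.
tf_gen_op_libs(op_lib_names = ["my_ops"])
```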
def tf_gen_op_wrappers_cc(name, op_lib_names=[], other_srcs=[], other_hdrs=[], pkg="", deps=[ - "//tensorflow/cc:ops", - "//tensorflow/cc:scope", - "//tensorflow/cc:const_op", + clean_dep("//tensorflow/cc:ops"), + clean_dep("//tensorflow/cc:scope"), + clean_dep("//tensorflow/cc:const_op"), ], - op_gen="//tensorflow/cc:cc_op_gen_main", + op_gen=clean_dep("//tensorflow/cc:cc_op_gen_main"), override_file=None, include_internal_ops=0, visibility=None): @@ -227,59 +275,72 @@ def tf_gen_op_wrappers_cc(name, internalhdrs = [] for n in op_lib_names: tf_gen_op_wrapper_cc( - n, "ops/" + n, pkg=pkg, op_gen=op_gen, override_file=override_file, + n, + "ops/" + n, + pkg=pkg, + op_gen=op_gen, + override_file=override_file, include_internal_ops=include_internal_ops) subsrcs += ["ops/" + n + ".cc"] subhdrs += ["ops/" + n + ".h"] internalsrcs += ["ops/" + n + "_internal.cc"] internalhdrs += ["ops/" + n + "_internal.h"] - native.cc_library(name=name, - srcs=subsrcs, - hdrs=subhdrs, - deps=deps + if_not_android([ - "//tensorflow/core:core_cpu", - "//tensorflow/core:framework", - "//tensorflow/core:lib", - "//tensorflow/core:protos_all_cc", - ]) + if_android([ - "//tensorflow/core:android_tensorflow_lib", - ]), - copts=tf_copts(), - alwayslink=1, - visibility=visibility) - native.cc_library(name=name + "_internal", - srcs=internalsrcs, - hdrs=internalhdrs, - deps=deps + if_not_android([ - "//tensorflow/core:core_cpu", - "//tensorflow/core:framework", - "//tensorflow/core:lib", - "//tensorflow/core:protos_all_cc", - ]) + if_android([ - "//tensorflow/core:android_tensorflow_lib", - ]), - copts=tf_copts(), - alwayslink=1, - visibility=["//tensorflow:internal"]) + native.cc_library( + name=name, + srcs=subsrcs, + hdrs=subhdrs, + deps=deps + if_not_android([ + clean_dep("//tensorflow/core:core_cpu"), + clean_dep("//tensorflow/core:framework"), + clean_dep("//tensorflow/core:lib"), + clean_dep("//tensorflow/core:protos_all_cc"), + ]) + if_android([ + clean_dep("//tensorflow/core:android_tensorflow_lib"), + ]), + copts=tf_copts(), + alwayslink=1, + visibility=visibility) + native.cc_library( + name=name + "_internal", + srcs=internalsrcs, + hdrs=internalhdrs, + deps=deps + if_not_android([ + clean_dep("//tensorflow/core:core_cpu"), + clean_dep("//tensorflow/core:framework"), + clean_dep("//tensorflow/core:lib"), + clean_dep("//tensorflow/core:protos_all_cc"), + ]) + if_android([ + clean_dep("//tensorflow/core:android_tensorflow_lib"), + ]), + copts=tf_copts(), + alwayslink=1, + visibility=[clean_dep("//tensorflow:internal")]) + # Invoke this rule in .../tensorflow/python to build the wrapper library. -def tf_gen_op_wrapper_py(name, out=None, hidden=None, visibility=None, deps=[], - require_shape_functions=False, hidden_file=None, +def tf_gen_op_wrapper_py(name, + out=None, + hidden=None, + visibility=None, + deps=[], + require_shape_functions=False, + hidden_file=None, generated_target_name=None): # Construct a cc_binary containing the specified ops. 
tool_name = "gen_" + name + "_py_wrappers_cc" if not deps: - deps = ["//tensorflow/core:" + name + "_op_lib"] + deps = [str(Label("//tensorflow/core:" + name + "_op_lib"))] native.cc_binary( - name = tool_name, - linkopts = ["-lm"], - copts = tf_copts(), - linkstatic = 1, # Faster to link this one-time-use binary dynamically - deps = (["//tensorflow/core:framework", - "//tensorflow/python:python_op_gen_main"] + deps), - visibility = ["//tensorflow:internal"], - ) + name=tool_name, + linkopts=["-lm"], + copts=tf_copts(), + linkstatic=1, # Faster to link this one-time-use binary dynamically + deps=([ + clean_dep("//tensorflow/core:framework"), + clean_dep("//tensorflow/python:python_op_gen_main") + ] + deps), + visibility=[clean_dep("//tensorflow:internal")],) # Invoke the previous cc_binary to generate a python file. if not out: @@ -291,8 +352,8 @@ def tf_gen_op_wrapper_py(name, out=None, hidden=None, visibility=None, deps=[], name=name + "_pygenrule", outs=[out], tools=[tool_name], - cmd=("$(location " + tool_name + ") " + ",".join(hidden) - + " " + ("1" if require_shape_functions else "0") + " > $@")) + cmd=("$(location " + tool_name + ") " + ",".join(hidden) + " " + + ("1" if require_shape_functions else "0") + " > $@")) elif hidden_file: # `hidden_file` is file containing a list of op names to be hidden in the # generated module. @@ -301,77 +362,143 @@ def tf_gen_op_wrapper_py(name, out=None, hidden=None, visibility=None, deps=[], outs=[out], srcs=[hidden_file], tools=[tool_name], - cmd=("$(location " + tool_name + ") @$(location " - + hidden_file + ") " + ("1" if require_shape_functions else "0") - + " > $@")) + cmd=("$(location " + tool_name + ") @$(location " + hidden_file + ") " + + ("1" if require_shape_functions else "0") + " > $@")) else: # No ops should be hidden in the generated module. native.genrule( name=name + "_pygenrule", outs=[out], tools=[tool_name], - cmd=("$(location " + tool_name + ") " - + ("1" if require_shape_functions else "0") + " > $@")) + cmd=("$(location " + tool_name + ") " + + ("1" if require_shape_functions else "0") + " > $@")) # Make a py_library out of the generated python file. if not generated_target_name: generated_target_name = name - native.py_library(name=generated_target_name, - srcs=[out], - srcs_version="PY2AND3", - visibility=visibility, - deps=[ - "//tensorflow/python:framework_for_generated_wrappers", - ],) + native.py_library( + name=generated_target_name, + srcs=[out], + srcs_version="PY2AND3", + visibility=visibility, + deps=[ + clean_dep("//tensorflow/python:framework_for_generated_wrappers_v2"), + ],) + # Define a bazel macro that creates cc_test for tensorflow. # TODO(opensource): we need to enable this to work around the hidden symbol # __cudaRegisterFatBinary error. Need more investigations. 
-def tf_cc_test(name, srcs, deps, linkstatic=0, tags=[], data=[], size="medium", - suffix="", args=None, linkopts=[]): - native.cc_test(name="%s%s" % (name, suffix), - srcs=srcs, - size=size, - args=args, - copts=tf_copts(), - data=data, - deps=deps, - linkopts=["-lpthread", "-lm"] + linkopts, - linkstatic=linkstatic, - tags=tags) +def tf_cc_test(name, + srcs, + deps, + linkstatic=0, + tags=[], + data=[], + size="medium", + suffix="", + args=None, + linkopts=[]): + native.cc_test( + name="%s%s" % (name, suffix), + srcs=srcs, + size=size, + args=args, + copts=tf_copts(), + data=data, + deps=deps, + linkopts=["-lpthread", "-lm"] + linkopts, + linkstatic=linkstatic, + tags=tags) + # Part of the testing workflow requires a distinguishable name for the build # rules that involve a GPU, even if otherwise identical to the base rule. -def tf_cc_test_gpu(name, srcs, deps, linkstatic=0, tags=[], data=[], - size="medium", suffix="", args=None): - tf_cc_test(name, srcs, deps, linkstatic=linkstatic, tags=tags, data=data, - size=size, suffix=suffix, args=args) +def tf_cc_test_gpu(name, + srcs, + deps, + linkstatic=0, + tags=[], + data=[], + size="medium", + suffix="", + args=None): + tf_cc_test( + name, + srcs, + deps, + linkstatic=linkstatic, + tags=tags, + data=data, + size=size, + suffix=suffix, + args=args) -def tf_cuda_cc_test(name, srcs=[], deps=[], tags=[], data=[], size="medium", - linkstatic=0, args=[], linkopts=[]): - tf_cc_test(name=name, - srcs=srcs, - deps=deps, - tags=tags + ["manual"], - data=data, - size=size, - linkstatic=linkstatic, - linkopts=linkopts, - args=args) - tf_cc_test(name=name, - srcs=srcs, - suffix="_gpu", - deps=deps + if_cuda(["//tensorflow/core:gpu_runtime"]), - linkstatic=if_cuda(1, 0), - tags=tags + tf_cuda_tests_tags(), - data=data, - size=size, - linkopts=linkopts, - args=args) + +def tf_cuda_cc_test(name, + srcs=[], + deps=[], + tags=[], + data=[], + size="medium", + linkstatic=0, + args=[], + linkopts=[]): + tf_cc_test( + name=name, + srcs=srcs, + deps=deps, + tags=tags + ["manual"], + data=data, + size=size, + linkstatic=linkstatic, + linkopts=linkopts, + args=args) + tf_cc_test( + name=name, + srcs=srcs, + suffix="_gpu", + deps=deps + if_cuda([clean_dep("//tensorflow/core:gpu_runtime")]), + linkstatic=if_cuda(1, 0), + tags=tags + tf_cuda_tests_tags(), + data=data, + size=size, + linkopts=linkopts, + args=args) + +def tf_cuda_only_cc_test(name, + srcs=[], + deps=[], + tags=[], + data=[], + size="medium", + linkstatic=0, + args=[], + linkopts=[]): + native.cc_test( + name="%s%s" % (name, "_gpu"), + srcs=srcs, + size=size, + args=args, + copts= _cuda_copts() + tf_copts(), + data=data, + deps=deps + if_cuda([ + clean_dep("//tensorflow/core:cuda"), + clean_dep("//tensorflow/core:gpu_lib"), + ]), + linkopts=["-lpthread", "-lm"] + linkopts, + linkstatic=linkstatic, + tags=tags) # Create a cc_test for each of the tensorflow tests listed in "tests" -def tf_cc_tests(srcs, deps, name='', linkstatic=0, tags=[], size="medium", - args=None, linkopts=[]): +def tf_cc_tests(srcs, + deps, + name="", + linkstatic=0, + tags=[], + size="medium", + args=None, + linkopts=[]): for src in srcs: tf_cc_test( name=src_to_test_name(src), @@ -383,17 +510,35 @@ def tf_cc_tests(srcs, deps, name='', linkstatic=0, tags=[], size="medium", args=args, linkopts=linkopts) -def tf_cc_test_mkl(srcs, deps, name='', linkstatic=0, tags=[], size="medium", - args=None): - tf_cc_tests(srcs, deps, linkstatic, tags=tags, size=size, args=args) -def tf_cc_tests_gpu(srcs, deps, name='', linkstatic=0, tags=[], 
size="medium", +def tf_cc_test_mkl(srcs, + deps, + name="", + linkstatic=0, + tags=[], + size="medium", + args=None): + if_mkl(tf_cc_tests(srcs, deps, linkstatic, tags=tags, size=size, args=args)) + + +def tf_cc_tests_gpu(srcs, + deps, + name="", + linkstatic=0, + tags=[], + size="medium", args=None): tf_cc_tests(srcs, deps, linkstatic, tags=tags, size=size, args=args) -def tf_cuda_cc_tests(srcs, deps, name='', tags=[], size="medium", linkstatic=0, - args=None, linkopts=[]): +def tf_cuda_cc_tests(srcs, + deps, + name="", + tags=[], + size="medium", + linkstatic=0, + args=None, + linkopts=[]): for src in srcs: tf_cuda_cc_test( name=src_to_test_name(src), @@ -405,48 +550,52 @@ def tf_cuda_cc_tests(srcs, deps, name='', tags=[], size="medium", linkstatic=0, args=args, linkopts=linkopts) + def _cuda_copts(): - """Gets the appropriate set of copts for (maybe) CUDA compilation. + """Gets the appropriate set of copts for (maybe) CUDA compilation. If we're doing CUDA compilation, returns copts for our particular CUDA compiler. If we're not doing CUDA compilation, returns an empty list. """ - return cuda_default_copts() + select({ - "//conditions:default": [], - "@local_config_cuda//cuda:using_nvcc": ( - [ - "-nvcc_options=relaxed-constexpr", - "-nvcc_options=ftz=true", - ] - ), - "@local_config_cuda//cuda:using_clang": ( - [ - "-fcuda-flush-denormals-to-zero", - ] - ), - }) + return cuda_default_copts() + select({ + "//conditions:default": [], + "@local_config_cuda//cuda:using_nvcc": ([ + "-nvcc_options=relaxed-constexpr", + "-nvcc_options=ftz=true", + ]), + "@local_config_cuda//cuda:using_clang": ([ + "-fcuda-flush-denormals-to-zero", + ]), + }) + # Build defs for TensorFlow kernels + # When this target is built using --config=cuda, a cc_library is built # that passes -DGOOGLE_CUDA=1 and '-x cuda', linking in additional # libraries needed by GPU kernels. -def tf_gpu_kernel_library(srcs, copts=[], cuda_copts=[], deps=[], hdrs=[], +def tf_gpu_kernel_library(srcs, + copts=[], + cuda_copts=[], + deps=[], + hdrs=[], **kwargs): copts = copts + _cuda_copts() + if_cuda(cuda_copts) + tf_copts() native.cc_library( - srcs = srcs, - hdrs = hdrs, - copts = copts, - deps = deps + if_cuda([ - "//tensorflow/core:cuda", - "//tensorflow/core:gpu_lib", + srcs=srcs, + hdrs=hdrs, + copts=copts, + deps=deps + if_cuda([ + clean_dep("//tensorflow/core:cuda"), + clean_dep("//tensorflow/core:gpu_lib"), ]), alwayslink=1, **kwargs) + def tf_cuda_library(deps=None, cuda_deps=None, copts=None, **kwargs): """Generate a cc_library with a conditional set of CUDA dependencies. @@ -471,15 +620,23 @@ def tf_cuda_library(deps=None, cuda_deps=None, copts=None, **kwargs): copts = [] native.cc_library( - deps = deps + if_cuda(cuda_deps + [ - "//tensorflow/core:cuda", + deps=deps + if_cuda(cuda_deps + [ + clean_dep("//tensorflow/core:cuda"), "@local_config_cuda//cuda:cuda_headers" ]), - copts = copts + if_cuda(["-DGOOGLE_CUDA=1"]) + if_mkl(["-DINTEL_MKL=1"]), + copts=copts + if_cuda(["-DGOOGLE_CUDA=1"]) + if_mkl(["-DINTEL_MKL=1"]), **kwargs) -def tf_kernel_library(name, prefix=None, srcs=None, gpu_srcs=None, hdrs=None, - deps=None, alwayslink=1, copts=tf_copts(), **kwargs): + +def tf_kernel_library(name, + prefix=None, + srcs=None, + gpu_srcs=None, + hdrs=None, + deps=None, + alwayslink=1, + copts=tf_copts(), + **kwargs): """A rule to build a TensorFlow OpKernel. May either specify srcs/hdrs or prefix. 
Similar to tf_cuda_library, @@ -509,37 +666,59 @@ def tf_kernel_library(name, prefix=None, srcs=None, gpu_srcs=None, hdrs=None, deps = [] if prefix: - if native.glob([prefix + "*.cu.cc"], exclude = ["*test*"]): + if native.glob([prefix + "*.cu.cc"], exclude=["*test*"]): if not gpu_srcs: gpu_srcs = [] - gpu_srcs = gpu_srcs + native.glob([prefix + "*.cu.cc", prefix + "*.h"], - exclude = ["*test*"]) - srcs = srcs + native.glob([prefix + "*.cc"], - exclude = ["*test*", "*.cu.cc"]) - hdrs = hdrs + native.glob([prefix + "*.h"], exclude = ["*test*", "*.cu.h"]) + gpu_srcs = gpu_srcs + native.glob( + [prefix + "*.cu.cc", prefix + "*.h"], exclude=[prefix + "*test*"]) + srcs = srcs + native.glob( + [prefix + "*.cc"], exclude=[prefix + "*test*", prefix + "*.cu.cc"]) + hdrs = hdrs + native.glob( + [prefix + "*.h"], exclude=[prefix + "*test*", prefix + "*.cu.h"]) - cuda_deps = ["//tensorflow/core:gpu_lib"] + cuda_deps = [clean_dep("//tensorflow/core:gpu_lib")] if gpu_srcs: for gpu_src in gpu_srcs: if gpu_src.endswith(".cc") and not gpu_src.endswith(".cu.cc"): - fail("{} not allowed in gpu_srcs. .cc sources must end with .cu.cc".format(gpu_src)) + fail("{} not allowed in gpu_srcs. .cc sources must end with .cu.cc". + format(gpu_src)) tf_gpu_kernel_library( - name = name + "_gpu", - srcs = gpu_srcs, - deps = deps, - **kwargs) + name=name + "_gpu", srcs=gpu_srcs, deps=deps, **kwargs) cuda_deps.extend([":" + name + "_gpu"]) tf_cuda_library( - name = name, - srcs = srcs, - hdrs = hdrs, - copts = copts, - cuda_deps = cuda_deps, - linkstatic = 1, # Needed since alwayslink is broken in bazel b/27630669 - alwayslink = alwayslink, - deps = deps, + name=name, + srcs=srcs, + hdrs=hdrs, + copts=copts, + cuda_deps=cuda_deps, + linkstatic=1, # Needed since alwayslink is broken in bazel b/27630669 + alwayslink=alwayslink, + deps=deps, **kwargs) + +def tf_mkl_kernel_library(name, + prefix=None, + srcs=None, + gpu_srcs=None, + hdrs=None, + deps=None, + alwayslink=1, + copts=tf_copts(), + **kwargs): + if_mkl( + tf_kernel_library( + name, + prefix=prefix, + srcs=srcs, + gpu_srcs=gpu_srcs, + hdrs=hdrs, + deps=deps, + alwayslink=alwayslink, + copts=copts, + **kwargs)) + + # Bazel rules for building swig files. 
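Before the SWIG rules that the header comment above opens, a usage sketch for `tf_kernel_library` as reformatted in this hunk (the new `tf_mkl_kernel_library` wraps the same invocation in `if_mkl`); the `relu_op` file set is an assumption:

```python
# Hypothetical kernel target; relu_op.cc/.h and relu_op.cu.cc are assumed to exist.
load("//tensorflow:tensorflow.bzl", "tf_kernel_library")

tf_kernel_library(
    name = "relu_op",
    # prefix globs relu_op*.cc and relu_op*.h for the CPU build, and
    # relu_op*.cu.cc for the GPU build, excluding relu_op*test* files
    # per the tightened exclude patterns in this change.
    prefix = "relu_op",
    deps = ["//tensorflow/core:framework"],
)
```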
def _py_wrap_cc_impl(ctx): srcs = ctx.files.srcs @@ -555,59 +734,61 @@ def _py_wrap_cc_impl(ctx): inputs += ctx.files.toolchain_deps swig_include_dirs = set(_get_repository_roots(ctx, inputs)) swig_include_dirs += sorted([f.dirname for f in ctx.files._swiglib]) - args = ["-c++", - "-python", - "-module", module_name, - "-o", ctx.outputs.cc_out.path, - "-outdir", ctx.outputs.py_out.dirname] + args = [ + "-c++", "-python", "-module", module_name, "-o", ctx.outputs.cc_out.path, + "-outdir", ctx.outputs.py_out.dirname + ] args += ["-l" + f.path for f in ctx.files.swig_includes] args += ["-I" + i for i in swig_include_dirs] args += [src.path] - outputs = [ctx.outputs.cc_out, - ctx.outputs.py_out] - ctx.action(executable=ctx.executable._swig, - arguments=args, - inputs=list(inputs), - outputs=outputs, - mnemonic="PythonSwig", - progress_message="SWIGing " + src.path) + outputs = [ctx.outputs.cc_out, ctx.outputs.py_out] + ctx.action( + executable=ctx.executable._swig, + arguments=args, + inputs=list(inputs), + outputs=outputs, + mnemonic="PythonSwig", + progress_message="SWIGing " + src.path) return struct(files=set(outputs)) + _py_wrap_cc = rule( - attrs = { - "srcs": attr.label_list( - mandatory = True, - allow_files = True, - ), - "swig_includes": attr.label_list( - cfg = "data", - allow_files = True, - ), - "deps": attr.label_list( - allow_files = True, - providers = ["cc"], - ), - "toolchain_deps": attr.label_list( - allow_files = True, - ), - "module_name": attr.string(mandatory = True), - "py_module_name": attr.string(mandatory = True), - "_swig": attr.label( - default = Label("@swig//:swig"), - executable = True, - cfg = "host", - ), - "_swiglib": attr.label( - default = Label("@swig//:templates"), - allow_files = True, - ), + attrs={ + "srcs": + attr.label_list( + mandatory=True, + allow_files=True,), + "swig_includes": + attr.label_list( + cfg="data", + allow_files=True,), + "deps": + attr.label_list( + allow_files=True, + providers=["cc"],), + "toolchain_deps": + attr.label_list( + allow_files=True,), + "module_name": + attr.string(mandatory=True), + "py_module_name": + attr.string(mandatory=True), + "_swig": + attr.label( + default=Label("@swig//:swig"), + executable=True, + cfg="host",), + "_swiglib": + attr.label( + default=Label("@swig//:templates"), + allow_files=True,), }, - outputs = { + outputs={ "cc_out": "%{module_name}.cc", "py_out": "%{py_module_name}.py", }, - implementation = _py_wrap_cc_impl, -) + implementation=_py_wrap_cc_impl,) + def _get_repository_roots(ctx, files): """Returns abnormal root directories under which files reside. @@ -638,6 +819,7 @@ def _get_repository_roots(ctx, files): result[root] -= 1 return [k for v, k in sorted([(v, k) for k, v in result.items()])] + # Bazel rule for collecting the header files that a target depends on. 
def _transitive_hdrs_impl(ctx): outputs = set() @@ -645,38 +827,36 @@ def _transitive_hdrs_impl(ctx): outputs += dep.cc.transitive_headers return struct(files=outputs) + _transitive_hdrs = rule( - attrs = { + attrs={ "deps": attr.label_list( - allow_files = True, - providers = ["cc"], - ), + allow_files=True, + providers=["cc"],), }, - implementation = _transitive_hdrs_impl, -) + implementation=_transitive_hdrs_impl,) + def transitive_hdrs(name, deps=[], **kwargs): - _transitive_hdrs(name=name + "_gather", - deps=deps) - native.filegroup(name=name, - srcs=[":" + name + "_gather"]) + _transitive_hdrs(name=name + "_gather", deps=deps) + native.filegroup(name=name, srcs=[":" + name + "_gather"]) + # Create a header only library that includes all the headers exported by # the libraries in deps. def cc_header_only_library(name, deps=[], **kwargs): - _transitive_hdrs(name=name + "_gather", - deps=deps) - native.cc_library(name=name, - hdrs=[":" + name + "_gather"], - **kwargs) + _transitive_hdrs(name=name + "_gather", deps=deps) + native.cc_library(name=name, hdrs=[":" + name + "_gather"], **kwargs) + def tf_custom_op_library_additional_deps(): return [ - "@protobuf//:protobuf", - "//third_party/eigen3", - "//tensorflow/core:framework_headers_lib", + "@protobuf//:protobuf_headers", + clean_dep("//third_party/eigen3"), + clean_dep("//tensorflow/core:framework_headers_lib"), ] + # Traverse the dependency graph along the "deps" attribute of the # target and return a struct with one field called 'tf_collected_deps'. # tf_collected_deps will be the union of the deps of the current target @@ -690,14 +870,16 @@ def _collect_deps_aspect_impl(target, ctx): alldeps = alldeps | dep.tf_collected_deps return struct(tf_collected_deps=alldeps) + collect_deps_aspect = aspect( - implementation=_collect_deps_aspect_impl, - attr_aspects=["deps"]) + implementation=_collect_deps_aspect_impl, attr_aspects=["deps"]) + def _dep_label(dep): label = dep.label return label.package + ":" + label.name + # This rule checks that the transitive dependencies of targets listed # in the 'deps' attribute don't depend on the targets listed in # the 'disallowed_deps' attribute. @@ -709,137 +891,177 @@ def _check_deps_impl(ctx): for dep in input_dep.tf_collected_deps: for disallowed_dep in disallowed_deps: if dep == disallowed_dep.label: - fail(_dep_label(input_dep) + " cannot depend on " + - _dep_label(disallowed_dep)) + fail( + _dep_label(input_dep) + " cannot depend on " + _dep_label( + disallowed_dep)) return struct() + check_deps = rule( _check_deps_impl, - attrs = { - "deps": attr.label_list( - aspects=[collect_deps_aspect], - mandatory = True, - allow_files = True - ), - "disallowed_deps": attr.label_list( - mandatory = True, - allow_files = True - )}, -) + attrs={ + "deps": + attr.label_list( + aspects=[collect_deps_aspect], mandatory=True, + allow_files=True), + "disallowed_deps": + attr.label_list(mandatory=True, allow_files=True) + },) + # Helper to build a dynamic library (.so) from the sources containing # implementations of custom ops and kernels. 
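The comment above introduces `tf_custom_op_library`, defined immediately below. A typical invocation might look like the following sketch, in the spirit of the adding-an-op documentation; the `zero_out` names are illustrative, and only the header-only deps from `tf_custom_op_library_additional_deps()` are linked, since `check_deps` rejects `//tensorflow/core:framework` and `//tensorflow/core:lib`:

```python
# Hypothetical custom-op shared library; zero_out_op.* sources are assumptions.
load("//tensorflow:tensorflow.bzl", "tf_custom_op_library")

tf_custom_op_library(
    name = "zero_out.so",              # basename "zero_out" names the _gpu sub-library
    srcs = ["zero_out_op.cc"],
    gpu_srcs = ["zero_out_op.cu.cc"],  # compiled with _cuda_copts() under --config=cuda
)
```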
def tf_custom_op_library(name, srcs=[], gpu_srcs=[], deps=[]): cuda_deps = [ - "//tensorflow/core:stream_executor_headers_lib", + clean_dep("//tensorflow/core:stream_executor_headers_lib"), "@local_config_cuda//cuda:cudart_static", ] deps = deps + tf_custom_op_library_additional_deps() if gpu_srcs: basename = name.split(".")[0] native.cc_library( - name = basename + "_gpu", - srcs = gpu_srcs, - copts = _cuda_copts(), - deps = deps + if_cuda(cuda_deps)) + name=basename + "_gpu", + srcs=gpu_srcs, + copts=_cuda_copts(), + deps=deps + if_cuda(cuda_deps)) cuda_deps.extend([":" + basename + "_gpu"]) - check_deps(name=name+"_check_deps", - deps=deps + if_cuda(cuda_deps), - disallowed_deps=["//tensorflow/core:framework", - "//tensorflow/core:lib"]) + check_deps( + name=name + "_check_deps", + deps=deps + if_cuda(cuda_deps), + disallowed_deps=[ + clean_dep("//tensorflow/core:framework"), + clean_dep("//tensorflow/core:lib") + ]) + + native.cc_binary( + name=name, + srcs=srcs, + deps=deps + if_cuda(cuda_deps), + data=[name + "_check_deps"], + copts=tf_copts(), + linkshared=1, + linkopts=select({ + "//conditions:default": [ + "-lm", + ], + clean_dep("//tensorflow:darwin"): [], + }),) + + +def tf_custom_op_py_library(name, + srcs=[], + dso=[], + kernels=[], + srcs_version="PY2AND3", + visibility=None, + deps=[]): + kernels = kernels # unused argument + native.py_library( + name=name, + data=dso, + srcs=srcs, + srcs_version=srcs_version, + visibility=visibility, + deps=deps,) - native.cc_binary(name=name, - srcs=srcs, - deps=deps + if_cuda(cuda_deps), - data=[name + "_check_deps"], - copts=tf_copts(), - linkshared=1, - linkopts = select({ - "//conditions:default": [ - "-lm", - ], - "//tensorflow:darwin": [], - }), - ) def tf_extension_linkopts(): return [] # No extension link opts + def tf_extension_copts(): return [] # No extension c opts -def tf_py_wrap_cc(name, srcs, swig_includes=[], deps=[], copts=[], **kwargs): + +def tf_py_wrap_cc(name, + srcs, + swig_includes=[], + deps=[], + copts=[], + **kwargs): module_name = name.split("/")[-1] # Convert a rule name such as foo/bar/baz to foo/bar/_baz.so # and use that as the name for the rule producing the .so file. 
cc_library_name = "/".join(name.split("/")[:-1] + ["_" + module_name + ".so"]) - cc_library_pyd_name = "/".join(name.split("/")[:-1] + ["_" + module_name + ".pyd"]) + cc_library_pyd_name = "/".join( + name.split("/")[:-1] + ["_" + module_name + ".pyd"]) extra_deps = [] - _py_wrap_cc(name=name + "_py_wrap", - srcs=srcs, - swig_includes=swig_includes, - deps=deps + extra_deps, - toolchain_deps=["//tools/defaults:crosstool"], - module_name=module_name, - py_module_name=name) + _py_wrap_cc( + name=name + "_py_wrap", + srcs=srcs, + swig_includes=swig_includes, + deps=deps + extra_deps, + toolchain_deps=["//tools/defaults:crosstool"], + module_name=module_name, + py_module_name=name) extra_linkopts = select({ "@local_config_cuda//cuda:darwin": [ "-Wl,-exported_symbols_list", - "//tensorflow:tf_exported_symbols.lds" - ], - "//tensorflow:windows": [ + clean_dep("//tensorflow:tf_exported_symbols.lds") ], + clean_dep("//tensorflow:windows"): [], + clean_dep("//tensorflow:windows_msvc"): [], "//conditions:default": [ "-Wl,--version-script", - "//tensorflow:tf_version_script.lds" - ]}) + clean_dep("//tensorflow:tf_version_script.lds") + ] + }) extra_deps += select({ "@local_config_cuda//cuda:darwin": [ - "//tensorflow:tf_exported_symbols.lds" - ], - "//tensorflow:windows": [ + clean_dep("//tensorflow:tf_exported_symbols.lds") ], + clean_dep("//tensorflow:windows"): [], + clean_dep("//tensorflow:windows_msvc"): [], "//conditions:default": [ - "//tensorflow:tf_version_script.lds" + clean_dep("//tensorflow:tf_version_script.lds") ] }) native.cc_binary( name=cc_library_name, srcs=[module_name + ".cc"], - copts=(copts + ["-Wno-self-assign", - "-Wno-sign-compare", - "-Wno-write-strings"] - + tf_extension_copts()), + copts=(copts + [ + "-Wno-self-assign", "-Wno-sign-compare", "-Wno-write-strings" + ] + tf_extension_copts()), linkopts=tf_extension_linkopts() + extra_linkopts, linkstatic=1, linkshared=1, deps=deps + extra_deps) native.genrule( - name = "gen_" + cc_library_pyd_name, - srcs = [":" + cc_library_name], - outs = [cc_library_pyd_name], - cmd = "cp $< $@", - ) - native.py_library(name=name, - srcs=[":" + name + ".py"], - srcs_version="PY2AND3", - data=select({ - "//tensorflow:windows": [":" + cc_library_pyd_name], - "//conditions:default": [":" + cc_library_name], - })) + name="gen_" + cc_library_pyd_name, + srcs=[":" + cc_library_name], + outs=[cc_library_pyd_name], + cmd="cp $< $@",) + native.py_library( + name=name, + srcs=[":" + name + ".py"], + srcs_version="PY2AND3", + data=select({ + clean_dep("//tensorflow:windows"): [":" + cc_library_pyd_name], + "//conditions:default": [":" + cc_library_name], + })) + def py_test(deps=[], **kwargs): native.py_test( deps=select({ - "//conditions:default" : deps, - "//tensorflow:no_tensorflow_py_deps" : [] + "//conditions:default": deps, + clean_dep("//tensorflow:no_tensorflow_py_deps"): [] }), **kwargs) -def tf_py_test(name, srcs, size="medium", data=[], main=None, args=[], - tags=[], shard_count=1, additional_deps=[], flaky=0, + +def tf_py_test(name, + srcs, + size="medium", + data=[], + main=None, + args=[], + tags=[], + shard_count=1, + additional_deps=[], + flaky=0, xla_enabled=False): if xla_enabled: additional_deps += tf_additional_xla_deps_py() @@ -850,50 +1072,71 @@ def tf_py_test(name, srcs, size="medium", data=[], main=None, args=[], main=main, args=args, tags=tags, - visibility=["//tensorflow:internal"], + visibility=[clean_dep("//tensorflow:internal")], shard_count=shard_count, data=data, deps=select({ - "//conditions:default" : [ - 
"//tensorflow/python:extra_py_tests_deps", - "//tensorflow/python:gradient_checker", + "//conditions:default": [ + clean_dep("//tensorflow/python:extra_py_tests_deps"), + clean_dep("//tensorflow/python:gradient_checker"), ] + additional_deps, - "//tensorflow:no_tensorflow_py_deps" : [] + clean_dep("//tensorflow:no_tensorflow_py_deps"): [] }), flaky=flaky, srcs_version="PY2AND3") -def cuda_py_test(name, srcs, size="medium", data=[], main=None, args=[], - shard_count=1, additional_deps=[], tags=[], flaky=0, + +def cuda_py_test(name, + srcs, + size="medium", + data=[], + main=None, + args=[], + shard_count=1, + additional_deps=[], + tags=[], + flaky=0, xla_enabled=False): test_tags = tags + tf_cuda_tests_tags() - tf_py_test(name=name, - size=size, - srcs=srcs, - data=data, - main=main, - args=args, - tags=test_tags, - shard_count=shard_count, - additional_deps=additional_deps, - flaky=flaky, - xla_enabled=xla_enabled) + tf_py_test( + name=name, + size=size, + srcs=srcs, + data=data, + main=main, + args=args, + tags=test_tags, + shard_count=shard_count, + additional_deps=additional_deps, + flaky=flaky, + xla_enabled=xla_enabled) -def sycl_py_test(name, srcs, size="medium", data=[], main=None, args=[], - shard_count=1, additional_deps=[], tags=[], flaky=0, + +def sycl_py_test(name, + srcs, + size="medium", + data=[], + main=None, + args=[], + shard_count=1, + additional_deps=[], + tags=[], + flaky=0, xla_enabled=False): - test_tags = tags + tf_sycl_tests_tags() - tf_py_test(name=name, - size=size, - srcs=srcs, - data=data, - main=main, - args=args, - tags=test_tags, - shard_count=shard_count, - additional_deps=additional_deps, - flaky=flaky, - xla_enabled=xla_enabled) + test_tags = tags + tf_sycl_tests_tags() + tf_py_test( + name=name, + size=size, + srcs=srcs, + data=data, + main=main, + args=args, + tags=test_tags, + shard_count=shard_count, + additional_deps=additional_deps, + flaky=flaky, + xla_enabled=xla_enabled) + def py_tests(name, srcs, @@ -908,22 +1151,39 @@ def py_tests(name, test_name = src.split("/")[-1].split(".")[0] if prefix: test_name = "%s_%s" % (prefix, test_name) - tf_py_test(name=test_name, - size=size, - srcs=[src], - main=src, - tags=tags, - shard_count=shard_count, - data=data, - additional_deps=additional_deps, - xla_enabled=xla_enabled) + tf_py_test( + name=test_name, + size=size, + srcs=[src], + main=src, + tags=tags, + shard_count=shard_count, + data=data, + additional_deps=additional_deps, + xla_enabled=xla_enabled) -def cuda_py_tests(name, srcs, size="medium", additional_deps=[], data=[], - shard_count=1, tags=[], prefix="", xla_enabled=False): + +def cuda_py_tests(name, + srcs, + size="medium", + additional_deps=[], + data=[], + shard_count=1, + tags=[], + prefix="", + xla_enabled=False): test_tags = tags + tf_cuda_tests_tags() - py_tests(name=name, size=size, srcs=srcs, additional_deps=additional_deps, - data=data, tags=test_tags, shard_count=shard_count,prefix=prefix, - xla_enabled=xla_enabled) + py_tests( + name=name, + size=size, + srcs=srcs, + additional_deps=additional_deps, + data=data, + tags=test_tags, + shard_count=shard_count, + prefix=prefix, + xla_enabled=xla_enabled) + # Creates a genrule named for running tools/proto_text's generator to # make the proto_text functions, for the protos passed in . @@ -931,40 +1191,46 @@ def cuda_py_tests(name, srcs, size="medium", additional_deps=[], data=[], # Return a struct with fields (hdrs, srcs) containing the names of the # generated files. 
def tf_generate_proto_text_sources(name, srcs_relative_dir, srcs): - out_hdrs = ([p.replace(".proto", ".pb_text.h") for p in srcs] + - [p.replace(".proto", ".pb_text-impl.h") for p in srcs]) + out_hdrs = ( + [p.replace(".proto", ".pb_text.h") + for p in srcs] + [p.replace(".proto", ".pb_text-impl.h") for p in srcs]) out_srcs = [p.replace(".proto", ".pb_text.cc") for p in srcs] native.genrule( - name = name, - srcs = srcs + ["//tensorflow/tools/proto_text:placeholder.txt"], - outs = out_hdrs + out_srcs, - cmd = "$(location //tensorflow/tools/proto_text:gen_proto_text_functions) " + - "$(@D) " + srcs_relative_dir + " $(SRCS)", - tools = ["//tensorflow/tools/proto_text:gen_proto_text_functions"], - ) + name=name, + srcs=srcs + [clean_dep("//tensorflow/tools/proto_text:placeholder.txt")], + outs=out_hdrs + out_srcs, + cmd= + "$(location //tensorflow/tools/proto_text:gen_proto_text_functions) " + + "$(@D) " + srcs_relative_dir + " $(SRCS)", + tools=[ + clean_dep("//tensorflow/tools/proto_text:gen_proto_text_functions") + ],) return struct(hdrs=out_hdrs, srcs=out_srcs) + def tf_genrule_cmd_append_to_srcs(to_append): - return ("cat $(SRCS) > $(@) && " + - "echo >> $(@) && " + - "echo " + to_append + " >> $(@)") + return ("cat $(SRCS) > $(@) && " + "echo >> $(@) && " + "echo " + to_append + + " >> $(@)") def tf_version_info_genrule(): native.genrule( - name = "version_info_gen", - srcs = [ - "//tensorflow/tools/git:gen/spec.json", - "//tensorflow/tools/git:gen/head", - "//tensorflow/tools/git:gen/branch_ref", + name="version_info_gen", + srcs=[ + clean_dep("//tensorflow/tools/git:gen/spec.json"), + clean_dep("//tensorflow/tools/git:gen/head"), + clean_dep("//tensorflow/tools/git:gen/branch_ref"), ], - outs = ["util/version_info.cc"], - cmd = "$(location //tensorflow/tools/git:gen_git_source.py) --generate $(SRCS) \"$@\"", - local = 1, - tools = ["//tensorflow/tools/git:gen_git_source.py"], - ) + outs=["util/version_info.cc"], + cmd= + "$(location //tensorflow/tools/git:gen_git_source.py) --generate $(SRCS) \"$@\"", + local=1, + tools=[clean_dep("//tensorflow/tools/git:gen_git_source.py")],) -def cc_library_with_android_deps(deps, android_deps=[], - common_deps=[], **kwargs): + +def cc_library_with_android_deps(deps, + android_deps=[], + common_deps=[], + **kwargs): deps = if_not_android(deps) + if_android(android_deps) + common_deps native.cc_library(deps=deps, **kwargs) diff --git a/tensorflow/tf_exported_symbols.lds b/tensorflow/tf_exported_symbols.lds index cb81e89922c..1f4d900ec2b 100644 --- a/tensorflow/tf_exported_symbols.lds +++ b/tensorflow/tf_exported_symbols.lds @@ -1,3 +1,4 @@ *tensorflow* *perftools*gputools* *tf_* +TF_* diff --git a/tensorflow/tf_version_script.lds b/tensorflow/tf_version_script.lds index 8c8c8be5a93..b368f7cf21d 100644 --- a/tensorflow/tf_version_script.lds +++ b/tensorflow/tf_version_script.lds @@ -2,6 +2,7 @@ tensorflow { global: *tensorflow*; *perftools*gputools*; + TF_*; local: *; }; diff --git a/tensorflow/tools/api/golden/BUILD b/tensorflow/tools/api/golden/BUILD new file mode 100644 index 00000000000..08436396a6c --- /dev/null +++ b/tensorflow/tools/api/golden/BUILD @@ -0,0 +1,24 @@ +# TensorFlow API backwards compatibility test goldens. 
+ +package( + default_visibility = ["//tensorflow/tools/api:__subpackages__"], +) + +licenses(["notice"]) # Apache 2.0 + +filegroup( + name = "api_golden", + srcs = glob(["*.pbtxt"]), +) + +filegroup( + name = "all_files", + srcs = glob( + ["**/*"], + exclude = [ + "**/METADATA", + "**/OWNERS", + ], + ), + visibility = ["//tensorflow:__subpackages__"], +) diff --git a/tensorflow/tools/api/golden/tensorflow.-aggregation-method.pbtxt b/tensorflow/tools/api/golden/tensorflow.-aggregation-method.pbtxt new file mode 100644 index 00000000000..f79029d3fe0 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-aggregation-method.pbtxt @@ -0,0 +1,24 @@ +path: "tensorflow.AggregationMethod" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "ADD_N" + mtype: "" + } + member { + name: "DEFAULT" + mtype: "" + } + member { + name: "EXPERIMENTAL_ACCUMULATE_N" + mtype: "" + } + member { + name: "EXPERIMENTAL_TREE" + mtype: "" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-attr-value.-list-value.pbtxt b/tensorflow/tools/api/golden/tensorflow.-attr-value.-list-value.pbtxt new file mode 100644 index 00000000000..0fb1aaba283 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-attr-value.-list-value.pbtxt @@ -0,0 +1,108 @@ +path: "tensorflow.AttrValue.ListValue" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "B_FIELD_NUMBER" + mtype: "" + } + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "FUNC_FIELD_NUMBER" + mtype: "" + } + member { + name: "F_FIELD_NUMBER" + mtype: "" + } + member { + name: "I_FIELD_NUMBER" + mtype: "" + } + member { + name: "SHAPE_FIELD_NUMBER" + mtype: "" + } + member { + name: "S_FIELD_NUMBER" + mtype: "" + } + member { + name: "TENSOR_FIELD_NUMBER" + mtype: "" + } + member { + name: "TYPE_FIELD_NUMBER" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-attr-value.pbtxt b/tensorflow/tools/api/golden/tensorflow.-attr-value.pbtxt new file mode 100644 index 00000000000..e7a3a1f02fa --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-attr-value.pbtxt @@ -0,0 +1,120 @@ +path: "tensorflow.AttrValue" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "B_FIELD_NUMBER" + mtype: "" + } + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "FUNC_FIELD_NUMBER" + mtype: "" + } + member { + name: "F_FIELD_NUMBER" + mtype: "" + } + member { + name: "I_FIELD_NUMBER" + mtype: "" + } + member 
{ + name: "LIST_FIELD_NUMBER" + mtype: "" + } + member { + name: "ListValue" + mtype: "" + } + member { + name: "PLACEHOLDER_FIELD_NUMBER" + mtype: "" + } + member { + name: "SHAPE_FIELD_NUMBER" + mtype: "" + } + member { + name: "S_FIELD_NUMBER" + mtype: "" + } + member { + name: "TENSOR_FIELD_NUMBER" + mtype: "" + } + member { + name: "TYPE_FIELD_NUMBER" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-conditional-accumulator-base.pbtxt b/tensorflow/tools/api/golden/tensorflow.-conditional-accumulator-base.pbtxt new file mode 100644 index 00000000000..c9a32c16b34 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-conditional-accumulator-base.pbtxt @@ -0,0 +1,29 @@ +path: "tensorflow.ConditionalAccumulatorBase" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "accumulator_ref" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'dtype\', \'shape\', \'accumulator_ref\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "num_accumulated" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "set_global_step" + argspec: "args=[\'self\', \'new_global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-conditional-accumulator.pbtxt b/tensorflow/tools/api/golden/tensorflow.-conditional-accumulator.pbtxt new file mode 100644 index 00000000000..d23b3bd0cae --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-conditional-accumulator.pbtxt @@ -0,0 +1,38 @@ +path: "tensorflow.ConditionalAccumulator" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "accumulator_ref" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'dtype\', \'shape\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'conditional_accumulator\'], " + } + member_method { + name: "apply_grad" + argspec: "args=[\'self\', \'grad\', \'local_step\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\'], " + } + member_method { + name: "num_accumulated" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "set_global_step" + argspec: "args=[\'self\', \'new_global_step\', \'name\'], varargs=None, 
keywords=None, defaults=[\'None\'], " + } + member_method { + name: "take_grad" + argspec: "args=[\'self\', \'num_required\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-config-proto.-device-count-entry.pbtxt b/tensorflow/tools/api/golden/tensorflow.-config-proto.-device-count-entry.pbtxt new file mode 100644 index 00000000000..29bb3be35cb --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-config-proto.-device-count-entry.pbtxt @@ -0,0 +1,84 @@ +path: "tensorflow.ConfigProto.DeviceCountEntry" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "KEY_FIELD_NUMBER" + mtype: "" + } + member { + name: "VALUE_FIELD_NUMBER" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-config-proto.pbtxt b/tensorflow/tools/api/golden/tensorflow.-config-proto.pbtxt new file mode 100644 index 00000000000..da6af3919e9 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-config-proto.pbtxt @@ -0,0 +1,136 @@ +path: "tensorflow.ConfigProto" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "ALLOW_SOFT_PLACEMENT_FIELD_NUMBER" + mtype: "" + } + member { + name: "CLUSTER_DEF_FIELD_NUMBER" + mtype: "" + } + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "DEVICE_COUNT_FIELD_NUMBER" + mtype: "" + } + member { + name: "DEVICE_FILTERS_FIELD_NUMBER" + mtype: "" + } + member { + name: "DeviceCountEntry" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "GPU_OPTIONS_FIELD_NUMBER" + mtype: "" + } + member { + name: "GRAPH_OPTIONS_FIELD_NUMBER" + mtype: "" + } + member { + name: "INTER_OP_PARALLELISM_THREADS_FIELD_NUMBER" + mtype: "" + } + member { + name: "INTRA_OP_PARALLELISM_THREADS_FIELD_NUMBER" + mtype: "" + } + member { + name: "LOG_DEVICE_PLACEMENT_FIELD_NUMBER" + mtype: "" + } + member { + name: "OPERATION_TIMEOUT_IN_MS_FIELD_NUMBER" + mtype: "" + } + member { + name: "PLACEMENT_PERIOD_FIELD_NUMBER" + mtype: "" + } + member { + name: "RPC_OPTIONS_FIELD_NUMBER" + mtype: "" + } + member { + name: "SESSION_INTER_OP_THREAD_POOL_FIELD_NUMBER" + mtype: "" + } + member { + name: "USE_PER_SESSION_THREADS_FIELD_NUMBER" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + 
member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-d-type.pbtxt b/tensorflow/tools/api/golden/tensorflow.-d-type.pbtxt new file mode 100644 index 00000000000..0b5b88bba80 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-d-type.pbtxt @@ -0,0 +1,77 @@ +path: "tensorflow.DType" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "as_datatype_enum" + mtype: "" + } + member { + name: "as_numpy_dtype" + mtype: "" + } + member { + name: "base_dtype" + mtype: "" + } + member { + name: "is_bool" + mtype: "" + } + member { + name: "is_complex" + mtype: "" + } + member { + name: "is_floating" + mtype: "" + } + member { + name: "is_integer" + mtype: "" + } + member { + name: "is_numpy_compatible" + mtype: "" + } + member { + name: "is_quantized" + mtype: "" + } + member { + name: "is_unsigned" + mtype: "" + } + member { + name: "limits" + mtype: "" + } + member { + name: "max" + mtype: "" + } + member { + name: "min" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "real_dtype" + mtype: "" + } + member { + name: "size" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'type_enum\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "is_compatible_with" + argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-device-spec.pbtxt b/tensorflow/tools/api/golden/tensorflow.-device-spec.pbtxt new file mode 100644 index 00000000000..92e535c3414 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-device-spec.pbtxt @@ -0,0 +1,37 @@ +path: "tensorflow.DeviceSpec" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "job" + mtype: "" + } + member { + name: "replica" + mtype: "" + } + member { + name: "task" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'job\', \'replica\', \'task\', \'device_type\', \'device_index\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "from_string" + argspec: "args=[\'spec\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "merge_from" + argspec: "args=[\'self\', \'dev\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "parse_from_string" + argspec: "args=[\'self\', \'spec\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "to_string" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-dimension.pbtxt b/tensorflow/tools/api/golden/tensorflow.-dimension.pbtxt new file mode 100644 index 00000000000..a9ab27719b4 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-dimension.pbtxt @@ -0,0 +1,25 @@ +path: 
"tensorflow.Dimension" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "value" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "assert_is_compatible_with" + argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "is_compatible_with" + argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "merge_with" + argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-event.pbtxt b/tensorflow/tools/api/golden/tensorflow.-event.pbtxt new file mode 100644 index 00000000000..9bf8c124288 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-event.pbtxt @@ -0,0 +1,112 @@ +path: "tensorflow.Event" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "FILE_VERSION_FIELD_NUMBER" + mtype: "" + } + member { + name: "GRAPH_DEF_FIELD_NUMBER" + mtype: "" + } + member { + name: "LOG_MESSAGE_FIELD_NUMBER" + mtype: "" + } + member { + name: "META_GRAPH_DEF_FIELD_NUMBER" + mtype: "" + } + member { + name: "SESSION_LOG_FIELD_NUMBER" + mtype: "" + } + member { + name: "STEP_FIELD_NUMBER" + mtype: "" + } + member { + name: "SUMMARY_FIELD_NUMBER" + mtype: "" + } + member { + name: "TAGGED_RUN_METADATA_FIELD_NUMBER" + mtype: "" + } + member { + name: "WALL_TIME_FIELD_NUMBER" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-f-i-f-o-queue.pbtxt b/tensorflow/tools/api/golden/tensorflow.-f-i-f-o-queue.pbtxt new file mode 100644 index 00000000000..72cc5324476 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-f-i-f-o-queue.pbtxt @@ -0,0 +1,62 @@ +path: "tensorflow.FIFOQueue" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "dtypes" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "names" + mtype: "" + } + member { + name: "queue_ref" + mtype: "" + } + member { + name: "shapes" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'capacity\', \'dtypes\', \'shapes\', \'names\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'fifo_queue\'], " + } + member_method { + name: "close" + argspec: "args=[\'self\', \'cancel_pending_enqueues\', \'name\'], varargs=None, 
keywords=None, defaults=[\'False\', \'None\'], " + } + member_method { + name: "dequeue" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "dequeue_many" + argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "dequeue_up_to" + argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "enqueue" + argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "enqueue_many" + argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "from_list" + argspec: "args=[\'index\', \'queues\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "size" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-fixed-len-feature.pbtxt b/tensorflow/tools/api/golden/tensorflow.-fixed-len-feature.pbtxt new file mode 100644 index 00000000000..6933814a7b6 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-fixed-len-feature.pbtxt @@ -0,0 +1,27 @@ +path: "tensorflow.FixedLenFeature" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "default_value" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "shape" + mtype: "" + } + member_method { + name: "__init__" + } + member_method { + name: "count" + } + member_method { + name: "index" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-fixed-len-sequence-feature.pbtxt b/tensorflow/tools/api/golden/tensorflow.-fixed-len-sequence-feature.pbtxt new file mode 100644 index 00000000000..c5387879519 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-fixed-len-sequence-feature.pbtxt @@ -0,0 +1,31 @@ +path: "tensorflow.FixedLenSequenceFeature" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "allow_missing" + mtype: "" + } + member { + name: "default_value" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "shape" + mtype: "" + } + member_method { + name: "__init__" + } + member_method { + name: "count" + } + member_method { + name: "index" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-fixed-length-record-reader.pbtxt b/tensorflow/tools/api/golden/tensorflow.-fixed-length-record-reader.pbtxt new file mode 100644 index 00000000000..5c77b3dd5cc --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-fixed-length-record-reader.pbtxt @@ -0,0 +1,46 @@ +path: "tensorflow.FixedLengthRecordReader" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "reader_ref" + mtype: "" + } + member { + name: "supports_serialize" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'record_bytes\', \'header_bytes\', \'footer_bytes\', \'hop_bytes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "num_records_produced" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "num_work_units_completed" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "read" + argspec: "args=[\'self\', \'queue\', 
\'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "read_up_to" + argspec: "args=[\'self\', \'queue\', \'num_records\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "reset" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "restore_state" + argspec: "args=[\'self\', \'state\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "serialize_state" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-g-p-u-options.pbtxt b/tensorflow/tools/api/golden/tensorflow.-g-p-u-options.pbtxt new file mode 100644 index 00000000000..30f7e4e1165 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-g-p-u-options.pbtxt @@ -0,0 +1,108 @@ +path: "tensorflow.GPUOptions" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "ALLOCATOR_TYPE_FIELD_NUMBER" + mtype: "" + } + member { + name: "ALLOW_GROWTH_FIELD_NUMBER" + mtype: "" + } + member { + name: "DEFERRED_DELETION_BYTES_FIELD_NUMBER" + mtype: "" + } + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "FORCE_GPU_COMPATIBLE_FIELD_NUMBER" + mtype: "" + } + member { + name: "PER_PROCESS_GPU_MEMORY_FRACTION_FIELD_NUMBER" + mtype: "" + } + member { + name: "POLLING_ACTIVE_DELAY_USECS_FIELD_NUMBER" + mtype: "" + } + member { + name: "POLLING_INACTIVE_DELAY_MSECS_FIELD_NUMBER" + mtype: "" + } + member { + name: "VISIBLE_DEVICE_LIST_FIELD_NUMBER" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-graph-def.pbtxt b/tensorflow/tools/api/golden/tensorflow.-graph-def.pbtxt new file mode 100644 index 00000000000..1495e847cb0 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-graph-def.pbtxt @@ -0,0 +1,92 @@ +path: "tensorflow.GraphDef" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "LIBRARY_FIELD_NUMBER" + mtype: "" + } + member { + name: "NODE_FIELD_NUMBER" + mtype: "" + } + member { + name: "VERSIONS_FIELD_NUMBER" + mtype: "" + } + member { + name: "VERSION_FIELD_NUMBER" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method 
{ + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-graph-keys.pbtxt b/tensorflow/tools/api/golden/tensorflow.-graph-keys.pbtxt new file mode 100644 index 00000000000..ef2cfe3787e --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-graph-keys.pbtxt @@ -0,0 +1,136 @@ +path: "tensorflow.GraphKeys" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "ACTIVATIONS" + mtype: "" + } + member { + name: "ASSET_FILEPATHS" + mtype: "" + } + member { + name: "BIASES" + mtype: "" + } + member { + name: "CONCATENATED_VARIABLES" + mtype: "" + } + member { + name: "COND_CONTEXT" + mtype: "" + } + member { + name: "EVAL_STEP" + mtype: "" + } + member { + name: "GLOBAL_STEP" + mtype: "" + } + member { + name: "GLOBAL_VARIABLES" + mtype: "" + } + member { + name: "INIT_OP" + mtype: "" + } + member { + name: "LOCAL_INIT_OP" + mtype: "" + } + member { + name: "LOCAL_RESOURCES" + mtype: "" + } + member { + name: "LOCAL_VARIABLES" + mtype: "" + } + member { + name: "LOSSES" + mtype: "" + } + member { + name: "MODEL_VARIABLES" + mtype: "" + } + member { + name: "MOVING_AVERAGE_VARIABLES" + mtype: "" + } + member { + name: "QUEUE_RUNNERS" + mtype: "" + } + member { + name: "READY_FOR_LOCAL_INIT_OP" + mtype: "" + } + member { + name: "READY_OP" + mtype: "" + } + member { + name: "REGULARIZATION_LOSSES" + mtype: "" + } + member { + name: "RESOURCES" + mtype: "" + } + member { + name: "SAVEABLE_OBJECTS" + mtype: "" + } + member { + name: "SAVERS" + mtype: "" + } + member { + name: "SUMMARIES" + mtype: "" + } + member { + name: "SUMMARY_OP" + mtype: "" + } + member { + name: "TABLE_INITIALIZERS" + mtype: "" + } + member { + name: "TRAINABLE_RESOURCE_VARIABLES" + mtype: "" + } + member { + name: "TRAINABLE_VARIABLES" + mtype: "" + } + member { + name: "TRAIN_OP" + mtype: "" + } + member { + name: "UPDATE_OPS" + mtype: "" + } + member { + name: "VARIABLES" + mtype: "" + } + member { + name: "WEIGHTS" + mtype: "" + } + member { + name: "WHILE_CONTEXT" + mtype: "" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-graph-options.pbtxt b/tensorflow/tools/api/golden/tensorflow.-graph-options.pbtxt new file mode 100644 index 00000000000..0844f891cad --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-graph-options.pbtxt @@ -0,0 +1,112 @@ +path: "tensorflow.GraphOptions" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "BUILD_COST_MODEL_AFTER_FIELD_NUMBER" + mtype: "" + } + member { + name: "BUILD_COST_MODEL_FIELD_NUMBER" + mtype: "" + } + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "ENABLE_BFLOAT16_SENDRECV_FIELD_NUMBER" + mtype: "" + } + member { + name: "ENABLE_RECV_SCHEDULING_FIELD_NUMBER" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: 
"INFER_SHAPES_FIELD_NUMBER" + mtype: "" + } + member { + name: "OPTIMIZER_OPTIONS_FIELD_NUMBER" + mtype: "" + } + member { + name: "PLACE_PRUNED_GRAPH_FIELD_NUMBER" + mtype: "" + } + member { + name: "REWRITE_OPTIONS_FIELD_NUMBER" + mtype: "" + } + member { + name: "TIMELINE_STEP_FIELD_NUMBER" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-graph.pbtxt b/tensorflow/tools/api/golden/tensorflow.-graph.pbtxt new file mode 100644 index 00000000000..75361803a39 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-graph.pbtxt @@ -0,0 +1,137 @@ +path: "tensorflow.Graph" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "building_function" + mtype: "" + } + member { + name: "collections" + mtype: "" + } + member { + name: "finalized" + mtype: "" + } + member { + name: "graph_def_versions" + mtype: "" + } + member { + name: "seed" + mtype: "" + } + member { + name: "version" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "add_to_collection" + argspec: "args=[\'self\', \'name\', \'value\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "add_to_collections" + argspec: "args=[\'self\', \'names\', \'value\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "as_default" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "as_graph_def" + argspec: "args=[\'self\', \'from_version\', \'add_shapes\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], " + } + member_method { + name: "as_graph_element" + argspec: "args=[\'self\', \'obj\', \'allow_tensor\', \'allow_operation\'], varargs=None, keywords=None, defaults=[\'True\', \'True\'], " + } + member_method { + name: "clear_collection" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "colocate_with" + argspec: "args=[\'self\', \'op\', \'ignore_existing\'], varargs=None, keywords=None, defaults=[\'False\'], " + } + member_method { + name: "container" + argspec: "args=[\'self\', \'container_name\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "control_dependencies" + argspec: "args=[\'self\', \'control_inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "create_op" + argspec: "args=[\'self\', \'op_type\', \'inputs\', \'dtypes\', \'input_types\', \'name\', \'attrs\', \'op_def\', \'compute_shapes\', \'compute_device\'], 
varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'True\', \'True\'], " + } + member_method { + name: "device" + argspec: "args=[\'self\', \'device_name_or_function\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "finalize" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_all_collection_keys" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_collection" + argspec: "args=[\'self\', \'name\', \'scope\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "get_collection_ref" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_name_scope" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_operation_by_name" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_operations" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_tensor_by_name" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "gradient_override_map" + argspec: "args=[\'self\', \'op_type_map\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "is_feedable" + argspec: "args=[\'self\', \'tensor\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "is_fetchable" + argspec: "args=[\'self\', \'tensor_or_op\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "name_scope" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "prevent_feeding" + argspec: "args=[\'self\', \'tensor\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "prevent_fetching" + argspec: "args=[\'self\', \'op\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "unique_name" + argspec: "args=[\'self\', \'name\', \'mark_as_used\'], varargs=None, keywords=None, defaults=[\'True\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-histogram-proto.pbtxt b/tensorflow/tools/api/golden/tensorflow.-histogram-proto.pbtxt new file mode 100644 index 00000000000..2567d2fe602 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-histogram-proto.pbtxt @@ -0,0 +1,104 @@ +path: "tensorflow.HistogramProto" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "BUCKET_FIELD_NUMBER" + mtype: "" + } + member { + name: "BUCKET_LIMIT_FIELD_NUMBER" + mtype: "" + } + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "MAX_FIELD_NUMBER" + mtype: "" + } + member { + name: "MIN_FIELD_NUMBER" + mtype: "" + } + member { + name: "NUM_FIELD_NUMBER" + mtype: "" + } + member { + name: "SUM_FIELD_NUMBER" + mtype: "" + } + member { + name: "SUM_SQUARES_FIELD_NUMBER" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } 
+ member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-identity-reader.pbtxt b/tensorflow/tools/api/golden/tensorflow.-identity-reader.pbtxt new file mode 100644 index 00000000000..2eda320d636 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-identity-reader.pbtxt @@ -0,0 +1,46 @@ +path: "tensorflow.IdentityReader" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "reader_ref" + mtype: "" + } + member { + name: "supports_serialize" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "num_records_produced" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "num_work_units_completed" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "read" + argspec: "args=[\'self\', \'queue\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "read_up_to" + argspec: "args=[\'self\', \'queue\', \'num_records\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "reset" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "restore_state" + argspec: "args=[\'self\', \'state\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "serialize_state" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-indexed-slices.pbtxt b/tensorflow/tools/api/golden/tensorflow.-indexed-slices.pbtxt new file mode 100644 index 00000000000..fee84d85307 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-indexed-slices.pbtxt @@ -0,0 +1,42 @@ +path: "tensorflow.IndexedSlices" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "dense_shape" + mtype: "" + } + member { + name: "device" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "graph" + mtype: "" + } + member { + name: "indices" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "op" + mtype: "" + } + member { + name: "values" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'values\', \'indices\', \'dense_shape\'], varargs=None, keywords=None, defaults=[\'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-interactive-session.pbtxt b/tensorflow/tools/api/golden/tensorflow.-interactive-session.pbtxt new file mode 100644 index 00000000000..f5b0bae58d0 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-interactive-session.pbtxt @@ -0,0 +1,51 @@ +path: "tensorflow.InteractiveSession" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "graph" + mtype: "" + } + member { + name: 
"graph_def" + mtype: "" + } + member { + name: "sess_str" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'target\', \'graph\', \'config\'], varargs=None, keywords=None, defaults=[\'\', \'None\', \'None\'], " + } + member_method { + name: "as_default" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "close" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "list_devices" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "make_callable" + argspec: "args=[\'self\', \'fetches\', \'feed_list\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "partial_run" + argspec: "args=[\'self\', \'handle\', \'fetches\', \'feed_dict\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "partial_run_setup" + argspec: "args=[\'self\', \'fetches\', \'feeds\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "run" + argspec: "args=[\'self\', \'fetches\', \'feed_dict\', \'options\', \'run_metadata\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-log-message.pbtxt b/tensorflow/tools/api/golden/tensorflow.-log-message.pbtxt new file mode 100644 index 00000000000..a43c5eb7e30 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-log-message.pbtxt @@ -0,0 +1,112 @@ +path: "tensorflow.LogMessage" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "DEBUGGING" + mtype: "" + } + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "ERROR" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "FATAL" + mtype: "" + } + member { + name: "INFO" + mtype: "" + } + member { + name: "LEVEL_FIELD_NUMBER" + mtype: "" + } + member { + name: "Level" + mtype: "" + } + member { + name: "MESSAGE_FIELD_NUMBER" + mtype: "" + } + member { + name: "UNKNOWN" + mtype: "" + } + member { + name: "WARN" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-meta-graph-def.-collection-def-entry.pbtxt b/tensorflow/tools/api/golden/tensorflow.-meta-graph-def.-collection-def-entry.pbtxt new file mode 100644 index 00000000000..3572126fbfd --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-meta-graph-def.-collection-def-entry.pbtxt @@ -0,0 +1,84 @@ +path: "tensorflow.MetaGraphDef.CollectionDefEntry" +tf_class { + is_instance: "" + is_instance: 
"" + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "KEY_FIELD_NUMBER" + mtype: "" + } + member { + name: "VALUE_FIELD_NUMBER" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-meta-graph-def.-meta-info-def.pbtxt b/tensorflow/tools/api/golden/tensorflow.-meta-graph-def.-meta-info-def.pbtxt new file mode 100644 index 00000000000..ebf49f434ae --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-meta-graph-def.-meta-info-def.pbtxt @@ -0,0 +1,100 @@ +path: "tensorflow.MetaGraphDef.MetaInfoDef" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "ANY_INFO_FIELD_NUMBER" + mtype: "" + } + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "META_GRAPH_VERSION_FIELD_NUMBER" + mtype: "" + } + member { + name: "STRIPPED_OP_LIST_FIELD_NUMBER" + mtype: "" + } + member { + name: "TAGS_FIELD_NUMBER" + mtype: "" + } + member { + name: "TENSORFLOW_GIT_VERSION_FIELD_NUMBER" + mtype: "" + } + member { + name: "TENSORFLOW_VERSION_FIELD_NUMBER" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-meta-graph-def.-signature-def-entry.pbtxt b/tensorflow/tools/api/golden/tensorflow.-meta-graph-def.-signature-def-entry.pbtxt new file mode 100644 index 00000000000..48fccac99d6 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-meta-graph-def.-signature-def-entry.pbtxt @@ -0,0 +1,84 @@ +path: "tensorflow.MetaGraphDef.SignatureDefEntry" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "DESCRIPTOR" + mtype: "" + } + 
member { + name: "Extensions" + mtype: "" + } + member { + name: "KEY_FIELD_NUMBER" + mtype: "" + } + member { + name: "VALUE_FIELD_NUMBER" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-meta-graph-def.pbtxt b/tensorflow/tools/api/golden/tensorflow.-meta-graph-def.pbtxt new file mode 100644 index 00000000000..3e683a87159 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-meta-graph-def.pbtxt @@ -0,0 +1,112 @@ +path: "tensorflow.MetaGraphDef" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "ASSET_FILE_DEF_FIELD_NUMBER" + mtype: "" + } + member { + name: "COLLECTION_DEF_FIELD_NUMBER" + mtype: "" + } + member { + name: "CollectionDefEntry" + mtype: "" + } + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "GRAPH_DEF_FIELD_NUMBER" + mtype: "" + } + member { + name: "META_INFO_DEF_FIELD_NUMBER" + mtype: "" + } + member { + name: "MetaInfoDef" + mtype: "" + } + member { + name: "SAVER_DEF_FIELD_NUMBER" + mtype: "" + } + member { + name: "SIGNATURE_DEF_FIELD_NUMBER" + mtype: "" + } + member { + name: "SignatureDefEntry" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-name-attr-list.-attr-entry.pbtxt b/tensorflow/tools/api/golden/tensorflow.-name-attr-list.-attr-entry.pbtxt new file mode 100644 index 00000000000..2750bd780ca --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-name-attr-list.-attr-entry.pbtxt @@ -0,0 +1,84 @@ +path: "tensorflow.NameAttrList.AttrEntry" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "DESCRIPTOR" + mtype: "" + } + 
member { + name: "Extensions" + mtype: "" + } + member { + name: "KEY_FIELD_NUMBER" + mtype: "" + } + member { + name: "VALUE_FIELD_NUMBER" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-name-attr-list.pbtxt b/tensorflow/tools/api/golden/tensorflow.-name-attr-list.pbtxt new file mode 100644 index 00000000000..d10faf67d02 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-name-attr-list.pbtxt @@ -0,0 +1,88 @@ +path: "tensorflow.NameAttrList" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "ATTR_FIELD_NUMBER" + mtype: "" + } + member { + name: "AttrEntry" + mtype: "" + } + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "NAME_FIELD_NUMBER" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-node-def.-attr-entry.pbtxt b/tensorflow/tools/api/golden/tensorflow.-node-def.-attr-entry.pbtxt new file mode 100644 index 00000000000..b1b62d60f1e --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-node-def.-attr-entry.pbtxt @@ -0,0 +1,84 @@ +path: "tensorflow.NodeDef.AttrEntry" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "KEY_FIELD_NUMBER" + mtype: "" + } + member { + name: "VALUE_FIELD_NUMBER" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: 
"DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-node-def.pbtxt b/tensorflow/tools/api/golden/tensorflow.-node-def.pbtxt new file mode 100644 index 00000000000..b812b4df2b3 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-node-def.pbtxt @@ -0,0 +1,100 @@ +path: "tensorflow.NodeDef" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "ATTR_FIELD_NUMBER" + mtype: "" + } + member { + name: "AttrEntry" + mtype: "" + } + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "DEVICE_FIELD_NUMBER" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "INPUT_FIELD_NUMBER" + mtype: "" + } + member { + name: "NAME_FIELD_NUMBER" + mtype: "" + } + member { + name: "OP_FIELD_NUMBER" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-op-error.pbtxt b/tensorflow/tools/api/golden/tensorflow.-op-error.pbtxt new file mode 100644 index 00000000000..7e59615534f --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-op-error.pbtxt @@ -0,0 +1,29 @@ +path: "tensorflow.OpError" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "args" + mtype: "" + } + member { + name: "error_code" + mtype: "" + } + member { + name: "message" + mtype: "" + } + member { + name: "node_def" + mtype: "" + } + member { + name: "op" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'node_def\', \'op\', \'message\', \'error_code\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-operation.pbtxt b/tensorflow/tools/api/golden/tensorflow.-operation.pbtxt new file mode 100644 index 00000000000..64240f70698 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-operation.pbtxt @@ -0,0 +1,69 @@ +path: "tensorflow.Operation" +tf_class { + is_instance: "" + is_instance: "" + member { + 
name: "control_inputs" + mtype: "" + } + member { + name: "device" + mtype: "" + } + member { + name: "graph" + mtype: "" + } + member { + name: "inputs" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "node_def" + mtype: "" + } + member { + name: "op_def" + mtype: "" + } + member { + name: "outputs" + mtype: "" + } + member { + name: "traceback" + mtype: "" + } + member { + name: "traceback_with_start_lines" + mtype: "" + } + member { + name: "type" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'node_def\', \'g\', \'inputs\', \'output_types\', \'control_inputs\', \'input_types\', \'original_op\', \'op_def\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "colocation_groups" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_attr" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "run" + argspec: "args=[\'self\', \'feed_dict\', \'session\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "values" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-optimizer-options.pbtxt b/tensorflow/tools/api/golden/tensorflow.-optimizer-options.pbtxt new file mode 100644 index 00000000000..5dd1ee47c96 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-optimizer-options.pbtxt @@ -0,0 +1,128 @@ +path: "tensorflow.OptimizerOptions" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "DEFAULT" + mtype: "" + } + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "DO_COMMON_SUBEXPRESSION_ELIMINATION_FIELD_NUMBER" + mtype: "" + } + member { + name: "DO_CONSTANT_FOLDING_FIELD_NUMBER" + mtype: "" + } + member { + name: "DO_FUNCTION_INLINING_FIELD_NUMBER" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "GLOBAL_JIT_LEVEL_FIELD_NUMBER" + mtype: "" + } + member { + name: "GlobalJitLevel" + mtype: "" + } + member { + name: "L0" + mtype: "" + } + member { + name: "L1" + mtype: "" + } + member { + name: "Level" + mtype: "" + } + member { + name: "OFF" + mtype: "" + } + member { + name: "ON_1" + mtype: "" + } + member { + name: "ON_2" + mtype: "" + } + member { + name: "OPT_LEVEL_FIELD_NUMBER" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-padding-f-i-f-o-queue.pbtxt 
b/tensorflow/tools/api/golden/tensorflow.-padding-f-i-f-o-queue.pbtxt new file mode 100644 index 00000000000..1bfe723ce75 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-padding-f-i-f-o-queue.pbtxt @@ -0,0 +1,62 @@ +path: "tensorflow.PaddingFIFOQueue" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "dtypes" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "names" + mtype: "" + } + member { + name: "queue_ref" + mtype: "" + } + member { + name: "shapes" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'capacity\', \'dtypes\', \'shapes\', \'names\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'padding_fifo_queue\'], " + } + member_method { + name: "close" + argspec: "args=[\'self\', \'cancel_pending_enqueues\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], " + } + member_method { + name: "dequeue" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "dequeue_many" + argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "dequeue_up_to" + argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "enqueue" + argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "enqueue_many" + argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "from_list" + argspec: "args=[\'index\', \'queues\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "size" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-priority-queue.pbtxt b/tensorflow/tools/api/golden/tensorflow.-priority-queue.pbtxt new file mode 100644 index 00000000000..dbe25f3a5b9 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-priority-queue.pbtxt @@ -0,0 +1,62 @@ +path: "tensorflow.PriorityQueue" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "dtypes" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "names" + mtype: "" + } + member { + name: "queue_ref" + mtype: "" + } + member { + name: "shapes" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'capacity\', \'types\', \'shapes\', \'names\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'priority_queue\'], " + } + member_method { + name: "close" + argspec: "args=[\'self\', \'cancel_pending_enqueues\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], " + } + member_method { + name: "dequeue" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "dequeue_many" + argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "dequeue_up_to" + argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "enqueue" + argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "enqueue_many" + argspec: 
"args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "from_list" + argspec: "args=[\'index\', \'queues\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "size" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-queue-base.pbtxt b/tensorflow/tools/api/golden/tensorflow.-queue-base.pbtxt new file mode 100644 index 00000000000..9263d73a511 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-queue-base.pbtxt @@ -0,0 +1,61 @@ +path: "tensorflow.QueueBase" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "dtypes" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "names" + mtype: "" + } + member { + name: "queue_ref" + mtype: "" + } + member { + name: "shapes" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'dtypes\', \'shapes\', \'names\', \'queue_ref\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "close" + argspec: "args=[\'self\', \'cancel_pending_enqueues\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], " + } + member_method { + name: "dequeue" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "dequeue_many" + argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "dequeue_up_to" + argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "enqueue" + argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "enqueue_many" + argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "from_list" + argspec: "args=[\'index\', \'queues\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "size" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-random-shuffle-queue.pbtxt b/tensorflow/tools/api/golden/tensorflow.-random-shuffle-queue.pbtxt new file mode 100644 index 00000000000..ec783ffe5a0 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-random-shuffle-queue.pbtxt @@ -0,0 +1,62 @@ +path: "tensorflow.RandomShuffleQueue" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "dtypes" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "names" + mtype: "" + } + member { + name: "queue_ref" + mtype: "" + } + member { + name: "shapes" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'capacity\', \'min_after_dequeue\', \'dtypes\', \'shapes\', \'names\', \'seed\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'random_shuffle_queue\'], " + } + member_method { + name: "close" + argspec: "args=[\'self\', \'cancel_pending_enqueues\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], " + } + member_method { + name: "dequeue" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "dequeue_many" + argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, 
keywords=None, defaults=[\'None\'], " + } + member_method { + name: "dequeue_up_to" + argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "enqueue" + argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "enqueue_many" + argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "from_list" + argspec: "args=[\'index\', \'queues\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "size" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-reader-base.pbtxt b/tensorflow/tools/api/golden/tensorflow.-reader-base.pbtxt new file mode 100644 index 00000000000..f6a3ce76a15 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-reader-base.pbtxt @@ -0,0 +1,45 @@ +path: "tensorflow.ReaderBase" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "reader_ref" + mtype: "" + } + member { + name: "supports_serialize" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'reader_ref\', \'supports_serialize\'], varargs=None, keywords=None, defaults=[\'False\'], " + } + member_method { + name: "num_records_produced" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "num_work_units_completed" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "read" + argspec: "args=[\'self\', \'queue\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "read_up_to" + argspec: "args=[\'self\', \'queue\', \'num_records\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "reset" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "restore_state" + argspec: "args=[\'self\', \'state\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "serialize_state" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-register-gradient.pbtxt b/tensorflow/tools/api/golden/tensorflow.-register-gradient.pbtxt new file mode 100644 index 00000000000..4d6e4137d12 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-register-gradient.pbtxt @@ -0,0 +1,9 @@ +path: "tensorflow.RegisterGradient" +tf_class { + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'op_type\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-run-metadata.pbtxt b/tensorflow/tools/api/golden/tensorflow.-run-metadata.pbtxt new file mode 100644 index 00000000000..808fa0fa217 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-run-metadata.pbtxt @@ -0,0 +1,88 @@ +path: "tensorflow.RunMetadata" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "COST_GRAPH_FIELD_NUMBER" + mtype: "" + } + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "PARTITION_GRAPHS_FIELD_NUMBER" + mtype: "" + } + member { + name: "STEP_STATS_FIELD_NUMBER" + mtype: "" + } + member_method { + name: 
"ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-run-options.pbtxt b/tensorflow/tools/api/golden/tensorflow.-run-options.pbtxt new file mode 100644 index 00000000000..5ad6804a78c --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-run-options.pbtxt @@ -0,0 +1,116 @@ +path: "tensorflow.RunOptions" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "DEBUG_OPTIONS_FIELD_NUMBER" + mtype: "" + } + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "FULL_TRACE" + mtype: "" + } + member { + name: "HARDWARE_TRACE" + mtype: "" + } + member { + name: "INTER_OP_THREAD_POOL_FIELD_NUMBER" + mtype: "" + } + member { + name: "NO_TRACE" + mtype: "" + } + member { + name: "OUTPUT_PARTITION_GRAPHS_FIELD_NUMBER" + mtype: "" + } + member { + name: "SOFTWARE_TRACE" + mtype: "" + } + member { + name: "TIMEOUT_IN_MS_FIELD_NUMBER" + mtype: "" + } + member { + name: "TRACE_LEVEL_FIELD_NUMBER" + mtype: "" + } + member { + name: "TraceLevel" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-session-log.pbtxt b/tensorflow/tools/api/golden/tensorflow.-session-log.pbtxt new file mode 100644 index 00000000000..ec66d7f3354 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-session-log.pbtxt @@ -0,0 +1,108 @@ +path: "tensorflow.SessionLog" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "CHECKPOINT" + mtype: "" + } + member { + name: "CHECKPOINT_PATH_FIELD_NUMBER" + mtype: "" + } + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "MSG_FIELD_NUMBER" + mtype: "" 
+ } + member { + name: "START" + mtype: "" + } + member { + name: "STATUS_FIELD_NUMBER" + mtype: "" + } + member { + name: "STATUS_UNSPECIFIED" + mtype: "" + } + member { + name: "STOP" + mtype: "" + } + member { + name: "SessionStatus" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-session.pbtxt b/tensorflow/tools/api/golden/tensorflow.-session.pbtxt new file mode 100644 index 00000000000..173cd1963e5 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-session.pbtxt @@ -0,0 +1,55 @@ +path: "tensorflow.Session" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "graph" + mtype: "" + } + member { + name: "graph_def" + mtype: "" + } + member { + name: "sess_str" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'target\', \'graph\', \'config\'], varargs=None, keywords=None, defaults=[\'\', \'None\', \'None\'], " + } + member_method { + name: "as_default" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "close" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "list_devices" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "make_callable" + argspec: "args=[\'self\', \'fetches\', \'feed_list\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "partial_run" + argspec: "args=[\'self\', \'handle\', \'fetches\', \'feed_dict\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "partial_run_setup" + argspec: "args=[\'self\', \'fetches\', \'feeds\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "reset" + argspec: "args=[\'target\', \'containers\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "run" + argspec: "args=[\'self\', \'fetches\', \'feed_dict\', \'options\', \'run_metadata\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-sparse-conditional-accumulator.pbtxt b/tensorflow/tools/api/golden/tensorflow.-sparse-conditional-accumulator.pbtxt new file mode 100644 index 00000000000..2260279ad2b --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-sparse-conditional-accumulator.pbtxt @@ -0,0 +1,46 @@ +path: "tensorflow.SparseConditionalAccumulator" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: 
"accumulator_ref" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'dtype\', \'shape\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'sparse_conditional_accumulator\'], " + } + member_method { + name: "apply_grad" + argspec: "args=[\'self\', \'grad_indices\', \'grad_values\', \'grad_shape\', \'local_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], " + } + member_method { + name: "apply_indexed_slices_grad" + argspec: "args=[\'self\', \'grad\', \'local_step\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\'], " + } + member_method { + name: "num_accumulated" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "set_global_step" + argspec: "args=[\'self\', \'new_global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "take_grad" + argspec: "args=[\'self\', \'num_required\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "take_indexed_slices_grad" + argspec: "args=[\'self\', \'num_required\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-sparse-feature.pbtxt b/tensorflow/tools/api/golden/tensorflow.-sparse-feature.pbtxt new file mode 100644 index 00000000000..d875394fb5d --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-sparse-feature.pbtxt @@ -0,0 +1,35 @@ +path: "tensorflow.SparseFeature" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "already_sorted" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "index_key" + mtype: "" + } + member { + name: "size" + mtype: "" + } + member { + name: "value_key" + mtype: "" + } + member_method { + name: "__init__" + } + member_method { + name: "count" + } + member_method { + name: "index" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-sparse-tensor-value.pbtxt b/tensorflow/tools/api/golden/tensorflow.-sparse-tensor-value.pbtxt new file mode 100644 index 00000000000..d33fd4d5d7b --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-sparse-tensor-value.pbtxt @@ -0,0 +1,26 @@ +path: "tensorflow.SparseTensorValue" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "dense_shape" + mtype: "" + } + member { + name: "indices" + mtype: "" + } + member { + name: "values" + mtype: "" + } + member_method { + name: "__init__" + } + member_method { + name: "count" + } + member_method { + name: "index" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-sparse-tensor.pbtxt b/tensorflow/tools/api/golden/tensorflow.-sparse-tensor.pbtxt new file mode 100644 index 00000000000..eac236d4982 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-sparse-tensor.pbtxt @@ -0,0 +1,46 @@ +path: "tensorflow.SparseTensor" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "dense_shape" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "graph" + mtype: "" + } + member { + name: "indices" + mtype: "" + } + member { + name: "op" + mtype: "" + } + member { + name: "values" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'indices\', \'values\', \'dense_shape\'], varargs=None, keywords=None, defaults=None" + } + 
member_method { + name: "eval" + argspec: "args=[\'self\', \'feed_dict\', \'session\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "from_value" + argspec: "args=[\'cls\', \'sparse_tensor_value\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_shape" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-summary.-audio.pbtxt b/tensorflow/tools/api/golden/tensorflow.-summary.-audio.pbtxt new file mode 100644 index 00000000000..781010d75e2 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-summary.-audio.pbtxt @@ -0,0 +1,96 @@ +path: "tensorflow.Summary.Audio" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "CONTENT_TYPE_FIELD_NUMBER" + mtype: "" + } + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "ENCODED_AUDIO_STRING_FIELD_NUMBER" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "LENGTH_FRAMES_FIELD_NUMBER" + mtype: "" + } + member { + name: "NUM_CHANNELS_FIELD_NUMBER" + mtype: "" + } + member { + name: "SAMPLE_RATE_FIELD_NUMBER" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-summary.-image.pbtxt b/tensorflow/tools/api/golden/tensorflow.-summary.-image.pbtxt new file mode 100644 index 00000000000..feb9c7ee927 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-summary.-image.pbtxt @@ -0,0 +1,92 @@ +path: "tensorflow.Summary.Image" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "COLORSPACE_FIELD_NUMBER" + mtype: "" + } + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "ENCODED_IMAGE_STRING_FIELD_NUMBER" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "HEIGHT_FIELD_NUMBER" + mtype: "" + } + member { + name: "WIDTH_FIELD_NUMBER" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + 
member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-summary.-value.pbtxt b/tensorflow/tools/api/golden/tensorflow.-summary.-value.pbtxt new file mode 100644 index 00000000000..ffb4f45fc5e --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-summary.-value.pbtxt @@ -0,0 +1,112 @@ +path: "tensorflow.Summary.Value" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "AUDIO_FIELD_NUMBER" + mtype: "" + } + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "HISTO_FIELD_NUMBER" + mtype: "" + } + member { + name: "IMAGE_FIELD_NUMBER" + mtype: "" + } + member { + name: "METADATA_FIELD_NUMBER" + mtype: "" + } + member { + name: "NODE_NAME_FIELD_NUMBER" + mtype: "" + } + member { + name: "OBSOLETE_OLD_STYLE_HISTOGRAM_FIELD_NUMBER" + mtype: "" + } + member { + name: "SIMPLE_VALUE_FIELD_NUMBER" + mtype: "" + } + member { + name: "TAG_FIELD_NUMBER" + mtype: "" + } + member { + name: "TENSOR_FIELD_NUMBER" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-summary.pbtxt b/tensorflow/tools/api/golden/tensorflow.-summary.pbtxt new file mode 100644 index 00000000000..38de17fa9e5 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-summary.pbtxt @@ -0,0 +1,92 @@ +path: "tensorflow.Summary" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "Audio" + mtype: "" + } + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "Image" + mtype: "" + } + member { + name: "VALUE_FIELD_NUMBER" + mtype: "" + } + member { + name: "Value" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } 
+ member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-t-f-record-reader.pbtxt b/tensorflow/tools/api/golden/tensorflow.-t-f-record-reader.pbtxt new file mode 100644 index 00000000000..cdf79373919 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-t-f-record-reader.pbtxt @@ -0,0 +1,46 @@ +path: "tensorflow.TFRecordReader" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "reader_ref" + mtype: "" + } + member { + name: "supports_serialize" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'name\', \'options\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "num_records_produced" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "num_work_units_completed" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "read" + argspec: "args=[\'self\', \'queue\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "read_up_to" + argspec: "args=[\'self\', \'queue\', \'num_records\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "reset" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "restore_state" + argspec: "args=[\'self\', \'state\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "serialize_state" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-tensor-array.pbtxt b/tensorflow/tools/api/golden/tensorflow.-tensor-array.pbtxt new file mode 100644 index 00000000000..ed088c41ed3 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-tensor-array.pbtxt @@ -0,0 +1,69 @@ +path: "tensorflow.TensorArray" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "dtype" + mtype: "" + } + member { + name: "flow" + mtype: "" + } + member { + name: "handle" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'dtype\', \'size\', \'dynamic_size\', \'clear_after_read\', \'tensor_array_name\', \'handle\', \'flow\', \'infer_shape\', \'element_shape\', \'colocate_with_first_write_call\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\', \'None\', \'True\', \'None\'], " + } + member_method { + name: "close" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "concat" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "gather" + argspec: "args=[\'self\', \'indices\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "grad" + argspec: "args=[\'self\', \'source\', \'flow\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "identity" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "read" 
+ argspec: "args=[\'self\', \'index\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "scatter" + argspec: "args=[\'self\', \'indices\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "size" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "split" + argspec: "args=[\'self\', \'value\', \'lengths\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "stack" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "unstack" + argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "write" + argspec: "args=[\'self\', \'index\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-tensor-info.-coo-sparse.pbtxt b/tensorflow/tools/api/golden/tensorflow.-tensor-info.-coo-sparse.pbtxt new file mode 100644 index 00000000000..425c35e0674 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-tensor-info.-coo-sparse.pbtxt @@ -0,0 +1,88 @@ +path: "tensorflow.TensorInfo.CooSparse" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "DENSE_SHAPE_TENSOR_NAME_FIELD_NUMBER" + mtype: "" + } + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "INDICES_TENSOR_NAME_FIELD_NUMBER" + mtype: "" + } + member { + name: "VALUES_TENSOR_NAME_FIELD_NUMBER" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-tensor-info.pbtxt b/tensorflow/tools/api/golden/tensorflow.-tensor-info.pbtxt new file mode 100644 index 00000000000..41ea393be51 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-tensor-info.pbtxt @@ -0,0 +1,96 @@ +path: "tensorflow.TensorInfo" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "COO_SPARSE_FIELD_NUMBER" + mtype: "" + } + member { + name: "CooSparse" + mtype: "" + } + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "DTYPE_FIELD_NUMBER" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "NAME_FIELD_NUMBER" + mtype: "" + } + member { + name: "TENSOR_SHAPE_FIELD_NUMBER" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + 
member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-tensor-shape.pbtxt b/tensorflow/tools/api/golden/tensorflow.-tensor-shape.pbtxt new file mode 100644 index 00000000000..d5b9cb8f5ed --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-tensor-shape.pbtxt @@ -0,0 +1,73 @@ +path: "tensorflow.TensorShape" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "dims" + mtype: "" + } + member { + name: "ndims" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'dims\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "as_list" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "as_proto" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "assert_has_rank" + argspec: "args=[\'self\', \'rank\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "assert_is_compatible_with" + argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "assert_is_fully_defined" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "assert_same_rank" + argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "concatenate" + argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "is_compatible_with" + argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "is_fully_defined" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "merge_with" + argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "num_elements" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "with_rank" + argspec: "args=[\'self\', \'rank\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "with_rank_at_least" + argspec: "args=[\'self\', \'rank\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "with_rank_at_most" + argspec: "args=[\'self\', \'rank\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-tensor.pbtxt b/tensorflow/tools/api/golden/tensorflow.-tensor.pbtxt new file mode 100644 index 00000000000..38d19bb5374 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-tensor.pbtxt @@ -0,0 +1,58 @@ +path: "tensorflow.Tensor" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "OVERLOADABLE_OPERATORS" + 
mtype: "" + } + member { + name: "device" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "graph" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "op" + mtype: "" + } + member { + name: "shape" + mtype: "" + } + member { + name: "value_index" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'op\', \'value_index\', \'dtype\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "consumers" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "eval" + argspec: "args=[\'self\', \'feed_dict\', \'session\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "get_shape" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_shape" + argspec: "args=[\'self\', \'shape\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-text-line-reader.pbtxt b/tensorflow/tools/api/golden/tensorflow.-text-line-reader.pbtxt new file mode 100644 index 00000000000..e9779f07620 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-text-line-reader.pbtxt @@ -0,0 +1,46 @@ +path: "tensorflow.TextLineReader" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "reader_ref" + mtype: "" + } + member { + name: "supports_serialize" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'skip_header_lines\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "num_records_produced" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "num_work_units_completed" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "read" + argspec: "args=[\'self\', \'queue\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "read_up_to" + argspec: "args=[\'self\', \'queue\', \'num_records\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "reset" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "restore_state" + argspec: "args=[\'self\', \'state\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "serialize_state" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-var-len-feature.pbtxt b/tensorflow/tools/api/golden/tensorflow.-var-len-feature.pbtxt new file mode 100644 index 00000000000..54b66f43f8e --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-var-len-feature.pbtxt @@ -0,0 +1,19 @@ +path: "tensorflow.VarLenFeature" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "dtype" + mtype: "" + } + member_method { + name: "__init__" + } + member_method { + name: "count" + } + member_method { + name: "index" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-variable-scope.pbtxt b/tensorflow/tools/api/golden/tensorflow.-variable-scope.pbtxt new file mode 100644 index 00000000000..c9b2dfd6772 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-variable-scope.pbtxt @@ -0,0 +1,97 @@ +path: "tensorflow.VariableScope" +tf_class { + 
is_instance: "" + is_instance: "" + member { + name: "caching_device" + mtype: "" + } + member { + name: "custom_getter" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "initializer" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "original_name_scope" + mtype: "" + } + member { + name: "partitioner" + mtype: "" + } + member { + name: "regularizer" + mtype: "" + } + member { + name: "reuse" + mtype: "" + } + member { + name: "use_resource" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'reuse\', \'name\', \'initializer\', \'regularizer\', \'caching_device\', \'partitioner\', \'custom_getter\', \'name_scope\', \'dtype\', \'use_resource\'], varargs=None, keywords=None, defaults=[\'\', \'None\', \'None\', \'None\', \'None\', \'None\', \'\', \"\", \'None\'], " + } + member_method { + name: "get_collection" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_variable" + argspec: "args=[\'self\', \'var_store\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'reuse\', \'trainable\', \'collections\', \'caching_device\', \'partitioner\', \'validate_shape\', \'use_resource\', \'custom_getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], " + } + member_method { + name: "global_variables" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "reuse_variables" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_caching_device" + argspec: "args=[\'self\', \'caching_device\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_custom_getter" + argspec: "args=[\'self\', \'custom_getter\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_dtype" + argspec: "args=[\'self\', \'dtype\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_initializer" + argspec: "args=[\'self\', \'initializer\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_partitioner" + argspec: "args=[\'self\', \'partitioner\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_regularizer" + argspec: "args=[\'self\', \'regularizer\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_use_resource" + argspec: "args=[\'self\', \'use_resource\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "trainable_variables" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-variable.-save-slice-info.pbtxt b/tensorflow/tools/api/golden/tensorflow.-variable.-save-slice-info.pbtxt new file mode 100644 index 00000000000..ac3ccd468b2 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-variable.-save-slice-info.pbtxt @@ -0,0 +1,17 @@ +path: "tensorflow.Variable.SaveSliceInfo" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "spec" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'full_name\', \'full_shape\', \'var_offset\', \'var_shape\', \'save_slice_info_def\', \'import_scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "to_proto" + 
argspec: "args=[\'self\', \'export_scope\'], varargs=None, keywords=None, defaults=[\'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-variable.pbtxt b/tensorflow/tools/api/golden/tensorflow.-variable.pbtxt new file mode 100644 index 00000000000..d67a2713f7a --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-variable.pbtxt @@ -0,0 +1,101 @@ +path: "tensorflow.Variable" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "SaveSliceInfo" + mtype: "" + } + member { + name: "device" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "graph" + mtype: "" + } + member { + name: "initial_value" + mtype: "" + } + member { + name: "initializer" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "op" + mtype: "" + } + member { + name: "shape" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'initial_value\', \'trainable\', \'collections\', \'validate_shape\', \'caching_device\', \'name\', \'variable_def\', \'dtype\', \'expected_shape\', \'import_scope\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "assign" + argspec: "args=[\'self\', \'value\', \'use_locking\'], varargs=None, keywords=None, defaults=[\'False\'], " + } + member_method { + name: "assign_add" + argspec: "args=[\'self\', \'delta\', \'use_locking\'], varargs=None, keywords=None, defaults=[\'False\'], " + } + member_method { + name: "assign_sub" + argspec: "args=[\'self\', \'delta\', \'use_locking\'], varargs=None, keywords=None, defaults=[\'False\'], " + } + member_method { + name: "count_up_to" + argspec: "args=[\'self\', \'limit\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "eval" + argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "from_proto" + argspec: "args=[\'variable_def\', \'import_scope\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "get_shape" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "initialized_value" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "load" + argspec: "args=[\'self\', \'value\', \'session\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "read_value" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "scatter_sub" + argspec: "args=[\'self\', \'sparse_delta\', \'use_locking\'], varargs=None, keywords=None, defaults=[\'False\'], " + } + member_method { + name: "set_shape" + argspec: "args=[\'self\', \'shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "to_proto" + argspec: "args=[\'self\', \'export_scope\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "value" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.-whole-file-reader.pbtxt b/tensorflow/tools/api/golden/tensorflow.-whole-file-reader.pbtxt new file mode 100644 index 00000000000..4ac759891c6 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.-whole-file-reader.pbtxt @@ -0,0 +1,46 @@ +path: "tensorflow.WholeFileReader" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: 
"reader_ref" + mtype: "" + } + member { + name: "supports_serialize" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "num_records_produced" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "num_work_units_completed" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "read" + argspec: "args=[\'self\', \'queue\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "read_up_to" + argspec: "args=[\'self\', \'queue\', \'num_records\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "reset" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "restore_state" + argspec: "args=[\'self\', \'state\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "serialize_state" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.app.pbtxt b/tensorflow/tools/api/golden/tensorflow.app.pbtxt new file mode 100644 index 00000000000..85044a89879 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.app.pbtxt @@ -0,0 +1,11 @@ +path: "tensorflow.app" +tf_module { + member { + name: "flags" + mtype: "" + } + member_method { + name: "run" + argspec: "args=[\'main\', \'argv\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.compat.pbtxt b/tensorflow/tools/api/golden/tensorflow.compat.pbtxt new file mode 100644 index 00000000000..ccc60314001 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.compat.pbtxt @@ -0,0 +1,35 @@ +path: "tensorflow.compat" +tf_module { + member { + name: "bytes_or_text_types" + mtype: "" + } + member { + name: "complex_types" + mtype: "" + } + member { + name: "integral_types" + mtype: "" + } + member { + name: "real_types" + mtype: "" + } + member_method { + name: "as_bytes" + argspec: "args=[\'bytes_or_text\', \'encoding\'], varargs=None, keywords=None, defaults=[\'utf-8\'], " + } + member_method { + name: "as_str" + argspec: "args=[\'bytes_or_text\', \'encoding\'], varargs=None, keywords=None, defaults=[\'utf-8\'], " + } + member_method { + name: "as_str_any" + argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "as_text" + argspec: "args=[\'bytes_or_text\', \'encoding\'], varargs=None, keywords=None, defaults=[\'utf-8\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.constant_initializer.pbtxt b/tensorflow/tools/api/golden/tensorflow.constant_initializer.pbtxt new file mode 100644 index 00000000000..00ec669b168 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.constant_initializer.pbtxt @@ -0,0 +1,18 @@ +path: "tensorflow.constant_initializer" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'value\', \'dtype\', \'verify_shape\'], varargs=None, keywords=None, defaults=[\'0\', \"\", \'False\'], " + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], 
varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-aborted-error.pbtxt b/tensorflow/tools/api/golden/tensorflow.errors.-aborted-error.pbtxt new file mode 100644 index 00000000000..ea9186b0b9d --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.errors.-aborted-error.pbtxt @@ -0,0 +1,30 @@ +path: "tensorflow.errors.AbortedError" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "args" + mtype: "" + } + member { + name: "error_code" + mtype: "" + } + member { + name: "message" + mtype: "" + } + member { + name: "node_def" + mtype: "" + } + member { + name: "op" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'node_def\', \'op\', \'message\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-already-exists-error.pbtxt b/tensorflow/tools/api/golden/tensorflow.errors.-already-exists-error.pbtxt new file mode 100644 index 00000000000..4e155081dd2 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.errors.-already-exists-error.pbtxt @@ -0,0 +1,30 @@ +path: "tensorflow.errors.AlreadyExistsError" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "args" + mtype: "" + } + member { + name: "error_code" + mtype: "" + } + member { + name: "message" + mtype: "" + } + member { + name: "node_def" + mtype: "" + } + member { + name: "op" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'node_def\', \'op\', \'message\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-cancelled-error.pbtxt b/tensorflow/tools/api/golden/tensorflow.errors.-cancelled-error.pbtxt new file mode 100644 index 00000000000..b02a0e023aa --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.errors.-cancelled-error.pbtxt @@ -0,0 +1,30 @@ +path: "tensorflow.errors.CancelledError" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "args" + mtype: "" + } + member { + name: "error_code" + mtype: "" + } + member { + name: "message" + mtype: "" + } + member { + name: "node_def" + mtype: "" + } + member { + name: "op" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'node_def\', \'op\', \'message\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-data-loss-error.pbtxt b/tensorflow/tools/api/golden/tensorflow.errors.-data-loss-error.pbtxt new file mode 100644 index 00000000000..c1fa66342a7 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.errors.-data-loss-error.pbtxt @@ -0,0 +1,30 @@ +path: "tensorflow.errors.DataLossError" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "args" + mtype: "" + } + member { + name: "error_code" + mtype: "" + } + member { + name: "message" + mtype: "" + } + member { + name: "node_def" + mtype: "" + } + member { + name: "op" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'node_def\', \'op\', \'message\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-deadline-exceeded-error.pbtxt b/tensorflow/tools/api/golden/tensorflow.errors.-deadline-exceeded-error.pbtxt new file mode 100644 index 00000000000..8e037936191 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.errors.-deadline-exceeded-error.pbtxt @@ -0,0 
+1,30 @@ +path: "tensorflow.errors.DeadlineExceededError" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "args" + mtype: "" + } + member { + name: "error_code" + mtype: "" + } + member { + name: "message" + mtype: "" + } + member { + name: "node_def" + mtype: "" + } + member { + name: "op" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'node_def\', \'op\', \'message\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-failed-precondition-error.pbtxt b/tensorflow/tools/api/golden/tensorflow.errors.-failed-precondition-error.pbtxt new file mode 100644 index 00000000000..384d4b534c6 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.errors.-failed-precondition-error.pbtxt @@ -0,0 +1,30 @@ +path: "tensorflow.errors.FailedPreconditionError" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "args" + mtype: "" + } + member { + name: "error_code" + mtype: "" + } + member { + name: "message" + mtype: "" + } + member { + name: "node_def" + mtype: "" + } + member { + name: "op" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'node_def\', \'op\', \'message\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-internal-error.pbtxt b/tensorflow/tools/api/golden/tensorflow.errors.-internal-error.pbtxt new file mode 100644 index 00000000000..ac5c4d7879b --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.errors.-internal-error.pbtxt @@ -0,0 +1,30 @@ +path: "tensorflow.errors.InternalError" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "args" + mtype: "" + } + member { + name: "error_code" + mtype: "" + } + member { + name: "message" + mtype: "" + } + member { + name: "node_def" + mtype: "" + } + member { + name: "op" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'node_def\', \'op\', \'message\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-invalid-argument-error.pbtxt b/tensorflow/tools/api/golden/tensorflow.errors.-invalid-argument-error.pbtxt new file mode 100644 index 00000000000..161edd4a7c5 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.errors.-invalid-argument-error.pbtxt @@ -0,0 +1,30 @@ +path: "tensorflow.errors.InvalidArgumentError" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "args" + mtype: "" + } + member { + name: "error_code" + mtype: "" + } + member { + name: "message" + mtype: "" + } + member { + name: "node_def" + mtype: "" + } + member { + name: "op" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'node_def\', \'op\', \'message\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-not-found-error.pbtxt b/tensorflow/tools/api/golden/tensorflow.errors.-not-found-error.pbtxt new file mode 100644 index 00000000000..1e64730ac6d --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.errors.-not-found-error.pbtxt @@ -0,0 +1,30 @@ +path: "tensorflow.errors.NotFoundError" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "args" + mtype: "" + } + member { + name: "error_code" + mtype: "" + } + member { + name: "message" + mtype: "" + } + member { + name: "node_def" + mtype: "" + } + member { + name: 
"op" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'node_def\', \'op\', \'message\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-op-error.pbtxt b/tensorflow/tools/api/golden/tensorflow.errors.-op-error.pbtxt new file mode 100644 index 00000000000..b1f14c0457d --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.errors.-op-error.pbtxt @@ -0,0 +1,29 @@ +path: "tensorflow.errors.OpError" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "args" + mtype: "" + } + member { + name: "error_code" + mtype: "" + } + member { + name: "message" + mtype: "" + } + member { + name: "node_def" + mtype: "" + } + member { + name: "op" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'node_def\', \'op\', \'message\', \'error_code\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-out-of-range-error.pbtxt b/tensorflow/tools/api/golden/tensorflow.errors.-out-of-range-error.pbtxt new file mode 100644 index 00000000000..6365e472868 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.errors.-out-of-range-error.pbtxt @@ -0,0 +1,30 @@ +path: "tensorflow.errors.OutOfRangeError" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "args" + mtype: "" + } + member { + name: "error_code" + mtype: "" + } + member { + name: "message" + mtype: "" + } + member { + name: "node_def" + mtype: "" + } + member { + name: "op" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'node_def\', \'op\', \'message\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-permission-denied-error.pbtxt b/tensorflow/tools/api/golden/tensorflow.errors.-permission-denied-error.pbtxt new file mode 100644 index 00000000000..dc8a66f9ead --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.errors.-permission-denied-error.pbtxt @@ -0,0 +1,30 @@ +path: "tensorflow.errors.PermissionDeniedError" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "args" + mtype: "" + } + member { + name: "error_code" + mtype: "" + } + member { + name: "message" + mtype: "" + } + member { + name: "node_def" + mtype: "" + } + member { + name: "op" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'node_def\', \'op\', \'message\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-resource-exhausted-error.pbtxt b/tensorflow/tools/api/golden/tensorflow.errors.-resource-exhausted-error.pbtxt new file mode 100644 index 00000000000..85bb384b469 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.errors.-resource-exhausted-error.pbtxt @@ -0,0 +1,30 @@ +path: "tensorflow.errors.ResourceExhaustedError" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "args" + mtype: "" + } + member { + name: "error_code" + mtype: "" + } + member { + name: "message" + mtype: "" + } + member { + name: "node_def" + mtype: "" + } + member { + name: "op" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'node_def\', \'op\', \'message\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-unauthenticated-error.pbtxt b/tensorflow/tools/api/golden/tensorflow.errors.-unauthenticated-error.pbtxt 
new file mode 100644 index 00000000000..d57d7ac2f20 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.errors.-unauthenticated-error.pbtxt @@ -0,0 +1,30 @@ +path: "tensorflow.errors.UnauthenticatedError" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "args" + mtype: "" + } + member { + name: "error_code" + mtype: "" + } + member { + name: "message" + mtype: "" + } + member { + name: "node_def" + mtype: "" + } + member { + name: "op" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'node_def\', \'op\', \'message\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-unavailable-error.pbtxt b/tensorflow/tools/api/golden/tensorflow.errors.-unavailable-error.pbtxt new file mode 100644 index 00000000000..cc33e6ed8d1 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.errors.-unavailable-error.pbtxt @@ -0,0 +1,30 @@ +path: "tensorflow.errors.UnavailableError" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "args" + mtype: "" + } + member { + name: "error_code" + mtype: "" + } + member { + name: "message" + mtype: "" + } + member { + name: "node_def" + mtype: "" + } + member { + name: "op" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'node_def\', \'op\', \'message\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-unimplemented-error.pbtxt b/tensorflow/tools/api/golden/tensorflow.errors.-unimplemented-error.pbtxt new file mode 100644 index 00000000000..b8c2e22dbd7 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.errors.-unimplemented-error.pbtxt @@ -0,0 +1,30 @@ +path: "tensorflow.errors.UnimplementedError" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "args" + mtype: "" + } + member { + name: "error_code" + mtype: "" + } + member { + name: "message" + mtype: "" + } + member { + name: "node_def" + mtype: "" + } + member { + name: "op" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'node_def\', \'op\', \'message\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-unknown-error.pbtxt b/tensorflow/tools/api/golden/tensorflow.errors.-unknown-error.pbtxt new file mode 100644 index 00000000000..8ffcfae95b8 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.errors.-unknown-error.pbtxt @@ -0,0 +1,30 @@ +path: "tensorflow.errors.UnknownError" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "args" + mtype: "" + } + member { + name: "error_code" + mtype: "" + } + member { + name: "message" + mtype: "" + } + member { + name: "node_def" + mtype: "" + } + member { + name: "op" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'node_def\', \'op\', \'message\', \'error_code\'], varargs=None, keywords=None, defaults=[\'2\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.errors.pbtxt b/tensorflow/tools/api/golden/tensorflow.errors.pbtxt new file mode 100644 index 00000000000..0ad1c19603b --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.errors.pbtxt @@ -0,0 +1,151 @@ +path: "tensorflow.errors" +tf_module { + member { + name: "ABORTED" + mtype: "" + } + member { + name: "ALREADY_EXISTS" + mtype: "" + } + member { + name: "AbortedError" + mtype: "" + } + member { + name: "AlreadyExistsError" + 
mtype: "" + } + member { + name: "CANCELLED" + mtype: "" + } + member { + name: "CancelledError" + mtype: "" + } + member { + name: "DATA_LOSS" + mtype: "" + } + member { + name: "DEADLINE_EXCEEDED" + mtype: "" + } + member { + name: "DataLossError" + mtype: "" + } + member { + name: "DeadlineExceededError" + mtype: "" + } + member { + name: "FAILED_PRECONDITION" + mtype: "" + } + member { + name: "FailedPreconditionError" + mtype: "" + } + member { + name: "INTERNAL" + mtype: "" + } + member { + name: "INVALID_ARGUMENT" + mtype: "" + } + member { + name: "InternalError" + mtype: "" + } + member { + name: "InvalidArgumentError" + mtype: "" + } + member { + name: "NOT_FOUND" + mtype: "" + } + member { + name: "NotFoundError" + mtype: "" + } + member { + name: "OK" + mtype: "" + } + member { + name: "OUT_OF_RANGE" + mtype: "" + } + member { + name: "OpError" + mtype: "" + } + member { + name: "OutOfRangeError" + mtype: "" + } + member { + name: "PERMISSION_DENIED" + mtype: "" + } + member { + name: "PermissionDeniedError" + mtype: "" + } + member { + name: "RESOURCE_EXHAUSTED" + mtype: "" + } + member { + name: "ResourceExhaustedError" + mtype: "" + } + member { + name: "UNAUTHENTICATED" + mtype: "" + } + member { + name: "UNAVAILABLE" + mtype: "" + } + member { + name: "UNIMPLEMENTED" + mtype: "" + } + member { + name: "UNKNOWN" + mtype: "" + } + member { + name: "UnauthenticatedError" + mtype: "" + } + member { + name: "UnavailableError" + mtype: "" + } + member { + name: "UnimplementedError" + mtype: "" + } + member { + name: "UnknownError" + mtype: "" + } + member_method { + name: "error_code_from_exception_type" + argspec: "args=[\'cls\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "exception_type_from_error_code" + argspec: "args=[\'error_code\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "raise_exception_on_not_ok_status" + argspec: "args=[], varargs=args, keywords=kwds, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-estimator-spec.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-estimator-spec.pbtxt new file mode 100644 index 00000000000..5dbfe217264 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.estimator.-estimator-spec.pbtxt @@ -0,0 +1,47 @@ +path: "tensorflow.estimator.EstimatorSpec" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "eval_metric_ops" + mtype: "" + } + member { + name: "export_outputs" + mtype: "" + } + member { + name: "loss" + mtype: "" + } + member { + name: "predictions" + mtype: "" + } + member { + name: "scaffold" + mtype: "" + } + member { + name: "train_op" + mtype: "" + } + member { + name: "training_chief_hooks" + mtype: "" + } + member { + name: "training_hooks" + mtype: "" + } + member_method { + name: "__init__" + } + member_method { + name: "count" + } + member_method { + name: "index" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-estimator.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-estimator.pbtxt new file mode 100644 index 00000000000..7a769fd546c --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.estimator.-estimator.pbtxt @@ -0,0 +1,37 @@ +path: "tensorflow.estimator.Estimator" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "config" + mtype: "" + } + member { + name: "model_dir" + mtype: "" + } + member { + name: "params" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'model_fn\', 
\'model_dir\', \'config\', \'params\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + } + member_method { + name: "evaluate" + argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "export_savedmodel" + argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], " + } + member_method { + name: "predict" + argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + } + member_method { + name: "train" + argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-mode-keys.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-mode-keys.pbtxt new file mode 100644 index 00000000000..6a1c24fa63f --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.estimator.-mode-keys.pbtxt @@ -0,0 +1,20 @@ +path: "tensorflow.estimator.ModeKeys" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "EVAL" + mtype: "" + } + member { + name: "PREDICT" + mtype: "" + } + member { + name: "TRAIN" + mtype: "" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt new file mode 100644 index 00000000000..d69c475a313 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt @@ -0,0 +1,77 @@ +path: "tensorflow.estimator.RunConfig" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "cluster_spec" + mtype: "" + } + member { + name: "evaluation_master" + mtype: "" + } + member { + name: "is_chief" + mtype: "" + } + member { + name: "keep_checkpoint_every_n_hours" + mtype: "" + } + member { + name: "keep_checkpoint_max" + mtype: "" + } + member { + name: "master" + mtype: "" + } + member { + name: "model_dir" + mtype: "" + } + member { + name: "num_ps_replicas" + mtype: "" + } + member { + name: "num_worker_replicas" + mtype: "" + } + member { + name: "save_checkpoints_secs" + mtype: "" + } + member { + name: "save_checkpoints_steps" + mtype: "" + } + member { + name: "save_summary_steps" + mtype: "" + } + member { + name: "session_config" + mtype: "" + } + member { + name: "task_id" + mtype: "" + } + member { + name: "task_type" + mtype: "" + } + member { + name: "tf_random_seed" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "replace" + argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.-classification-output.__metaclass__.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.export.-classification-output.__metaclass__.pbtxt new file mode 100644 index 00000000000..3cf7af8da95 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.estimator.export.-classification-output.__metaclass__.pbtxt @@ -0,0 +1,14 @@ +path: "tensorflow.estimator.export.ClassificationOutput.__metaclass__" +tf_class { + is_instance: "" + member_method { + name: 
"__init__" + } + member_method { + name: "mro" + } + member_method { + name: "register" + argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.-classification-output.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.export.-classification-output.pbtxt new file mode 100644 index 00000000000..2df1840c4a4 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.estimator.export.-classification-output.pbtxt @@ -0,0 +1,22 @@ +path: "tensorflow.estimator.export.ClassificationOutput" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "classes" + mtype: "" + } + member { + name: "scores" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'scores\', \'classes\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "as_signature_def" + argspec: "args=[\'self\', \'receiver_tensors\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.-export-output.__metaclass__.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.export.-export-output.__metaclass__.pbtxt new file mode 100644 index 00000000000..5d165ccbf91 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.estimator.export.-export-output.__metaclass__.pbtxt @@ -0,0 +1,14 @@ +path: "tensorflow.estimator.export.ExportOutput.__metaclass__" +tf_class { + is_instance: "" + member_method { + name: "__init__" + } + member_method { + name: "mro" + } + member_method { + name: "register" + argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.-export-output.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.export.-export-output.pbtxt new file mode 100644 index 00000000000..fa62e8ced80 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.estimator.export.-export-output.pbtxt @@ -0,0 +1,12 @@ +path: "tensorflow.estimator.export.ExportOutput" +tf_class { + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + } + member_method { + name: "as_signature_def" + argspec: "args=[\'self\', \'receiver_tensors\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.-predict-output.__metaclass__.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.export.-predict-output.__metaclass__.pbtxt new file mode 100644 index 00000000000..743495ba98c --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.estimator.export.-predict-output.__metaclass__.pbtxt @@ -0,0 +1,14 @@ +path: "tensorflow.estimator.export.PredictOutput.__metaclass__" +tf_class { + is_instance: "" + member_method { + name: "__init__" + } + member_method { + name: "mro" + } + member_method { + name: "register" + argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.-predict-output.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.export.-predict-output.pbtxt new file mode 100644 index 00000000000..e0160b10ce1 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.estimator.export.-predict-output.pbtxt @@ -0,0 +1,18 @@ +path: "tensorflow.estimator.export.PredictOutput" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "outputs" + mtype: "" + } + member_method 
{ + name: "__init__" + argspec: "args=[\'self\', \'outputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "as_signature_def" + argspec: "args=[\'self\', \'receiver_tensors\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.-regression-output.__metaclass__.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.export.-regression-output.__metaclass__.pbtxt new file mode 100644 index 00000000000..dbf4e3dec85 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.estimator.export.-regression-output.__metaclass__.pbtxt @@ -0,0 +1,14 @@ +path: "tensorflow.estimator.export.RegressionOutput.__metaclass__" +tf_class { + is_instance: "" + member_method { + name: "__init__" + } + member_method { + name: "mro" + } + member_method { + name: "register" + argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.-regression-output.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.export.-regression-output.pbtxt new file mode 100644 index 00000000000..905f0e05535 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.estimator.export.-regression-output.pbtxt @@ -0,0 +1,18 @@ +path: "tensorflow.estimator.export.RegressionOutput" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "value" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "as_signature_def" + argspec: "args=[\'self\', \'receiver_tensors\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.-serving-input-receiver.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.export.-serving-input-receiver.pbtxt new file mode 100644 index 00000000000..0d9e0443088 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.estimator.export.-serving-input-receiver.pbtxt @@ -0,0 +1,23 @@ +path: "tensorflow.estimator.export.ServingInputReceiver" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "features" + mtype: "" + } + member { + name: "receiver_tensors" + mtype: "" + } + member_method { + name: "__init__" + } + member_method { + name: "count" + } + member_method { + name: "index" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.export.pbtxt new file mode 100644 index 00000000000..4d0dddb3bc0 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.estimator.export.pbtxt @@ -0,0 +1,31 @@ +path: "tensorflow.estimator.export" +tf_module { + member { + name: "ClassificationOutput" + mtype: "" + } + member { + name: "ExportOutput" + mtype: "" + } + member { + name: "PredictOutput" + mtype: "" + } + member { + name: "RegressionOutput" + mtype: "" + } + member { + name: "ServingInputReceiver" + mtype: "" + } + member_method { + name: "build_parsing_serving_input_receiver_fn" + argspec: "args=[\'feature_spec\', \'default_batch_size\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "build_raw_serving_input_receiver_fn" + argspec: "args=[\'features\', \'default_batch_size\'], varargs=None, keywords=None, defaults=[\'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.inputs.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.inputs.pbtxt 
new file mode 100644 index 00000000000..b318fea1f82 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.estimator.inputs.pbtxt @@ -0,0 +1,11 @@ +path: "tensorflow.estimator.inputs" +tf_module { + member_method { + name: "numpy_input_fn" + argspec: "args=[\'x\', \'y\', \'batch_size\', \'num_epochs\', \'shuffle\', \'queue_capacity\', \'num_threads\'], varargs=None, keywords=None, defaults=[\'None\', \'128\', \'1\', \'None\', \'1000\', \'1\'], " + } + member_method { + name: "pandas_input_fn" + argspec: "args=[\'x\', \'y\', \'batch_size\', \'num_epochs\', \'shuffle\', \'queue_capacity\', \'num_threads\', \'target_column\'], varargs=None, keywords=None, defaults=[\'None\', \'128\', \'1\', \'None\', \'1000\', \'1\', \'target\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.pbtxt new file mode 100644 index 00000000000..0d5dc73271d --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.estimator.pbtxt @@ -0,0 +1,27 @@ +path: "tensorflow.estimator" +tf_module { + member { + name: "Estimator" + mtype: "" + } + member { + name: "EstimatorSpec" + mtype: "" + } + member { + name: "ModeKeys" + mtype: "" + } + member { + name: "RunConfig" + mtype: "" + } + member { + name: "export" + mtype: "" + } + member { + name: "inputs" + mtype: "" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.feature_column.pbtxt b/tensorflow/tools/api/golden/tensorflow.feature_column.pbtxt new file mode 100644 index 00000000000..4c633a850f8 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.feature_column.pbtxt @@ -0,0 +1,55 @@ +path: "tensorflow.feature_column" +tf_module { + member_method { + name: "bucketized_column" + argspec: "args=[\'source_column\', \'boundaries\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "categorical_column_with_hash_bucket" + argspec: "args=[\'key\', \'hash_bucket_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\"\"], " + } + member_method { + name: "categorical_column_with_identity" + argspec: "args=[\'key\', \'num_buckets\', \'default_value\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "categorical_column_with_vocabulary_file" + argspec: "args=[\'key\', \'vocabulary_file\', \'vocabulary_size\', \'num_oov_buckets\', \'default_value\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0\', \'None\', \"\"], " + } + member_method { + name: "categorical_column_with_vocabulary_list" + argspec: "args=[\'key\', \'vocabulary_list\', \'dtype\', \'default_value\'], varargs=None, keywords=None, defaults=[\'None\', \'-1\'], " + } + member_method { + name: "crossed_column" + argspec: "args=[\'keys\', \'hash_bucket_size\', \'hash_key\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "embedding_column" + argspec: "args=[\'categorical_column\', \'dimension\', \'combiner\', \'initializer\', \'ckpt_to_load_from\', \'tensor_name_in_ckpt\', \'max_norm\', \'trainable\'], varargs=None, keywords=None, defaults=[\'mean\', \'None\', \'None\', \'None\', \'None\', \'True\'], " + } + member_method { + name: "indicator_column" + argspec: "args=[\'categorical_column\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "input_layer" + argspec: "args=[\'features\', \'feature_columns\', \'weight_collections\', \'trainable\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " + } + member_method { + name: "linear_model" + argspec: "args=[\'features\', 
\'feature_columns\', \'units\', \'sparse_combiner\', \'weight_collections\', \'trainable\'], varargs=None, keywords=None, defaults=[\'1\', \'sum\', \'None\', \'True\'], " + } + member_method { + name: "make_parse_example_spec" + argspec: "args=[\'feature_columns\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "numeric_column" + argspec: "args=[\'key\', \'shape\', \'default_value\', \'dtype\', \'normalizer_fn\'], varargs=None, keywords=None, defaults=[\'(1,)\', \'None\', \"\", \'None\'], " + } + member_method { + name: "weighted_categorical_column" + argspec: "args=[\'categorical_column\', \'weight_feature_key\', \'dtype\'], varargs=None, keywords=None, defaults=[\"\"], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.gfile.-fast-g-file.pbtxt b/tensorflow/tools/api/golden/tensorflow.gfile.-fast-g-file.pbtxt new file mode 100644 index 00000000000..eecfaffd0a6 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.gfile.-fast-g-file.pbtxt @@ -0,0 +1,58 @@ +path: "tensorflow.gfile.FastGFile" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "mode" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'name\', \'mode\'], varargs=None, keywords=None, defaults=[\'r\'], " + } + member_method { + name: "close" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "flush" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "next" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "read" + argspec: "args=[\'self\', \'n\'], varargs=None, keywords=None, defaults=[\'-1\'], " + } + member_method { + name: "readline" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "readlines" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "seek" + argspec: "args=[\'self\', \'offset\', \'whence\', \'position\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], " + } + member_method { + name: "size" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "tell" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "write" + argspec: "args=[\'self\', \'file_content\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.gfile.-g-file.pbtxt b/tensorflow/tools/api/golden/tensorflow.gfile.-g-file.pbtxt new file mode 100644 index 00000000000..305251059d9 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.gfile.-g-file.pbtxt @@ -0,0 +1,58 @@ +path: "tensorflow.gfile.GFile" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "mode" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'name\', \'mode\'], varargs=None, keywords=None, defaults=[\'r\'], " + } + member_method { + name: "close" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "flush" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "next" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "read" + argspec: "args=[\'self\', \'n\'], 
varargs=None, keywords=None, defaults=[\'-1\'], " + } + member_method { + name: "readline" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "readlines" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "seek" + argspec: "args=[\'self\', \'offset\', \'whence\', \'position\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], " + } + member_method { + name: "size" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "tell" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "write" + argspec: "args=[\'self\', \'file_content\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.gfile.-open.pbtxt b/tensorflow/tools/api/golden/tensorflow.gfile.-open.pbtxt new file mode 100644 index 00000000000..6e8894180a4 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.gfile.-open.pbtxt @@ -0,0 +1,58 @@ +path: "tensorflow.gfile.Open" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "mode" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'name\', \'mode\'], varargs=None, keywords=None, defaults=[\'r\'], " + } + member_method { + name: "close" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "flush" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "next" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "read" + argspec: "args=[\'self\', \'n\'], varargs=None, keywords=None, defaults=[\'-1\'], " + } + member_method { + name: "readline" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "readlines" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "seek" + argspec: "args=[\'self\', \'offset\', \'whence\', \'position\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], " + } + member_method { + name: "size" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "tell" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "write" + argspec: "args=[\'self\', \'file_content\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.gfile.pbtxt b/tensorflow/tools/api/golden/tensorflow.gfile.pbtxt new file mode 100644 index 00000000000..65b55a8b7c4 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.gfile.pbtxt @@ -0,0 +1,63 @@ +path: "tensorflow.gfile" +tf_module { + member { + name: "FastGFile" + mtype: "" + } + member { + name: "GFile" + mtype: "" + } + member { + name: "Open" + mtype: "" + } + member_method { + name: "Copy" + argspec: "args=[\'oldpath\', \'newpath\', \'overwrite\'], varargs=None, keywords=None, defaults=[\'False\'], " + } + member_method { + name: "DeleteRecursively" + argspec: "args=[\'dirname\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "Exists" + argspec: "args=[\'filename\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "Glob" + argspec: "args=[\'filename\'], varargs=None, keywords=None, defaults=None" + } + 
member_method { + name: "IsDirectory" + argspec: "args=[\'dirname\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "ListDirectory" + argspec: "args=[\'dirname\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "MakeDirs" + argspec: "args=[\'dirname\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "MkDir" + argspec: "args=[\'dirname\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "Remove" + argspec: "args=[\'filename\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "Rename" + argspec: "args=[\'oldname\', \'newname\', \'overwrite\'], varargs=None, keywords=None, defaults=[\'False\'], " + } + member_method { + name: "Stat" + argspec: "args=[\'filename\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "Walk" + argspec: "args=[\'top\', \'in_order\'], varargs=None, keywords=None, defaults=[\'True\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.graph_util.pbtxt b/tensorflow/tools/api/golden/tensorflow.graph_util.pbtxt new file mode 100644 index 00000000000..eeabf845dca --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.graph_util.pbtxt @@ -0,0 +1,23 @@ +path: "tensorflow.graph_util" +tf_module { + member_method { + name: "convert_variables_to_constants" + argspec: "args=[\'sess\', \'input_graph_def\', \'output_node_names\', \'variable_names_whitelist\', \'variable_names_blacklist\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "extract_sub_graph" + argspec: "args=[\'graph_def\', \'dest_nodes\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "must_run_on_cpu" + argspec: "args=[\'node\', \'pin_variables_on_cpu\'], varargs=None, keywords=None, defaults=[\'False\'], " + } + member_method { + name: "remove_training_nodes" + argspec: "args=[\'input_graph\', \'protected_nodes\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "tensor_shape_from_node_def_name" + argspec: "args=[\'graph\', \'input_name\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.image.-resize-method.pbtxt b/tensorflow/tools/api/golden/tensorflow.image.-resize-method.pbtxt new file mode 100644 index 00000000000..dbc360b13ee --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.image.-resize-method.pbtxt @@ -0,0 +1,24 @@ +path: "tensorflow.image.ResizeMethod" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "AREA" + mtype: "" + } + member { + name: "BICUBIC" + mtype: "" + } + member { + name: "BILINEAR" + mtype: "" + } + member { + name: "NEAREST_NEIGHBOR" + mtype: "" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.image.pbtxt b/tensorflow/tools/api/golden/tensorflow.image.pbtxt new file mode 100644 index 00000000000..93257c84a1f --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.image.pbtxt @@ -0,0 +1,183 @@ +path: "tensorflow.image" +tf_module { + member { + name: "ResizeMethod" + mtype: "" + } + member_method { + name: "adjust_brightness" + argspec: "args=[\'image\', \'delta\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "adjust_contrast" + argspec: "args=[\'images\', \'contrast_factor\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "adjust_gamma" + argspec: "args=[\'image\', \'gamma\', \'gain\'], varargs=None, keywords=None, 
defaults=[\'1\', \'1\'], " + } + member_method { + name: "adjust_hue" + argspec: "args=[\'image\', \'delta\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "adjust_saturation" + argspec: "args=[\'image\', \'saturation_factor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "central_crop" + argspec: "args=[\'image\', \'central_fraction\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "convert_image_dtype" + argspec: "args=[\'image\', \'dtype\', \'saturate\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], " + } + member_method { + name: "crop_and_resize" + argspec: "args=[\'image\', \'boxes\', \'box_ind\', \'crop_size\', \'method\', \'extrapolation_value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + } + member_method { + name: "crop_to_bounding_box" + argspec: "args=[\'image\', \'offset_height\', \'offset_width\', \'target_height\', \'target_width\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "decode_bmp" + argspec: "args=[\'contents\', \'channels\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "decode_gif" + argspec: "args=[\'contents\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "decode_image" + argspec: "args=[\'contents\', \'channels\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "decode_jpeg" + argspec: "args=[\'contents\', \'channels\', \'ratio\', \'fancy_upscaling\', \'try_recover_truncated\', \'acceptable_fraction\', \'dct_method\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "decode_png" + argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + } + member_method { + name: "draw_bounding_boxes" + argspec: "args=[\'images\', \'boxes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "encode_jpeg" + argspec: "args=[\'image\', \'format\', \'quality\', \'progressive\', \'optimize_size\', \'chroma_downsampling\', \'density_unit\', \'x_density\', \'y_density\', \'xmp_metadata\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "encode_png" + argspec: "args=[\'image\', \'compression\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "extract_glimpse" + argspec: "args=[\'input\', \'size\', \'offsets\', \'centered\', \'normalized\', \'uniform_noise\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "flip_left_right" + argspec: "args=[\'image\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "flip_up_down" + argspec: "args=[\'image\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "grayscale_to_rgb" + argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "hsv_to_rgb" + argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: 
"non_max_suppression" + argspec: "args=[\'boxes\', \'scores\', \'max_output_size\', \'iou_threshold\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "non_max_suppression_v2" + argspec: "args=[\'boxes\', \'scores\', \'max_output_size\', \'iou_threshold\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "pad_to_bounding_box" + argspec: "args=[\'image\', \'offset_height\', \'offset_width\', \'target_height\', \'target_width\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "per_image_standardization" + argspec: "args=[\'image\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "random_brightness" + argspec: "args=[\'image\', \'max_delta\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "random_contrast" + argspec: "args=[\'image\', \'lower\', \'upper\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "random_flip_left_right" + argspec: "args=[\'image\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "random_flip_up_down" + argspec: "args=[\'image\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "random_hue" + argspec: "args=[\'image\', \'max_delta\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "random_saturation" + argspec: "args=[\'image\', \'lower\', \'upper\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "resize_area" + argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "resize_bicubic" + argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "resize_bilinear" + argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "resize_image_with_crop_or_pad" + argspec: "args=[\'image\', \'target_height\', \'target_width\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "resize_images" + argspec: "args=[\'images\', \'size\', \'method\', \'align_corners\'], varargs=None, keywords=None, defaults=[\'0\', \'False\'], " + } + member_method { + name: "resize_nearest_neighbor" + argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "rgb_to_grayscale" + argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "rgb_to_hsv" + argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "rot90" + argspec: "args=[\'image\', \'k\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], " + } + member_method { + name: "sample_distorted_bounding_box" + argspec: "args=[\'image_size\', \'bounding_boxes\', \'seed\', \'seed2\', \'min_object_covered\', \'aspect_ratio_range\', \'area_range\', \'max_attempts\', \'use_image_if_no_bounding_boxes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: 
"total_variation" + argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "transpose_image" + argspec: "args=[\'image\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.layers.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.pbtxt new file mode 100644 index 00000000000..418ca3ea466 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.layers.pbtxt @@ -0,0 +1,63 @@ +path: "tensorflow.layers" +tf_module { + member_method { + name: "average_pooling1d" + argspec: "args=[\'inputs\', \'pool_size\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'valid\', \'channels_last\', \'None\'], " + } + member_method { + name: "average_pooling2d" + argspec: "args=[\'inputs\', \'pool_size\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'valid\', \'channels_last\', \'None\'], " + } + member_method { + name: "average_pooling3d" + argspec: "args=[\'inputs\', \'pool_size\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'valid\', \'channels_last\', \'None\'], " + } + member_method { + name: "batch_normalization" + argspec: "args=[\'inputs\', \'axis\', \'momentum\', \'epsilon\', \'center\', \'scale\', \'beta_initializer\', \'gamma_initializer\', \'moving_mean_initializer\', \'moving_variance_initializer\', \'beta_regularizer\', \'gamma_regularizer\', \'training\', \'trainable\', \'name\', \'reuse\', \'renorm\', \'renorm_clipping\', \'renorm_momentum\', \'fused\'], varargs=None, keywords=None, defaults=[\'-1\', \'0.99\', \'0.001\', \'True\', \'True\', \'\', \'\', \'\', \'\', \'None\', \'None\', \'False\', \'True\', \'None\', \'None\', \'False\', \'None\', \'0.99\', \'False\'], " + } + member_method { + name: "conv1d" + argspec: "args=[\'inputs\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'trainable\', \'name\', \'reuse\'], varargs=None, keywords=None, defaults=[\'1\', \'valid\', \'channels_last\', \'1\', \'None\', \'True\', \'None\', \'\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], " + } + member_method { + name: "conv2d" + argspec: "args=[\'inputs\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'trainable\', \'name\', \'reuse\'], varargs=None, keywords=None, defaults=[\'(1, 1)\', \'valid\', \'channels_last\', \'(1, 1)\', \'None\', \'True\', \'None\', \'\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], " + } + member_method { + name: "conv2d_transpose" + argspec: "args=[\'inputs\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'trainable\', \'name\', \'reuse\'], varargs=None, keywords=None, defaults=[\'(1, 1)\', \'valid\', \'channels_last\', \'None\', \'True\', \'None\', \'\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], " + } + member_method { + name: "conv3d" + argspec: "args=[\'inputs\', \'filters\', \'kernel_size\', \'strides\', 
\'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'trainable\', \'name\', \'reuse\'], varargs=None, keywords=None, defaults=[\'(1, 1, 1)\', \'valid\', \'channels_last\', \'(1, 1, 1)\', \'None\', \'True\', \'None\', \'\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], " + } + member_method { + name: "conv3d_transpose" + argspec: "args=[\'inputs\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'trainable\', \'name\', \'reuse\'], varargs=None, keywords=None, defaults=[\'(1, 1, 1)\', \'valid\', \'channels_last\', \'None\', \'True\', \'None\', \'\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], " + } + member_method { + name: "dense" + argspec: "args=[\'inputs\', \'units\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'trainable\', \'name\', \'reuse\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'None\', \'\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], " + } + member_method { + name: "dropout" + argspec: "args=[\'inputs\', \'rate\', \'noise_shape\', \'seed\', \'training\', \'name\'], varargs=None, keywords=None, defaults=[\'0.5\', \'None\', \'None\', \'False\', \'None\'], " + } + member_method { + name: "max_pooling1d" + argspec: "args=[\'inputs\', \'pool_size\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'valid\', \'channels_last\', \'None\'], " + } + member_method { + name: "max_pooling2d" + argspec: "args=[\'inputs\', \'pool_size\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'valid\', \'channels_last\', \'None\'], " + } + member_method { + name: "max_pooling3d" + argspec: "args=[\'inputs\', \'pool_size\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'valid\', \'channels_last\', \'None\'], " + } + member_method { + name: "separable_conv2d" + argspec: "args=[\'inputs\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'depth_multiplier\', \'activation\', \'use_bias\', \'depthwise_initializer\', \'pointwise_initializer\', \'bias_initializer\', \'depthwise_regularizer\', \'pointwise_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'trainable\', \'name\', \'reuse\'], varargs=None, keywords=None, defaults=[\'(1, 1)\', \'valid\', \'channels_last\', \'(1, 1)\', \'1\', \'None\', \'True\', \'None\', \'None\', \'\', \'None\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.logging.pbtxt b/tensorflow/tools/api/golden/tensorflow.logging.pbtxt new file mode 100644 index 00000000000..85bb15455da --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.logging.pbtxt @@ -0,0 +1,83 @@ +path: "tensorflow.logging" +tf_module { + member { + name: "DEBUG" + mtype: "" + } + member { + name: "ERROR" + mtype: "" + } + member { + name: "FATAL" + mtype: "" + } + member { + name: "INFO" + mtype: "" + } + member { + name: "WARN" + mtype: "" + } + member_method { + name: "TaskLevelStatusMessage" + argspec: "args=[\'msg\'], varargs=None, keywords=None, 
defaults=None" + } + member_method { + name: "debug" + argspec: "args=[\'msg\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "error" + argspec: "args=[\'msg\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "fatal" + argspec: "args=[\'msg\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "flush" + argspec: "args=[], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_verbosity" + argspec: "args=[], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "info" + argspec: "args=[\'msg\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "log" + argspec: "args=[\'level\', \'msg\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "log_every_n" + argspec: "args=[\'level\', \'msg\', \'n\'], varargs=args, keywords=None, defaults=None" + } + member_method { + name: "log_first_n" + argspec: "args=[\'level\', \'msg\', \'n\'], varargs=args, keywords=None, defaults=None" + } + member_method { + name: "log_if" + argspec: "args=[\'level\', \'msg\', \'condition\'], varargs=args, keywords=None, defaults=None" + } + member_method { + name: "set_verbosity" + argspec: "args=[\'v\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "vlog" + argspec: "args=[\'level\', \'msg\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "warn" + argspec: "args=[\'msg\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "warning" + argspec: "args=[\'msg\'], varargs=args, keywords=kwargs, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.losses.-reduction.pbtxt b/tensorflow/tools/api/golden/tensorflow.losses.-reduction.pbtxt new file mode 100644 index 00000000000..4bdc73370bf --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.losses.-reduction.pbtxt @@ -0,0 +1,32 @@ +path: "tensorflow.losses.Reduction" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "MEAN" + mtype: "" + } + member { + name: "NONE" + mtype: "" + } + member { + name: "SUM" + mtype: "" + } + member { + name: "SUM_BY_NONZERO_WEIGHTS" + mtype: "" + } + member_method { + name: "__init__" + } + member_method { + name: "all" + argspec: "args=[\'cls\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "validate" + argspec: "args=[\'cls\', \'key\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.losses.pbtxt b/tensorflow/tools/api/golden/tensorflow.losses.pbtxt new file mode 100644 index 00000000000..79443839b9a --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.losses.pbtxt @@ -0,0 +1,71 @@ +path: "tensorflow.losses" +tf_module { + member { + name: "Reduction" + mtype: "" + } + member_method { + name: "absolute_difference" + argspec: "args=[\'labels\', \'predictions\', \'weights\', \'scope\', \'loss_collection\', \'reduction\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\', \'losses\', \'weighted_sum_by_nonzero_weights\'], " + } + member_method { + name: "add_loss" + argspec: "args=[\'loss\', \'loss_collection\'], varargs=None, keywords=None, defaults=[\'losses\'], " + } + member_method { + name: "compute_weighted_loss" + argspec: "args=[\'losses\', \'weights\', \'scope\', \'loss_collection\', \'reduction\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\', \'losses\', \'weighted_sum_by_nonzero_weights\'], " + } + member_method { + name: 
"cosine_distance" + argspec: "args=[\'labels\', \'predictions\', \'dim\', \'weights\', \'scope\', \'loss_collection\', \'reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'1.0\', \'None\', \'losses\', \'weighted_sum_by_nonzero_weights\'], " + } + member_method { + name: "get_losses" + argspec: "args=[\'scope\', \'loss_collection\'], varargs=None, keywords=None, defaults=[\'None\', \'losses\'], " + } + member_method { + name: "get_regularization_loss" + argspec: "args=[\'scope\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'total_regularization_loss\'], " + } + member_method { + name: "get_regularization_losses" + argspec: "args=[\'scope\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "get_total_loss" + argspec: "args=[\'add_regularization_losses\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'total_loss\'], " + } + member_method { + name: "hinge_loss" + argspec: "args=[\'labels\', \'logits\', \'weights\', \'scope\', \'loss_collection\', \'reduction\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\', \'losses\', \'weighted_sum_by_nonzero_weights\'], " + } + member_method { + name: "huber_loss" + argspec: "args=[\'labels\', \'predictions\', \'weights\', \'delta\', \'scope\', \'loss_collection\', \'reduction\'], varargs=None, keywords=None, defaults=[\'1.0\', \'1.0\', \'None\', \'losses\', \'weighted_sum_by_nonzero_weights\'], " + } + member_method { + name: "log_loss" + argspec: "args=[\'labels\', \'predictions\', \'weights\', \'epsilon\', \'scope\', \'loss_collection\', \'reduction\'], varargs=None, keywords=None, defaults=[\'1.0\', \'1e-07\', \'None\', \'losses\', \'weighted_sum_by_nonzero_weights\'], " + } + member_method { + name: "mean_pairwise_squared_error" + argspec: "args=[\'labels\', \'predictions\', \'weights\', \'scope\', \'loss_collection\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\', \'losses\'], " + } + member_method { + name: "mean_squared_error" + argspec: "args=[\'labels\', \'predictions\', \'weights\', \'scope\', \'loss_collection\', \'reduction\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\', \'losses\', \'weighted_sum_by_nonzero_weights\'], " + } + member_method { + name: "sigmoid_cross_entropy" + argspec: "args=[\'multi_class_labels\', \'logits\', \'weights\', \'label_smoothing\', \'scope\', \'loss_collection\', \'reduction\'], varargs=None, keywords=None, defaults=[\'1.0\', \'0\', \'None\', \'losses\', \'weighted_sum_by_nonzero_weights\'], " + } + member_method { + name: "softmax_cross_entropy" + argspec: "args=[\'onehot_labels\', \'logits\', \'weights\', \'label_smoothing\', \'scope\', \'loss_collection\', \'reduction\'], varargs=None, keywords=None, defaults=[\'1.0\', \'0\', \'None\', \'losses\', \'weighted_sum_by_nonzero_weights\'], " + } + member_method { + name: "sparse_softmax_cross_entropy" + argspec: "args=[\'labels\', \'logits\', \'weights\', \'scope\', \'loss_collection\', \'reduction\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\', \'losses\', \'weighted_sum_by_nonzero_weights\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.metrics.pbtxt b/tensorflow/tools/api/golden/tensorflow.metrics.pbtxt new file mode 100644 index 00000000000..262d11c38e1 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.metrics.pbtxt @@ -0,0 +1,99 @@ +path: "tensorflow.metrics" +tf_module { + member_method { + name: "accuracy" + argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', 
\'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "auc" + argspec: "args=[\'labels\', \'predictions\', \'weights\', \'num_thresholds\', \'metrics_collections\', \'updates_collections\', \'curve\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'200\', \'None\', \'None\', \'ROC\', \'None\'], " + } + member_method { + name: "false_negatives" + argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "false_positives" + argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "mean" + argspec: "args=[\'values\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "mean_absolute_error" + argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "mean_cosine_distance" + argspec: "args=[\'labels\', \'predictions\', \'dim\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "mean_iou" + argspec: "args=[\'labels\', \'predictions\', \'num_classes\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "mean_per_class_accuracy" + argspec: "args=[\'labels\', \'predictions\', \'num_classes\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "mean_relative_error" + argspec: "args=[\'labels\', \'predictions\', \'normalizer\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "mean_squared_error" + argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "mean_tensor" + argspec: "args=[\'values\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "percentage_below" + argspec: "args=[\'values\', \'threshold\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "precision" + argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "precision_at_thresholds" + argspec: "args=[\'labels\', \'predictions\', 
\'thresholds\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "recall" + argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "recall_at_k" + argspec: "args=[\'labels\', \'predictions\', \'k\', \'class_id\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "recall_at_thresholds" + argspec: "args=[\'labels\', \'predictions\', \'thresholds\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "root_mean_squared_error" + argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "sensitivity_at_specificity" + argspec: "args=[\'labels\', \'predictions\', \'specificity\', \'weights\', \'num_thresholds\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'200\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "sparse_average_precision_at_k" + argspec: "args=[\'labels\', \'predictions\', \'k\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "sparse_precision_at_k" + argspec: "args=[\'labels\', \'predictions\', \'k\', \'class_id\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "specificity_at_sensitivity" + argspec: "args=[\'labels\', \'predictions\', \'sensitivity\', \'weights\', \'num_thresholds\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'200\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "true_positives" + argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.nn.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.pbtxt new file mode 100644 index 00000000000..9f817beafd9 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.nn.pbtxt @@ -0,0 +1,339 @@ +path: "tensorflow.nn" +tf_module { + member { + name: "rnn_cell" + mtype: "" + } + member_method { + name: "all_candidate_sampler" + argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "atrous_conv2d" + argspec: "args=[\'value\', \'filters\', \'rate\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "atrous_conv2d_transpose" + argspec: "args=[\'value\', \'filters\', \'output_shape\', \'rate\', \'padding\', \'name\'], varargs=None, keywords=None, 
defaults=[\'None\'], " + } + member_method { + name: "avg_pool" + argspec: "args=[\'value\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], " + } + member_method { + name: "avg_pool3d" + argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "batch_norm_with_global_normalization" + argspec: "args=[\'t\', \'m\', \'v\', \'beta\', \'gamma\', \'variance_epsilon\', \'scale_after_normalization\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "batch_normalization" + argspec: "args=[\'x\', \'mean\', \'variance\', \'offset\', \'scale\', \'variance_epsilon\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "bias_add" + argspec: "args=[\'value\', \'bias\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "bidirectional_dynamic_rnn" + argspec: "args=[\'cell_fw\', \'cell_bw\', \'inputs\', \'sequence_length\', \'initial_state_fw\', \'initial_state_bw\', \'dtype\', \'parallel_iterations\', \'swap_memory\', \'time_major\', \'scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'False\', \'False\', \'None\'], " + } + member_method { + name: "compute_accidental_hits" + argspec: "args=[\'true_classes\', \'sampled_candidates\', \'num_true\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "conv1d" + argspec: "args=[\'value\', \'filters\', \'stride\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + } + member_method { + name: "conv2d" + argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + } + member_method { + name: "conv2d_backprop_filter" + argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + } + member_method { + name: "conv2d_backprop_input" + argspec: "args=[\'input_sizes\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + } + member_method { + name: "conv2d_transpose" + argspec: "args=[\'value\', \'filter\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'NHWC\', \'None\'], " + } + member_method { + name: "conv3d" + argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "conv3d_backprop_filter_v2" + argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "conv3d_transpose" + argspec: "args=[\'value\', \'filter\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'NDHWC\', \'None\'], " + } + member_method 
{ + name: "convolution" + argspec: "args=[\'input\', \'filter\', \'padding\', \'strides\', \'dilation_rate\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "crelu" + argspec: "args=[\'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "ctc_beam_search_decoder" + argspec: "args=[\'inputs\', \'sequence_length\', \'beam_width\', \'top_paths\', \'merge_repeated\'], varargs=None, keywords=None, defaults=[\'100\', \'1\', \'True\'], " + } + member_method { + name: "ctc_greedy_decoder" + argspec: "args=[\'inputs\', \'sequence_length\', \'merge_repeated\'], varargs=None, keywords=None, defaults=[\'True\'], " + } + member_method { + name: "ctc_loss" + argspec: "args=[\'labels\', \'inputs\', \'sequence_length\', \'preprocess_collapse_repeated\', \'ctc_merge_repeated\', \'ignore_longer_outputs_than_inputs\', \'time_major\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'False\', \'True\'], " + } + member_method { + name: "depthwise_conv2d" + argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'rate\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + } + member_method { + name: "depthwise_conv2d_native" + argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "depthwise_conv2d_native_backprop_filter" + argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "depthwise_conv2d_native_backprop_input" + argspec: "args=[\'input_sizes\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "dilation2d" + argspec: "args=[\'input\', \'filter\', \'strides\', \'rates\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "dropout" + argspec: "args=[\'x\', \'keep_prob\', \'noise_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + } + member_method { + name: "dynamic_rnn" + argspec: "args=[\'cell\', \'inputs\', \'sequence_length\', \'initial_state\', \'dtype\', \'parallel_iterations\', \'swap_memory\', \'time_major\', \'scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'False\', \'False\', \'None\'], " + } + member_method { + name: "elu" + argspec: "args=[\'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "embedding_lookup" + argspec: "args=[\'params\', \'ids\', \'partition_strategy\', \'name\', \'validate_indices\', \'max_norm\'], varargs=None, keywords=None, defaults=[\'mod\', \'None\', \'True\', \'None\'], " + } + member_method { + name: "embedding_lookup_sparse" + argspec: "args=[\'params\', \'sp_ids\', \'sp_weights\', \'partition_strategy\', \'name\', \'combiner\', \'max_norm\'], varargs=None, keywords=None, defaults=[\'mod\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "erosion2d" + argspec: "args=[\'value\', \'kernel\', \'strides\', \'rates\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: 
"fixed_unigram_candidate_sampler" + argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'vocab_file\', \'distortion\', \'num_reserved_ids\', \'num_shards\', \'shard\', \'unigrams\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'1.0\', \'0\', \'1\', \'0\', \'()\', \'None\', \'None\'], " + } + member_method { + name: "fractional_avg_pool" + argspec: "args=[\'value\', \'pooling_ratio\', \'pseudo_random\', \'overlapping\', \'deterministic\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "fractional_max_pool" + argspec: "args=[\'value\', \'pooling_ratio\', \'pseudo_random\', \'overlapping\', \'deterministic\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "fused_batch_norm" + argspec: "args=[\'x\', \'scale\', \'offset\', \'mean\', \'variance\', \'epsilon\', \'data_format\', \'is_training\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'0.001\', \'NHWC\', \'True\', \'None\'], " + } + member_method { + name: "in_top_k" + argspec: "args=[\'predictions\', \'targets\', \'k\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "l2_loss" + argspec: "args=[\'t\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "l2_normalize" + argspec: "args=[\'x\', \'dim\', \'epsilon\', \'name\'], varargs=None, keywords=None, defaults=[\'1e-12\', \'None\'], " + } + member_method { + name: "learned_unigram_candidate_sampler" + argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "local_response_normalization" + argspec: "args=[\'input\', \'depth_radius\', \'bias\', \'alpha\', \'beta\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "log_poisson_loss" + argspec: "args=[\'targets\', \'log_input\', \'compute_full_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], " + } + member_method { + name: "log_softmax" + argspec: "args=[\'logits\', \'dim\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], " + } + member_method { + name: "log_uniform_candidate_sampler" + argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "lrn" + argspec: "args=[\'input\', \'depth_radius\', \'bias\', \'alpha\', \'beta\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "max_pool" + argspec: "args=[\'value\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], " + } + member_method { + name: "max_pool3d" + argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "max_pool_with_argmax" + argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'Targmax\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } 
+ member_method { + name: "moments" + argspec: "args=[\'x\', \'axes\', \'shift\', \'name\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\'], " + } + member_method { + name: "nce_loss" + argspec: "args=[\'weights\', \'biases\', \'labels\', \'inputs\', \'num_sampled\', \'num_classes\', \'num_true\', \'sampled_values\', \'remove_accidental_hits\', \'partition_strategy\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'None\', \'False\', \'mod\', \'nce_loss\'], " + } + member_method { + name: "normalize_moments" + argspec: "args=[\'counts\', \'mean_ss\', \'variance_ss\', \'shift\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "pool" + argspec: "args=[\'input\', \'window_shape\', \'pooling_type\', \'padding\', \'dilation_rate\', \'strides\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "quantized_avg_pool" + argspec: "args=[\'input\', \'min_input\', \'max_input\', \'ksize\', \'strides\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "quantized_conv2d" + argspec: "args=[\'input\', \'filter\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'strides\', \'padding\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "quantized_max_pool" + argspec: "args=[\'input\', \'min_input\', \'max_input\', \'ksize\', \'strides\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "quantized_relu_x" + argspec: "args=[\'features\', \'max_value\', \'min_features\', \'max_features\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "raw_rnn" + argspec: "args=[\'cell\', \'loop_fn\', \'parallel_iterations\', \'swap_memory\', \'scope\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], " + } + member_method { + name: "relu" + argspec: "args=[\'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "relu6" + argspec: "args=[\'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "relu_layer" + argspec: "args=[\'x\', \'weights\', \'biases\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "sampled_softmax_loss" + argspec: "args=[\'weights\', \'biases\', \'labels\', \'inputs\', \'num_sampled\', \'num_classes\', \'num_true\', \'sampled_values\', \'remove_accidental_hits\', \'partition_strategy\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'None\', \'True\', \'mod\', \'sampled_softmax_loss\'], " + } + member_method { + name: "separable_conv2d" + argspec: "args=[\'input\', \'depthwise_filter\', \'pointwise_filter\', \'strides\', \'padding\', \'rate\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + } + member_method { + name: "sigmoid" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "sigmoid_cross_entropy_with_logits" + argspec: "args=[\'_sentinel\', \'labels\', \'logits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "softmax" + argspec: "args=[\'logits\', \'dim\', \'name\'], varargs=None, 
keywords=None, defaults=[\'-1\', \'None\'], " + } + member_method { + name: "softmax_cross_entropy_with_logits" + argspec: "args=[\'_sentinel\', \'labels\', \'logits\', \'dim\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'-1\', \'None\'], " + } + member_method { + name: "softplus" + argspec: "args=[\'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "softsign" + argspec: "args=[\'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "sparse_softmax_cross_entropy_with_logits" + argspec: "args=[\'_sentinel\', \'labels\', \'logits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "static_bidirectional_rnn" + argspec: "args=[\'cell_fw\', \'cell_bw\', \'inputs\', \'initial_state_fw\', \'initial_state_bw\', \'dtype\', \'sequence_length\', \'scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "static_rnn" + argspec: "args=[\'cell\', \'inputs\', \'initial_state\', \'dtype\', \'sequence_length\', \'scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "static_state_saving_rnn" + argspec: "args=[\'cell\', \'inputs\', \'state_saver\', \'state_name\', \'sequence_length\', \'scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "sufficient_statistics" + argspec: "args=[\'x\', \'axes\', \'shift\', \'keep_dims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], " + } + member_method { + name: "tanh" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "top_k" + argspec: "args=[\'input\', \'k\', \'sorted\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'True\', \'None\'], " + } + member_method { + name: "uniform_candidate_sampler" + argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "weighted_cross_entropy_with_logits" + argspec: "args=[\'targets\', \'logits\', \'pos_weight\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "weighted_moments" + argspec: "args=[\'x\', \'axes\', \'frequency_weights\', \'name\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], " + } + member_method { + name: "with_space_to_batch" + argspec: "args=[\'input\', \'dilation_rate\', \'padding\', \'op\', \'filter_shape\', \'spatial_dims\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + } + member_method { + name: "xw_plus_b" + argspec: "args=[\'x\', \'weights\', \'biases\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "zero_fraction" + argspec: "args=[\'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt new file mode 100644 index 00000000000..fbf68c50a1a --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt @@ -0,0 +1,95 @@ +path: 
"tensorflow.nn.rnn_cell.BasicLSTMCell" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "graph" + mtype: "" + } + member { + name: "losses" + mtype: "" + } + member { + name: "non_trainable_variables" + mtype: "" + } + member { + name: "non_trainable_weights" + mtype: "" + } + member { + name: "output_size" + mtype: "" + } + member { + name: "scope_name" + mtype: "" + } + member { + name: "state_size" + mtype: "" + } + member { + name: "trainable_variables" + mtype: "" + } + member { + name: "trainable_weights" + mtype: "" + } + member { + name: "updates" + mtype: "" + } + member { + name: "variables" + mtype: "" + } + member { + name: "weights" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'num_units\', \'forget_bias\', \'state_is_tuple\', \'activation\', \'reuse\'], varargs=None, keywords=None, defaults=[\'1.0\', \'True\', \'None\', \'None\'], " + } + member_method { + name: "add_loss" + argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_update" + argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_variable" + argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], " + } + member_method { + name: "apply" + argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "build" + argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "call" + argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_losses_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_updates_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "zero_state" + argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt new file mode 100644 index 00000000000..606d20d8f0f --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt @@ -0,0 +1,95 @@ +path: "tensorflow.nn.rnn_cell.BasicRNNCell" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "graph" + mtype: "" + } + member { + name: "losses" + mtype: "" + } + member { + name: "non_trainable_variables" + mtype: "" + } + member { + name: "non_trainable_weights" + mtype: "" + } + member { + name: "output_size" + mtype: "" + } + member { + name: "scope_name" + mtype: "" + } + member { + name: "state_size" + mtype: "" + } + member { + name: "trainable_variables" + mtype: "" + } + member { + name: "trainable_weights" + mtype: "" + } + member { + name: "updates" + mtype: "" + } + member { + name: "variables" + mtype: "" + } + member { + name: "weights" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'num_units\', \'activation\', \'reuse\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "add_loss" + argspec: 
"args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_update" + argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_variable" + argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], " + } + member_method { + name: "apply" + argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "build" + argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "call" + argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_losses_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_updates_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "zero_state" + argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt new file mode 100644 index 00000000000..ead1d0cfc51 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt @@ -0,0 +1,95 @@ +path: "tensorflow.nn.rnn_cell.DeviceWrapper" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "graph" + mtype: "" + } + member { + name: "losses" + mtype: "" + } + member { + name: "non_trainable_variables" + mtype: "" + } + member { + name: "non_trainable_weights" + mtype: "" + } + member { + name: "output_size" + mtype: "" + } + member { + name: "scope_name" + mtype: "" + } + member { + name: "state_size" + mtype: "" + } + member { + name: "trainable_variables" + mtype: "" + } + member { + name: "trainable_weights" + mtype: "" + } + member { + name: "updates" + mtype: "" + } + member { + name: "variables" + mtype: "" + } + member { + name: "weights" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'cell\', \'device\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "add_loss" + argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_update" + argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_variable" + argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], " + } + member_method { + name: "apply" + argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "build" + argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "call" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None" + } + member_method { + name: "get_losses_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_updates_for" + argspec: 
"args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "zero_state" + argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt new file mode 100644 index 00000000000..2db4996b2a4 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt @@ -0,0 +1,95 @@ +path: "tensorflow.nn.rnn_cell.DropoutWrapper" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "graph" + mtype: "" + } + member { + name: "losses" + mtype: "" + } + member { + name: "non_trainable_variables" + mtype: "" + } + member { + name: "non_trainable_weights" + mtype: "" + } + member { + name: "output_size" + mtype: "" + } + member { + name: "scope_name" + mtype: "" + } + member { + name: "state_size" + mtype: "" + } + member { + name: "trainable_variables" + mtype: "" + } + member { + name: "trainable_weights" + mtype: "" + } + member { + name: "updates" + mtype: "" + } + member { + name: "variables" + mtype: "" + } + member { + name: "weights" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'cell\', \'input_keep_prob\', \'output_keep_prob\', \'state_keep_prob\', \'variational_recurrent\', \'input_size\', \'dtype\', \'seed\'], varargs=None, keywords=None, defaults=[\'1.0\', \'1.0\', \'1.0\', \'False\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "add_loss" + argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_update" + argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_variable" + argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], " + } + member_method { + name: "apply" + argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "build" + argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "call" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None" + } + member_method { + name: "get_losses_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_updates_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "zero_state" + argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt new file mode 100644 index 00000000000..101f6df1d84 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt @@ -0,0 +1,95 @@ +path: "tensorflow.nn.rnn_cell.GRUCell" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "graph" + mtype: "" + } + member { + name: "losses" + mtype: "" + } + member { + name: "non_trainable_variables" + mtype: "" + } + member { + name: "non_trainable_weights" + mtype: "" + } + 
member { + name: "output_size" + mtype: "" + } + member { + name: "scope_name" + mtype: "" + } + member { + name: "state_size" + mtype: "" + } + member { + name: "trainable_variables" + mtype: "" + } + member { + name: "trainable_weights" + mtype: "" + } + member { + name: "updates" + mtype: "" + } + member { + name: "variables" + mtype: "" + } + member { + name: "weights" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'num_units\', \'activation\', \'reuse\', \'kernel_initializer\', \'bias_initializer\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "add_loss" + argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_update" + argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_variable" + argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], " + } + member_method { + name: "apply" + argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "build" + argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "call" + argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_losses_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_updates_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "zero_state" + argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt new file mode 100644 index 00000000000..c87546d5285 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt @@ -0,0 +1,95 @@ +path: "tensorflow.nn.rnn_cell.LSTMCell" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "graph" + mtype: "" + } + member { + name: "losses" + mtype: "" + } + member { + name: "non_trainable_variables" + mtype: "" + } + member { + name: "non_trainable_weights" + mtype: "" + } + member { + name: "output_size" + mtype: "" + } + member { + name: "scope_name" + mtype: "" + } + member { + name: "state_size" + mtype: "" + } + member { + name: "trainable_variables" + mtype: "" + } + member { + name: "trainable_weights" + mtype: "" + } + member { + name: "updates" + mtype: "" + } + member { + name: "variables" + mtype: "" + } + member { + name: "weights" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'num_units\', \'use_peepholes\', \'cell_clip\', \'initializer\', \'num_proj\', \'proj_clip\', \'num_unit_shards\', \'num_proj_shards\', \'forget_bias\', \'state_is_tuple\', \'activation\', \'reuse\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'1.0\', \'True\', \'None\', \'None\'], " + } + member_method { + name: "add_loss" + argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, 
defaults=[\'None\'], " + } + member_method { + name: "add_update" + argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_variable" + argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], " + } + member_method { + name: "apply" + argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "build" + argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "call" + argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_losses_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_updates_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "zero_state" + argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-state-tuple.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-state-tuple.pbtxt new file mode 100644 index 00000000000..1de8a55dcca --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-state-tuple.pbtxt @@ -0,0 +1,27 @@ +path: "tensorflow.nn.rnn_cell.LSTMStateTuple" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "c" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "h" + mtype: "" + } + member_method { + name: "__init__" + } + member_method { + name: "count" + } + member_method { + name: "index" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt new file mode 100644 index 00000000000..bc01ccfa647 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt @@ -0,0 +1,95 @@ +path: "tensorflow.nn.rnn_cell.MultiRNNCell" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "graph" + mtype: "" + } + member { + name: "losses" + mtype: "" + } + member { + name: "non_trainable_variables" + mtype: "" + } + member { + name: "non_trainable_weights" + mtype: "" + } + member { + name: "output_size" + mtype: "" + } + member { + name: "scope_name" + mtype: "" + } + member { + name: "state_size" + mtype: "" + } + member { + name: "trainable_variables" + mtype: "" + } + member { + name: "trainable_weights" + mtype: "" + } + member { + name: "updates" + mtype: "" + } + member { + name: "variables" + mtype: "" + } + member { + name: "weights" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'cells\', \'state_is_tuple\'], varargs=None, keywords=None, defaults=[\'True\'], " + } + member_method { + name: "add_loss" + argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_update" + argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_variable" + argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\'], 
varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], " + } + member_method { + name: "apply" + argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "build" + argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "call" + argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_losses_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_updates_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "zero_state" + argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt new file mode 100644 index 00000000000..b19ee18b40f --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt @@ -0,0 +1,94 @@ +path: "tensorflow.nn.rnn_cell.RNNCell" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "graph" + mtype: "" + } + member { + name: "losses" + mtype: "" + } + member { + name: "non_trainable_variables" + mtype: "" + } + member { + name: "non_trainable_weights" + mtype: "" + } + member { + name: "output_size" + mtype: "" + } + member { + name: "scope_name" + mtype: "" + } + member { + name: "state_size" + mtype: "" + } + member { + name: "trainable_variables" + mtype: "" + } + member { + name: "trainable_weights" + mtype: "" + } + member { + name: "updates" + mtype: "" + } + member { + name: "variables" + mtype: "" + } + member { + name: "weights" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'trainable\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\', \"\"], " + } + member_method { + name: "add_loss" + argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_update" + argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_variable" + argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], " + } + member_method { + name: "apply" + argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "build" + argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "call" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None" + } + member_method { + name: "get_losses_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_updates_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "zero_state" + argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt new file 
mode 100644 index 00000000000..b21d9a8ee33 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt @@ -0,0 +1,95 @@ +path: "tensorflow.nn.rnn_cell.ResidualWrapper" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "graph" + mtype: "" + } + member { + name: "losses" + mtype: "" + } + member { + name: "non_trainable_variables" + mtype: "" + } + member { + name: "non_trainable_weights" + mtype: "" + } + member { + name: "output_size" + mtype: "" + } + member { + name: "scope_name" + mtype: "" + } + member { + name: "state_size" + mtype: "" + } + member { + name: "trainable_variables" + mtype: "" + } + member { + name: "trainable_weights" + mtype: "" + } + member { + name: "updates" + mtype: "" + } + member { + name: "variables" + mtype: "" + } + member { + name: "weights" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'cell\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "add_loss" + argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_update" + argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_variable" + argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], " + } + member_method { + name: "apply" + argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "build" + argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "call" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None" + } + member_method { + name: "get_losses_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_updates_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "zero_state" + argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.pbtxt new file mode 100644 index 00000000000..64697e8a02b --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.pbtxt @@ -0,0 +1,43 @@ +path: "tensorflow.nn.rnn_cell" +tf_module { + member { + name: "BasicLSTMCell" + mtype: "" + } + member { + name: "BasicRNNCell" + mtype: "" + } + member { + name: "DeviceWrapper" + mtype: "" + } + member { + name: "DropoutWrapper" + mtype: "" + } + member { + name: "GRUCell" + mtype: "" + } + member { + name: "LSTMCell" + mtype: "" + } + member { + name: "LSTMStateTuple" + mtype: "" + } + member { + name: "MultiRNNCell" + mtype: "" + } + member { + name: "RNNCell" + mtype: "" + } + member { + name: "ResidualWrapper" + mtype: "" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.ones_initializer.pbtxt b/tensorflow/tools/api/golden/tensorflow.ones_initializer.pbtxt new file mode 100644 index 00000000000..210b56242b2 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.ones_initializer.pbtxt @@ -0,0 +1,18 @@ +path: "tensorflow.ones_initializer" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + 
name: "__init__" + argspec: "args=[\'self\', \'dtype\'], varargs=None, keywords=None, defaults=[\"\"], " + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.orthogonal_initializer.pbtxt b/tensorflow/tools/api/golden/tensorflow.orthogonal_initializer.pbtxt new file mode 100644 index 00000000000..13ec7454f41 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.orthogonal_initializer.pbtxt @@ -0,0 +1,18 @@ +path: "tensorflow.orthogonal_initializer" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'gain\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\', \"\"], " + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt new file mode 100644 index 00000000000..342ee95f74d --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.pbtxt @@ -0,0 +1,1967 @@ +path: "tensorflow" +tf_module { + member { + name: "AggregationMethod" + mtype: "" + } + member { + name: "AttrValue" + mtype: "" + } + member { + name: "COMPILER_VERSION" + mtype: "" + } + member { + name: "ConditionalAccumulator" + mtype: "" + } + member { + name: "ConditionalAccumulatorBase" + mtype: "" + } + member { + name: "ConfigProto" + mtype: "" + } + member { + name: "DType" + mtype: "" + } + member { + name: "DeviceSpec" + mtype: "" + } + member { + name: "Dimension" + mtype: "" + } + member { + name: "Event" + mtype: "" + } + member { + name: "FIFOQueue" + mtype: "" + } + member { + name: "FixedLenFeature" + mtype: "" + } + member { + name: "FixedLenSequenceFeature" + mtype: "" + } + member { + name: "FixedLengthRecordReader" + mtype: "" + } + member { + name: "GIT_VERSION" + mtype: "" + } + member { + name: "GPUOptions" + mtype: "" + } + member { + name: "GRAPH_DEF_VERSION" + mtype: "" + } + member { + name: "GRAPH_DEF_VERSION_MIN_CONSUMER" + mtype: "" + } + member { + name: "GRAPH_DEF_VERSION_MIN_PRODUCER" + mtype: "" + } + member { + name: "Graph" + mtype: "" + } + member { + name: "GraphDef" + mtype: "" + } + member { + name: "GraphKeys" + mtype: "" + } + member { + name: "GraphOptions" + mtype: "" + } + member { + name: "HistogramProto" + mtype: "" + } + member { + name: "IdentityReader" + mtype: "" + } + member { + name: "IndexedSlices" + mtype: "" + } + member { + name: "InteractiveSession" + mtype: "" + } + member { + name: "LogMessage" + mtype: "" + } + member { + name: "MetaGraphDef" + mtype: "" + } + member { + name: "NameAttrList" + mtype: "" + } + member { + name: "NodeDef" + mtype: "" + } + member { + name: "OpError" + mtype: "" + } + member { + name: "Operation" + mtype: "" + } + member { + name: "OptimizerOptions" + mtype: "" + } + member { + name: "PaddingFIFOQueue" + mtype: "" + } + member { + name: "PriorityQueue" + mtype: "" + } + member { + name: "QUANTIZED_DTYPES" + mtype: "" + } + member { + name: "QueueBase" + mtype: "" + } + member { + name: "RandomShuffleQueue" + mtype: "" + } + member { + name: "ReaderBase" + mtype: "" + } + member { + name: 
"RegisterGradient" + mtype: "" + } + member { + name: "RunMetadata" + mtype: "" + } + member { + name: "RunOptions" + mtype: "" + } + member { + name: "Session" + mtype: "" + } + member { + name: "SessionLog" + mtype: "" + } + member { + name: "SparseConditionalAccumulator" + mtype: "" + } + member { + name: "SparseFeature" + mtype: "" + } + member { + name: "SparseTensor" + mtype: "" + } + member { + name: "SparseTensorValue" + mtype: "" + } + member { + name: "Summary" + mtype: "" + } + member { + name: "TFRecordReader" + mtype: "" + } + member { + name: "Tensor" + mtype: "" + } + member { + name: "TensorArray" + mtype: "" + } + member { + name: "TensorInfo" + mtype: "" + } + member { + name: "TensorShape" + mtype: "" + } + member { + name: "TextLineReader" + mtype: "" + } + member { + name: "VERSION" + mtype: "" + } + member { + name: "VarLenFeature" + mtype: "" + } + member { + name: "Variable" + mtype: "" + } + member { + name: "VariableScope" + mtype: "" + } + member { + name: "WholeFileReader" + mtype: "" + } + member { + name: "app" + mtype: "" + } + member { + name: "bfloat16" + mtype: "" + } + member { + name: "bool" + mtype: "" + } + member { + name: "compat" + mtype: "" + } + member { + name: "complex128" + mtype: "" + } + member { + name: "complex64" + mtype: "" + } + member { + name: "constant_initializer" + mtype: "" + } + member { + name: "contrib" + mtype: "" + } + member { + name: "double" + mtype: "" + } + member { + name: "errors" + mtype: "" + } + member { + name: "estimator" + mtype: "" + } + member { + name: "feature_column" + mtype: "" + } + member { + name: "flags" + mtype: "" + } + member { + name: "float16" + mtype: "" + } + member { + name: "float32" + mtype: "" + } + member { + name: "float64" + mtype: "" + } + member { + name: "gfile" + mtype: "" + } + member { + name: "graph_util" + mtype: "" + } + member { + name: "half" + mtype: "" + } + member { + name: "image" + mtype: "" + } + member { + name: "int16" + mtype: "" + } + member { + name: "int32" + mtype: "" + } + member { + name: "int64" + mtype: "" + } + member { + name: "int8" + mtype: "" + } + member { + name: "layers" + mtype: "" + } + member { + name: "logging" + mtype: "" + } + member { + name: "losses" + mtype: "" + } + member { + name: "metrics" + mtype: "" + } + member { + name: "newaxis" + mtype: "" + } + member { + name: "nn" + mtype: "" + } + member { + name: "ones_initializer" + mtype: "" + } + member { + name: "orthogonal_initializer" + mtype: "" + } + member { + name: "python_io" + mtype: "" + } + member { + name: "pywrap_tensorflow" + mtype: "" + } + member { + name: "qint16" + mtype: "" + } + member { + name: "qint32" + mtype: "" + } + member { + name: "qint8" + mtype: "" + } + member { + name: "quint16" + mtype: "" + } + member { + name: "quint8" + mtype: "" + } + member { + name: "random_normal_initializer" + mtype: "" + } + member { + name: "random_uniform_initializer" + mtype: "" + } + member { + name: "resource" + mtype: "" + } + member { + name: "resource_loader" + mtype: "" + } + member { + name: "saved_model" + mtype: "" + } + member { + name: "sets" + mtype: "" + } + member { + name: "spectral" + mtype: "" + } + member { + name: "string" + mtype: "" + } + member { + name: "summary" + mtype: "" + } + member { + name: "sysconfig" + mtype: "" + } + member { + name: "test" + mtype: "" + } + member { + name: "train" + mtype: "" + } + member { + name: "truncated_normal_initializer" + mtype: "" + } + member { + name: "uint16" + mtype: "" + } + member { + name: "uint8" + mtype: "" + } + 
member { + name: "uniform_unit_scaling_initializer" + mtype: "" + } + member { + name: "user_ops" + mtype: "" + } + member { + name: "zeros_initializer" + mtype: "" + } + member_method { + name: "Assert" + argspec: "args=[\'condition\', \'data\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "NoGradient" + argspec: "args=[\'op_type\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "NotDifferentiable" + argspec: "args=[\'op_type\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "Print" + argspec: "args=[\'input_\', \'data\', \'message\', \'first_n\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "abs" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "accumulate_n" + argspec: "args=[\'inputs\', \'shape\', \'tensor_dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + } + member_method { + name: "acos" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add" + argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_check_numerics_ops" + argspec: "args=[], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "add_n" + argspec: "args=[\'inputs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_to_collection" + argspec: "args=[\'name\', \'value\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "all_variables" + argspec: "args=[], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "arg_max" + argspec: "args=[\'input\', \'dimension\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "arg_min" + argspec: "args=[\'input\', \'dimension\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "argmax" + argspec: "args=[\'input\', \'axis\', \'name\', \'dimension\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + } + member_method { + name: "argmin" + argspec: "args=[\'input\', \'axis\', \'name\', \'dimension\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + } + member_method { + name: "as_dtype" + argspec: "args=[\'type_value\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "as_string" + argspec: "args=[\'input\', \'precision\', \'scientific\', \'shortest\', \'width\', \'fill\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "asin" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "assert_equal" + argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "assert_greater" + argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "assert_greater_equal" + argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], 
varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "assert_integer" + argspec: "args=[\'x\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "assert_less" + argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "assert_less_equal" + argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "assert_negative" + argspec: "args=[\'x\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "assert_non_negative" + argspec: "args=[\'x\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "assert_non_positive" + argspec: "args=[\'x\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "assert_none_equal" + argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "assert_positive" + argspec: "args=[\'x\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "assert_proper_iterable" + argspec: "args=[\'values\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "assert_rank" + argspec: "args=[\'x\', \'rank\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "assert_rank_at_least" + argspec: "args=[\'x\', \'rank\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "assert_same_float_dtype" + argspec: "args=[\'tensors\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "assert_scalar" + argspec: "args=[\'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "assert_type" + argspec: "args=[\'tensor\', \'tf_type\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "assert_variables_initialized" + argspec: "args=[\'var_list\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "assign" + argspec: "args=[\'ref\', \'value\', \'validate_shape\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + } + member_method { + name: "assign_add" + argspec: "args=[\'ref\', \'value\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "assign_sub" + argspec: "args=[\'ref\', \'value\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "atan" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + 
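The `assert_*` argspecs just recorded all follow the same pattern: each returns a check op that is typically gated with `control_dependencies` (whose argspec appears below). A minimal sketch, assuming TF 1.x graph mode with an invented tensor:

```python
# Sketch of the tf.assert_* pattern recorded above (TF 1.x).
import tensorflow as tf

x = tf.constant([1.0, 2.0, 3.0])

# assert_positive: args=['x', 'data', 'summarize', 'message', 'name']
check = tf.assert_positive(x, message='x must be > 0')

# y is only computed after the check op runs (and passes).
with tf.control_dependencies([check]):
    y = tf.identity(x) * 2.0
```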
member_method { + name: "atan2" + argspec: "args=[\'y\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "batch_to_space" + argspec: "args=[\'input\', \'crops\', \'block_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "batch_to_space_nd" + argspec: "args=[\'input\', \'block_shape\', \'crops\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "betainc" + argspec: "args=[\'a\', \'b\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "bincount" + argspec: "args=[\'arr\', \'weights\', \'minlength\', \'maxlength\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \"\"], " + } + member_method { + name: "bitcast" + argspec: "args=[\'input\', \'type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "boolean_mask" + argspec: "args=[\'tensor\', \'mask\', \'name\'], varargs=None, keywords=None, defaults=[\'boolean_mask\'], " + } + member_method { + name: "broadcast_dynamic_shape" + argspec: "args=[\'shape_x\', \'shape_y\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "broadcast_static_shape" + argspec: "args=[\'shape_x\', \'shape_y\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "case" + argspec: "args=[\'pred_fn_pairs\', \'default\', \'exclusive\', \'strict\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'case\'], " + } + member_method { + name: "cast" + argspec: "args=[\'x\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "ceil" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "check_numerics" + argspec: "args=[\'tensor\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "cholesky" + argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "cholesky_solve" + argspec: "args=[\'chol\', \'rhs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "clip_by_average_norm" + argspec: "args=[\'t\', \'clip_norm\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "clip_by_global_norm" + argspec: "args=[\'t_list\', \'clip_norm\', \'use_norm\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "clip_by_norm" + argspec: "args=[\'t\', \'clip_norm\', \'axes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "clip_by_value" + argspec: "args=[\'t\', \'clip_value_min\', \'clip_value_max\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "complex" + argspec: "args=[\'real\', \'imag\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "concat" + argspec: "args=[\'values\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'concat\'], " + } + member_method { + name: "cond" + argspec: "args=[\'pred\', \'true_fn\', \'false_fn\', \'strict\', \'name\', \'fn1\', \'fn2\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "confusion_matrix" + argspec: 
"args=[\'labels\', \'predictions\', \'num_classes\', \'dtype\', \'name\', \'weights\'], varargs=None, keywords=None, defaults=[\'None\', \"\", \'None\', \'None\'], " + } + member_method { + name: "conj" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "constant" + argspec: "args=[\'value\', \'dtype\', \'shape\', \'name\', \'verify_shape\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Const\', \'False\'], " + } + member_method { + name: "container" + argspec: "args=[\'container_name\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "control_dependencies" + argspec: "args=[\'control_inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "convert_to_tensor" + argspec: "args=[\'value\', \'dtype\', \'name\', \'preferred_dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + } + member_method { + name: "convert_to_tensor_or_indexed_slices" + argspec: "args=[\'value\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "convert_to_tensor_or_sparse_tensor" + argspec: "args=[\'value\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "cos" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "cosh" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "count_nonzero" + argspec: "args=[\'input_tensor\', \'axis\', \'keep_dims\', \'dtype\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \"\", \'None\', \'None\'], " + } + member_method { + name: "count_up_to" + argspec: "args=[\'ref\', \'limit\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "create_partitioned_variables" + argspec: "args=[\'shape\', \'slicing\', \'initializer\', \'dtype\', \'trainable\', \'collections\', \'name\', \'reuse\'], varargs=None, keywords=None, defaults=[\"\", \'True\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "cross" + argspec: "args=[\'a\', \'b\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "cumprod" + argspec: "args=[\'x\', \'axis\', \'exclusive\', \'reverse\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'False\', \'False\', \'None\'], " + } + member_method { + name: "cumsum" + argspec: "args=[\'x\', \'axis\', \'exclusive\', \'reverse\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'False\', \'False\', \'None\'], " + } + member_method { + name: "decode_base64" + argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "decode_csv" + argspec: "args=[\'records\', \'record_defaults\', \'field_delim\', \'use_quote_delim\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + } + member_method { + name: "decode_json_example" + argspec: "args=[\'json_examples\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "decode_raw" + argspec: "args=[\'bytes\', \'out_type\', \'little_endian\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "delete_session_tensor" + argspec: "args=[\'handle\', \'name\'], varargs=None, keywords=None, 
defaults=[\'None\'], " + } + member_method { + name: "depth_to_space" + argspec: "args=[\'input\', \'block_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "dequantize" + argspec: "args=[\'input\', \'min_range\', \'max_range\', \'mode\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "deserialize_many_sparse" + argspec: "args=[\'serialized_sparse\', \'dtype\', \'rank\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "device" + argspec: "args=[\'device_name_or_function\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "diag" + argspec: "args=[\'diagonal\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "diag_part" + argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "digamma" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "div" + argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "divide" + argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "dynamic_partition" + argspec: "args=[\'data\', \'partitions\', \'num_partitions\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "dynamic_stitch" + argspec: "args=[\'indices\', \'data\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "edit_distance" + argspec: "args=[\'hypothesis\', \'truth\', \'normalize\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'edit_distance\'], " + } + member_method { + name: "einsum" + argspec: "args=[\'equation\'], varargs=inputs, keywords=None, defaults=None" + } + member_method { + name: "encode_base64" + argspec: "args=[\'input\', \'pad\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "equal" + argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "erf" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "erfc" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "exp" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "expand_dims" + argspec: "args=[\'input\', \'axis\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + } + member_method { + name: "expm1" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "extract_image_patches" + argspec: "args=[\'images\', \'ksizes\', \'strides\', \'rates\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "eye" + argspec: "args=[\'num_rows\', \'num_columns\', \'batch_shape\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \"\", \'None\'], " + } + member_method { + name: "fake_quant_with_min_max_args" + argspec: "args=[\'inputs\', \'min\', \'max\', \'num_bits\', \'narrow_range\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', 
\'None\', \'None\', \'None\'], " + } + member_method { + name: "fake_quant_with_min_max_args_gradient" + argspec: "args=[\'gradients\', \'inputs\', \'min\', \'max\', \'num_bits\', \'narrow_range\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "fake_quant_with_min_max_vars" + argspec: "args=[\'inputs\', \'min\', \'max\', \'num_bits\', \'narrow_range\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + } + member_method { + name: "fake_quant_with_min_max_vars_gradient" + argspec: "args=[\'gradients\', \'inputs\', \'min\', \'max\', \'num_bits\', \'narrow_range\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + } + member_method { + name: "fake_quant_with_min_max_vars_per_channel" + argspec: "args=[\'inputs\', \'min\', \'max\', \'num_bits\', \'narrow_range\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + } + member_method { + name: "fake_quant_with_min_max_vars_per_channel_gradient" + argspec: "args=[\'gradients\', \'inputs\', \'min\', \'max\', \'num_bits\', \'narrow_range\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + } + member_method { + name: "fft" + argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "fft2d" + argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "fft3d" + argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "fill" + argspec: "args=[\'dims\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "fixed_size_partitioner" + argspec: "args=[\'num_shards\', \'axis\'], varargs=None, keywords=None, defaults=[\'0\'], " + } + member_method { + name: "floor" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "floor_div" + argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "floordiv" + argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "floormod" + argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "foldl" + argspec: "args=[\'fn\', \'elems\', \'initializer\', \'parallel_iterations\', \'back_prop\', \'swap_memory\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'True\', \'False\', \'None\'], " + } + member_method { + name: "foldr" + argspec: "args=[\'fn\', \'elems\', \'initializer\', \'parallel_iterations\', \'back_prop\', \'swap_memory\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'True\', \'False\', \'None\'], " + } + member_method { + name: "gather" + argspec: "args=[\'params\', \'indices\', \'validate_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "gather_nd" + argspec: "args=[\'params\', \'indices\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "get_collection" + argspec: "args=[\'key\', \'scope\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "get_collection_ref" + argspec: "args=[\'key\'], varargs=None, keywords=None, 
defaults=None" + } + member_method { + name: "get_default_graph" + argspec: "args=[], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_default_session" + argspec: "args=[], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_local_variable" + argspec: "args=[], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "get_seed" + argspec: "args=[\'op_seed\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_session_handle" + argspec: "args=[\'data\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "get_session_tensor" + argspec: "args=[\'handle\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "get_variable" + argspec: "args=[\'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'collections\', \'caching_device\', \'partitioner\', \'validate_shape\', \'use_resource\', \'custom_getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], " + } + member_method { + name: "get_variable_scope" + argspec: "args=[], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "global_norm" + argspec: "args=[\'t_list\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "global_variables" + argspec: "args=[], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "global_variables_initializer" + argspec: "args=[], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "gradients" + argspec: "args=[\'ys\', \'xs\', \'grad_ys\', \'name\', \'colocate_gradients_with_ops\', \'gate_gradients\', \'aggregation_method\'], varargs=None, keywords=None, defaults=[\'None\', \'gradients\', \'False\', \'False\', \'None\'], " + } + member_method { + name: "greater" + argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "greater_equal" + argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "group" + argspec: "args=[], varargs=inputs, keywords=kwargs, defaults=None" + } + member_method { + name: "hessians" + argspec: "args=[\'ys\', \'xs\', \'name\', \'colocate_gradients_with_ops\', \'gate_gradients\', \'aggregation_method\'], varargs=None, keywords=None, defaults=[\'hessians\', \'False\', \'False\', \'None\'], " + } + member_method { + name: "histogram_fixed_width" + argspec: "args=[\'values\', \'value_range\', \'nbins\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'100\', \"\", \'None\'], " + } + member_method { + name: "identity" + argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "ifft" + argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "ifft2d" + argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "ifft3d" + argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "igamma" + argspec: "args=[\'a\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "igammac" + argspec: "args=[\'a\', \'x\', \'name\'], varargs=None, 
keywords=None, defaults=[\'None\'], " + } + member_method { + name: "imag" + argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "import_graph_def" + argspec: "args=[\'graph_def\', \'input_map\', \'return_elements\', \'name\', \'op_dict\', \'producer_op_list\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "initialize_all_tables" + argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'init_all_tables\'], " + } + member_method { + name: "initialize_all_variables" + argspec: "args=[], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "initialize_local_variables" + argspec: "args=[], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "initialize_variables" + argspec: "args=[\'var_list\', \'name\'], varargs=None, keywords=None, defaults=[\'init\'], " + } + member_method { + name: "invert_permutation" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "is_finite" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "is_inf" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "is_nan" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "is_non_decreasing" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "is_numeric_tensor" + argspec: "args=[\'tensor\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "is_strictly_increasing" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "is_variable_initialized" + argspec: "args=[\'variable\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "lbeta" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'lbeta\'], " + } + member_method { + name: "less" + argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "less_equal" + argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "lgamma" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "lin_space" + argspec: "args=[\'start\', \'stop\', \'num\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "linspace" + argspec: "args=[\'start\', \'stop\', \'num\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "load_file_system_library" + argspec: "args=[\'library_filename\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "load_op_library" + argspec: "args=[\'library_filename\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "local_variables" + argspec: "args=[], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "local_variables_initializer" + argspec: "args=[], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "log" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "log1p" + argspec: "args=[\'x\', 
\'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "log_sigmoid" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "logical_and" + argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "logical_not" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "logical_or" + argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "logical_xor" + argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'LogicalXor\'], " + } + member_method { + name: "make_ndarray" + argspec: "args=[\'tensor\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "make_template" + argspec: "args=[\'name_\', \'func_\', \'create_scope_now_\', \'unique_name_\', \'custom_getter_\'], varargs=None, keywords=kwargs, defaults=[\'False\', \'None\', \'None\'], " + } + member_method { + name: "make_tensor_proto" + argspec: "args=[\'values\', \'dtype\', \'shape\', \'verify_shape\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\'], " + } + member_method { + name: "map_fn" + argspec: "args=[\'fn\', \'elems\', \'dtype\', \'parallel_iterations\', \'back_prop\', \'swap_memory\', \'infer_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'True\', \'False\', \'True\', \'None\'], " + } + member_method { + name: "matching_files" + argspec: "args=[\'pattern\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "matmul" + argspec: "args=[\'a\', \'b\', \'transpose_a\', \'transpose_b\', \'adjoint_a\', \'adjoint_b\', \'a_is_sparse\', \'b_is_sparse\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'False\', \'False\', \'None\'], " + } + member_method { + name: "matrix_band_part" + argspec: "args=[\'input\', \'num_lower\', \'num_upper\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "matrix_determinant" + argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "matrix_diag" + argspec: "args=[\'diagonal\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "matrix_diag_part" + argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "matrix_inverse" + argspec: "args=[\'input\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "matrix_set_diag" + argspec: "args=[\'input\', \'diagonal\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "matrix_solve" + argspec: "args=[\'matrix\', \'rhs\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "matrix_solve_ls" + argspec: "args=[\'matrix\', \'rhs\', \'l2_regularizer\', \'fast\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'True\', \'None\'], " + } + member_method { + name: "matrix_transpose" + argspec: "args=[\'a\', \'name\'], varargs=None, keywords=None, defaults=[\'matrix_transpose\'], " + } + member_method { + name: "matrix_triangular_solve" + argspec: "args=[\'matrix\', \'rhs\', \'lower\', \'adjoint\', 
\'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + } + member_method { + name: "maximum" + argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "meshgrid" + argspec: "args=[], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "min_max_variable_partitioner" + argspec: "args=[\'max_partitions\', \'axis\', \'min_slice_size\', \'bytes_per_string_element\'], varargs=None, keywords=None, defaults=[\'1\', \'0\', \'262144\', \'16\'], " + } + member_method { + name: "minimum" + argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "mod" + argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "model_variables" + argspec: "args=[], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "moving_average_variables" + argspec: "args=[], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "multinomial" + argspec: "args=[\'logits\', \'num_samples\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "multiply" + argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "name_scope" + argspec: "args=[\'name\', \'default_name\', \'values\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "negative" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "no_op" + argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "no_regularizer" + argspec: "args=[\'_\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "norm" + argspec: "args=[\'tensor\', \'ord\', \'axis\', \'keep_dims\', \'name\'], varargs=None, keywords=None, defaults=[\'euclidean\', \'None\', \'False\', \'None\'], " + } + member_method { + name: "not_equal" + argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "one_hot" + argspec: "args=[\'indices\', \'depth\', \'on_value\', \'off_value\', \'axis\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "ones" + argspec: "args=[\'shape\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"\", \'None\'], " + } + member_method { + name: "ones_like" + argspec: "args=[\'tensor\', \'dtype\', \'name\', \'optimize\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], " + } + member_method { + name: "op_scope" + argspec: "args=[\'values\', \'name\', \'default_name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "pad" + argspec: "args=[\'tensor\', \'paddings\', \'mode\', \'name\'], varargs=None, keywords=None, defaults=[\'CONSTANT\', \'None\'], " + } + member_method { + name: "parallel_stack" + argspec: "args=[\'values\', \'name\'], varargs=None, keywords=None, defaults=[\'parallel_stack\'], " + } + member_method { + name: "parse_example" + argspec: "args=[\'serialized\', \'features\', \'name\', \'example_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "parse_single_example" + argspec: "args=[\'serialized\', 
\'features\', \'name\', \'example_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "parse_single_sequence_example" + argspec: "args=[\'serialized\', \'context_features\', \'sequence_features\', \'example_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "parse_tensor" + argspec: "args=[\'serialized\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "placeholder" + argspec: "args=[\'dtype\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "placeholder_with_default" + argspec: "args=[\'input\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "polygamma" + argspec: "args=[\'a\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "pow" + argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "py_func" + argspec: "args=[\'func\', \'inp\', \'Tout\', \'stateful\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], " + } + member_method { + name: "qr" + argspec: "args=[\'input\', \'full_matrices\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "quantize_v2" + argspec: "args=[\'input\', \'min_range\', \'max_range\', \'T\', \'mode\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "quantized_concat" + argspec: "args=[\'concat_dim\', \'values\', \'input_mins\', \'input_maxes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "random_crop" + argspec: "args=[\'value\', \'size\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "random_gamma" + argspec: "args=[\'shape\', \'alpha\', \'beta\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"\", \'None\', \'None\'], " + } + member_method { + name: "random_normal" + argspec: "args=[\'shape\', \'mean\', \'stddev\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \"\", \'None\', \'None\'], " + } + member_method { + name: "random_poisson" + argspec: "args=[\'lam\', \'shape\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\"\", \'None\', \'None\'], " + } + member_method { + name: "random_shuffle" + argspec: "args=[\'value\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "random_uniform" + argspec: "args=[\'shape\', \'minval\', \'maxval\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\', \"\", \'None\', \'None\'], " + } + member_method { + name: "range" + argspec: "args=[\'start\', \'limit\', \'delta\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'range\'], " + } + member_method { + name: "rank" + argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "read_file" + argspec: "args=[\'filename\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "real" + argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + 
member_method { + name: "realdiv" + argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "reciprocal" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "reduce_all" + argspec: "args=[\'input_tensor\', \'axis\', \'keep_dims\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'None\'], " + } + member_method { + name: "reduce_any" + argspec: "args=[\'input_tensor\', \'axis\', \'keep_dims\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'None\'], " + } + member_method { + name: "reduce_join" + argspec: "args=[\'inputs\', \'axis\', \'keep_dims\', \'separator\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'\', \'None\', \'None\'], " + } + member_method { + name: "reduce_logsumexp" + argspec: "args=[\'input_tensor\', \'axis\', \'keep_dims\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'None\'], " + } + member_method { + name: "reduce_max" + argspec: "args=[\'input_tensor\', \'axis\', \'keep_dims\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'None\'], " + } + member_method { + name: "reduce_mean" + argspec: "args=[\'input_tensor\', \'axis\', \'keep_dims\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'None\'], " + } + member_method { + name: "reduce_min" + argspec: "args=[\'input_tensor\', \'axis\', \'keep_dims\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'None\'], " + } + member_method { + name: "reduce_prod" + argspec: "args=[\'input_tensor\', \'axis\', \'keep_dims\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'None\'], " + } + member_method { + name: "reduce_sum" + argspec: "args=[\'input_tensor\', \'axis\', \'keep_dims\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'None\'], " + } + member_method { + name: "register_tensor_conversion_function" + argspec: "args=[\'base_type\', \'conversion_func\', \'priority\'], varargs=None, keywords=None, defaults=[\'100\'], " + } + member_method { + name: "report_uninitialized_variables" + argspec: "args=[\'var_list\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'report_uninitialized_variables\'], " + } + member_method { + name: "required_space_to_batch_paddings" + argspec: "args=[\'input_shape\', \'block_shape\', \'base_paddings\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "reset_default_graph" + argspec: "args=[], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "reshape" + argspec: "args=[\'tensor\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "reverse" + argspec: "args=[\'tensor\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "reverse_sequence" + argspec: "args=[\'input\', \'seq_lengths\', \'seq_axis\', \'batch_axis\', \'name\', \'seq_dim\', \'batch_dim\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: 
"reverse_v2" + argspec: "args=[\'tensor\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "rint" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "round" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "rsqrt" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "saturate_cast" + argspec: "args=[\'value\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "scalar_mul" + argspec: "args=[\'scalar\', \'x\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "scan" + argspec: "args=[\'fn\', \'elems\', \'initializer\', \'parallel_iterations\', \'back_prop\', \'swap_memory\', \'infer_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'True\', \'False\', \'True\', \'None\'], " + } + member_method { + name: "scatter_add" + argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "scatter_div" + argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "scatter_mul" + argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "scatter_nd" + argspec: "args=[\'indices\', \'updates\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "scatter_nd_add" + argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "scatter_nd_sub" + argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "scatter_nd_update" + argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "scatter_sub" + argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "scatter_update" + argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "segment_max" + argspec: "args=[\'data\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "segment_mean" + argspec: "args=[\'data\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "segment_min" + argspec: "args=[\'data\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "segment_prod" + argspec: "args=[\'data\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "segment_sum" + argspec: "args=[\'data\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "self_adjoint_eig" + argspec: 
"args=[\'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "self_adjoint_eigvals" + argspec: "args=[\'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "sequence_mask" + argspec: "args=[\'lengths\', \'maxlen\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"\", \'None\'], " + } + member_method { + name: "serialize_many_sparse" + argspec: "args=[\'sp_input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "serialize_sparse" + argspec: "args=[\'sp_input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "set_random_seed" + argspec: "args=[\'seed\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "setdiff1d" + argspec: "args=[\'x\', \'y\', \'index_dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"\", \'None\'], " + } + member_method { + name: "shape" + argspec: "args=[\'input\', \'name\', \'out_type\'], varargs=None, keywords=None, defaults=[\'None\', \"\"], " + } + member_method { + name: "shape_n" + argspec: "args=[\'input\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "sigmoid" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "sign" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "sin" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "sinh" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "size" + argspec: "args=[\'input\', \'name\', \'out_type\'], varargs=None, keywords=None, defaults=[\'None\', \"\"], " + } + member_method { + name: "slice" + argspec: "args=[\'input_\', \'begin\', \'size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "space_to_batch" + argspec: "args=[\'input\', \'paddings\', \'block_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "space_to_batch_nd" + argspec: "args=[\'input\', \'block_shape\', \'paddings\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "space_to_depth" + argspec: "args=[\'input\', \'block_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "sparse_add" + argspec: "args=[\'a\', \'b\', \'thresh\'], varargs=None, keywords=None, defaults=[\'0\'], " + } + member_method { + name: "sparse_concat" + argspec: "args=[\'axis\', \'sp_inputs\', \'name\', \'expand_nonconcat_dim\', \'concat_dim\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], " + } + member_method { + name: "sparse_fill_empty_rows" + argspec: "args=[\'sp_input\', \'default_value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "sparse_mask" + argspec: "args=[\'a\', \'mask_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "sparse_matmul" + argspec: "args=[\'a\', \'b\', \'transpose_a\', \'transpose_b\', \'a_is_sparse\', \'b_is_sparse\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "sparse_maximum" 
+ argspec: "args=[\'sp_a\', \'sp_b\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "sparse_merge" + argspec: "args=[\'sp_ids\', \'sp_values\', \'vocab_size\', \'name\', \'already_sorted\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], " + } + member_method { + name: "sparse_minimum" + argspec: "args=[\'sp_a\', \'sp_b\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "sparse_placeholder" + argspec: "args=[\'dtype\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "sparse_reduce_sum" + argspec: "args=[\'sp_input\', \'axis\', \'keep_dims\', \'reduction_axes\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], " + } + member_method { + name: "sparse_reduce_sum_sparse" + argspec: "args=[\'sp_input\', \'axis\', \'keep_dims\', \'reduction_axes\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], " + } + member_method { + name: "sparse_reorder" + argspec: "args=[\'sp_input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "sparse_reset_shape" + argspec: "args=[\'sp_input\', \'new_shape\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "sparse_reshape" + argspec: "args=[\'sp_input\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "sparse_retain" + argspec: "args=[\'sp_input\', \'to_retain\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "sparse_segment_mean" + argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "sparse_segment_sqrt_n" + argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "sparse_segment_sum" + argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "sparse_softmax" + argspec: "args=[\'sp_input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "sparse_split" + argspec: "args=[\'keyword_required\', \'sp_input\', \'num_split\', \'axis\', \'name\', \'split_dim\'], varargs=None, keywords=None, defaults=[\'KeywordRequired()\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "sparse_tensor_dense_matmul" + argspec: "args=[\'sp_a\', \'b\', \'adjoint_a\', \'adjoint_b\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " + } + member_method { + name: "sparse_tensor_to_dense" + argspec: "args=[\'sp_input\', \'default_value\', \'validate_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'True\', \'None\'], " + } + member_method { + name: "sparse_to_dense" + argspec: "args=[\'sparse_indices\', \'output_shape\', \'sparse_values\', \'default_value\', \'validate_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'True\', \'None\'], " + } + member_method { + name: "sparse_to_indicator" + argspec: "args=[\'sp_input\', \'vocab_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "sparse_transpose" + argspec: "args=[\'sp_input\', \'perm\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: 
"split" + argspec: "args=[\'value\', \'num_or_size_splits\', \'axis\', \'num\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\', \'split\'], " + } + member_method { + name: "sqrt" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "square" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "squared_difference" + argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "squeeze" + argspec: "args=[\'input\', \'axis\', \'name\', \'squeeze_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + } + member_method { + name: "stack" + argspec: "args=[\'values\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'stack\'], " + } + member_method { + name: "stop_gradient" + argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "strided_slice" + argspec: "args=[\'input_\', \'begin\', \'end\', \'strides\', \'begin_mask\', \'end_mask\', \'ellipsis_mask\', \'new_axis_mask\', \'shrink_axis_mask\', \'var\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'0\', \'0\', \'0\', \'0\', \'None\', \'None\'], " + } + member_method { + name: "string_join" + argspec: "args=[\'inputs\', \'separator\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "string_split" + argspec: "args=[\'source\', \'delimiter\'], varargs=None, keywords=None, defaults=[\' \'], " + } + member_method { + name: "string_to_hash_bucket" + argspec: "args=[\'string_tensor\', \'num_buckets\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "string_to_hash_bucket_fast" + argspec: "args=[\'input\', \'num_buckets\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "string_to_hash_bucket_strong" + argspec: "args=[\'input\', \'num_buckets\', \'key\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "string_to_number" + argspec: "args=[\'string_tensor\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "substr" + argspec: "args=[\'input\', \'pos\', \'len\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "subtract" + argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "svd" + argspec: "args=[\'tensor\', \'full_matrices\', \'compute_uv\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'None\'], " + } + member_method { + name: "tables_initializer" + argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'init_all_tables\'], " + } + member_method { + name: "tan" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "tanh" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "tensordot" + argspec: "args=[\'a\', \'b\', \'axes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "tile" + argspec: "args=[\'input\', \'multiples\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "to_bfloat16" + argspec: 
"args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'ToBFloat16\'], " + } + member_method { + name: "to_double" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'ToDouble\'], " + } + member_method { + name: "to_float" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'ToFloat\'], " + } + member_method { + name: "to_int32" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'ToInt32\'], " + } + member_method { + name: "to_int64" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'ToInt64\'], " + } + member_method { + name: "trace" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "trainable_variables" + argspec: "args=[], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "transpose" + argspec: "args=[\'a\', \'perm\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'transpose\'], " + } + member_method { + name: "truediv" + argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "truncated_normal" + argspec: "args=[\'shape\', \'mean\', \'stddev\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \"\", \'None\', \'None\'], " + } + member_method { + name: "truncatediv" + argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "truncatemod" + argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "tuple" + argspec: "args=[\'tensors\', \'name\', \'control_inputs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "unique" + argspec: "args=[\'x\', \'out_idx\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "unique_with_counts" + argspec: "args=[\'x\', \'out_idx\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "unsorted_segment_max" + argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "unsorted_segment_sum" + argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "unstack" + argspec: "args=[\'value\', \'num\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'unstack\'], " + } + member_method { + name: "variable_axis_size_partitioner" + argspec: "args=[\'max_shard_bytes\', \'axis\', \'bytes_per_string_element\', \'max_shards\'], varargs=None, keywords=None, defaults=[\'0\', \'16\', \'None\'], " + } + member_method { + name: "variable_op_scope" + argspec: "args=[\'values\', \'name_or_scope\', \'default_name\', \'initializer\', \'regularizer\', \'caching_device\', \'partitioner\', \'custom_getter\', \'reuse\', \'dtype\', \'use_resource\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "variable_scope" + argspec: "args=[\'name_or_scope\', \'default_name\', \'values\', \'initializer\', \'regularizer\', \'caching_device\', \'partitioner\', \'custom_getter\', \'reuse\', \'dtype\', \'use_resource\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', 
\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "variables_initializer" + argspec: "args=[\'var_list\', \'name\'], varargs=None, keywords=None, defaults=[\'init\'], " + } + member_method { + name: "verify_tensor_all_finite" + argspec: "args=[\'t\', \'msg\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "where" + argspec: "args=[\'condition\', \'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + } + member_method { + name: "while_loop" + argspec: "args=[\'cond\', \'body\', \'loop_vars\', \'shape_invariants\', \'parallel_iterations\', \'back_prop\', \'swap_memory\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'True\', \'False\', \'None\'], " + } + member_method { + name: "write_file" + argspec: "args=[\'filename\', \'contents\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "zeros" + argspec: "args=[\'shape\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"\", \'None\'], " + } + member_method { + name: "zeros_like" + argspec: "args=[\'tensor\', \'dtype\', \'name\', \'optimize\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], " + } + member_method { + name: "zeta" + argspec: "args=[\'x\', \'q\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.python_io.-t-f-record-compression-type.pbtxt b/tensorflow/tools/api/golden/tensorflow.python_io.-t-f-record-compression-type.pbtxt new file mode 100644 index 00000000000..4941dda50e4 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.python_io.-t-f-record-compression-type.pbtxt @@ -0,0 +1,20 @@ +path: "tensorflow.python_io.TFRecordCompressionType" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "GZIP" + mtype: "" + } + member { + name: "NONE" + mtype: "" + } + member { + name: "ZLIB" + mtype: "" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.python_io.-t-f-record-options.pbtxt b/tensorflow/tools/api/golden/tensorflow.python_io.-t-f-record-options.pbtxt new file mode 100644 index 00000000000..0853716023a --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.python_io.-t-f-record-options.pbtxt @@ -0,0 +1,17 @@ +path: "tensorflow.python_io.TFRecordOptions" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "compression_type_map" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'compression_type\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_compression_type_string" + argspec: "args=[\'cls\', \'options\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.python_io.-t-f-record-writer.pbtxt b/tensorflow/tools/api/golden/tensorflow.python_io.-t-f-record-writer.pbtxt new file mode 100644 index 00000000000..af0c11ca14d --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.python_io.-t-f-record-writer.pbtxt @@ -0,0 +1,17 @@ +path: "tensorflow.python_io.TFRecordWriter" +tf_class { + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'path\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "close" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "write" + 
argspec: "args=[\'self\', \'record\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.python_io.pbtxt b/tensorflow/tools/api/golden/tensorflow.python_io.pbtxt new file mode 100644 index 00000000000..7c9953e5fe3 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.python_io.pbtxt @@ -0,0 +1,19 @@ +path: "tensorflow.python_io" +tf_module { + member { + name: "TFRecordCompressionType" + mtype: "" + } + member { + name: "TFRecordOptions" + mtype: "" + } + member { + name: "TFRecordWriter" + mtype: "" + } + member_method { + name: "tf_record_iterator" + argspec: "args=[\'path\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.random_normal_initializer.pbtxt b/tensorflow/tools/api/golden/tensorflow.random_normal_initializer.pbtxt new file mode 100644 index 00000000000..5993fdeb9c2 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.random_normal_initializer.pbtxt @@ -0,0 +1,18 @@ +path: "tensorflow.random_normal_initializer" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'mean\', \'stddev\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \'None\', \"\"], " + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.random_uniform_initializer.pbtxt b/tensorflow/tools/api/golden/tensorflow.random_uniform_initializer.pbtxt new file mode 100644 index 00000000000..a434ed1599e --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.random_uniform_initializer.pbtxt @@ -0,0 +1,18 @@ +path: "tensorflow.random_uniform_initializer" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'minval\', \'maxval\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0\', \'None\', \'None\', \"\"], " + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.resource_loader.pbtxt b/tensorflow/tools/api/golden/tensorflow.resource_loader.pbtxt new file mode 100644 index 00000000000..288b78b4cd0 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.resource_loader.pbtxt @@ -0,0 +1,23 @@ +path: "tensorflow.resource_loader" +tf_module { + member_method { + name: "get_data_files_path" + argspec: "args=[], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_path_to_datafile" + argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_root_dir_with_all_resources" + argspec: "args=[], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "load_resource" + argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "readahead_file_path" + argspec: "args=[\'path\', \'readahead\'], varargs=None, keywords=None, defaults=[\'128M\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.builder.-saved-model-builder.pbtxt 
b/tensorflow/tools/api/golden/tensorflow.saved_model.builder.-saved-model-builder.pbtxt new file mode 100644 index 00000000000..56d76902fd0 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.saved_model.builder.-saved-model-builder.pbtxt @@ -0,0 +1,21 @@ +path: "tensorflow.saved_model.builder.SavedModelBuilder" +tf_class { + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'export_dir\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "add_meta_graph" + argspec: "args=[\'self\', \'tags\', \'signature_def_map\', \'assets_collection\', \'legacy_init_op\', \'clear_devices\', \'main_op\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'None\'], " + } + member_method { + name: "add_meta_graph_and_variables" + argspec: "args=[\'self\', \'sess\', \'tags\', \'signature_def_map\', \'assets_collection\', \'legacy_init_op\', \'clear_devices\', \'main_op\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'None\'], " + } + member_method { + name: "save" + argspec: "args=[\'self\', \'as_text\'], varargs=None, keywords=None, defaults=[\'False\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.builder.pbtxt b/tensorflow/tools/api/golden/tensorflow.saved_model.builder.pbtxt new file mode 100644 index 00000000000..adc697ad1c0 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.saved_model.builder.pbtxt @@ -0,0 +1,7 @@ +path: "tensorflow.saved_model.builder" +tf_module { + member { + name: "SavedModelBuilder" + mtype: "" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.constants.pbtxt b/tensorflow/tools/api/golden/tensorflow.saved_model.constants.pbtxt new file mode 100644 index 00000000000..20e10aa094f --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.saved_model.constants.pbtxt @@ -0,0 +1,39 @@ +path: "tensorflow.saved_model.constants" +tf_module { + member { + name: "ASSETS_DIRECTORY" + mtype: "" + } + member { + name: "ASSETS_KEY" + mtype: "" + } + member { + name: "LEGACY_INIT_OP_KEY" + mtype: "" + } + member { + name: "MAIN_OP_KEY" + mtype: "" + } + member { + name: "SAVED_MODEL_FILENAME_PB" + mtype: "" + } + member { + name: "SAVED_MODEL_FILENAME_PBTXT" + mtype: "" + } + member { + name: "SAVED_MODEL_SCHEMA_VERSION" + mtype: "" + } + member { + name: "VARIABLES_DIRECTORY" + mtype: "" + } + member { + name: "VARIABLES_FILENAME" + mtype: "" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.loader.pbtxt b/tensorflow/tools/api/golden/tensorflow.saved_model.loader.pbtxt new file mode 100644 index 00000000000..896e2160c69 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.saved_model.loader.pbtxt @@ -0,0 +1,11 @@ +path: "tensorflow.saved_model.loader" +tf_module { + member_method { + name: "load" + argspec: "args=[\'sess\', \'tags\', \'export_dir\'], varargs=None, keywords=saver_kwargs, defaults=None" + } + member_method { + name: "maybe_saved_model_directory" + argspec: "args=[\'export_dir\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.main_op.pbtxt b/tensorflow/tools/api/golden/tensorflow.saved_model.main_op.pbtxt new file mode 100644 index 00000000000..176cb788c24 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.saved_model.main_op.pbtxt @@ -0,0 +1,11 @@ +path: "tensorflow.saved_model.main_op" +tf_module { + member_method { + name: "main_op" + argspec: "args=[], varargs=None, 
keywords=None, defaults=None" + } + member_method { + name: "main_op_with_restore" + argspec: "args=[\'restore_op_name\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.pbtxt b/tensorflow/tools/api/golden/tensorflow.saved_model.pbtxt new file mode 100644 index 00000000000..5683766b289 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.saved_model.pbtxt @@ -0,0 +1,35 @@ +path: "tensorflow.saved_model" +tf_module { + member { + name: "builder" + mtype: "" + } + member { + name: "constants" + mtype: "" + } + member { + name: "loader" + mtype: "" + } + member { + name: "main_op" + mtype: "" + } + member { + name: "signature_constants" + mtype: "" + } + member { + name: "signature_def_utils" + mtype: "" + } + member { + name: "tag_constants" + mtype: "" + } + member { + name: "utils" + mtype: "" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.signature_constants.pbtxt b/tensorflow/tools/api/golden/tensorflow.saved_model.signature_constants.pbtxt new file mode 100644 index 00000000000..478d410e066 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.saved_model.signature_constants.pbtxt @@ -0,0 +1,47 @@ +path: "tensorflow.saved_model.signature_constants" +tf_module { + member { + name: "CLASSIFY_INPUTS" + mtype: "" + } + member { + name: "CLASSIFY_METHOD_NAME" + mtype: "" + } + member { + name: "CLASSIFY_OUTPUT_CLASSES" + mtype: "" + } + member { + name: "CLASSIFY_OUTPUT_SCORES" + mtype: "" + } + member { + name: "DEFAULT_SERVING_SIGNATURE_DEF_KEY" + mtype: "" + } + member { + name: "PREDICT_INPUTS" + mtype: "" + } + member { + name: "PREDICT_METHOD_NAME" + mtype: "" + } + member { + name: "PREDICT_OUTPUTS" + mtype: "" + } + member { + name: "REGRESS_INPUTS" + mtype: "" + } + member { + name: "REGRESS_METHOD_NAME" + mtype: "" + } + member { + name: "REGRESS_OUTPUTS" + mtype: "" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.signature_def_utils.pbtxt b/tensorflow/tools/api/golden/tensorflow.saved_model.signature_def_utils.pbtxt new file mode 100644 index 00000000000..e9867d84c3e --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.saved_model.signature_def_utils.pbtxt @@ -0,0 +1,19 @@ +path: "tensorflow.saved_model.signature_def_utils" +tf_module { + member_method { + name: "build_signature_def" + argspec: "args=[\'inputs\', \'outputs\', \'method_name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + } + member_method { + name: "classification_signature_def" + argspec: "args=[\'examples\', \'classes\', \'scores\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "predict_signature_def" + argspec: "args=[\'inputs\', \'outputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "regression_signature_def" + argspec: "args=[\'examples\', \'predictions\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.tag_constants.pbtxt b/tensorflow/tools/api/golden/tensorflow.saved_model.tag_constants.pbtxt new file mode 100644 index 00000000000..7c24b7ad3cf --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.saved_model.tag_constants.pbtxt @@ -0,0 +1,11 @@ +path: "tensorflow.saved_model.tag_constants" +tf_module { + member { + name: "SERVING" + mtype: "" + } + member { + name: "TRAINING" + mtype: "" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.utils.pbtxt 
b/tensorflow/tools/api/golden/tensorflow.saved_model.utils.pbtxt new file mode 100644 index 00000000000..bc150e56a36 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.saved_model.utils.pbtxt @@ -0,0 +1,7 @@ +path: "tensorflow.saved_model.utils" +tf_module { + member_method { + name: "build_tensor_info" + argspec: "args=[\'tensor\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.sets.pbtxt b/tensorflow/tools/api/golden/tensorflow.sets.pbtxt new file mode 100644 index 00000000000..8a196b1a556 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.sets.pbtxt @@ -0,0 +1,19 @@ +path: "tensorflow.sets" +tf_module { + member_method { + name: "set_difference" + argspec: "args=[\'a\', \'b\', \'aminusb\', \'validate_indices\'], varargs=None, keywords=None, defaults=[\'True\', \'True\'], " + } + member_method { + name: "set_intersection" + argspec: "args=[\'a\', \'b\', \'validate_indices\'], varargs=None, keywords=None, defaults=[\'True\'], " + } + member_method { + name: "set_size" + argspec: "args=[\'a\', \'validate_indices\'], varargs=None, keywords=None, defaults=[\'True\'], " + } + member_method { + name: "set_union" + argspec: "args=[\'a\', \'b\', \'validate_indices\'], varargs=None, keywords=None, defaults=[\'True\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.spectral.pbtxt b/tensorflow/tools/api/golden/tensorflow.spectral.pbtxt new file mode 100644 index 00000000000..84883c1a395 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.spectral.pbtxt @@ -0,0 +1,51 @@ +path: "tensorflow.spectral" +tf_module { + member_method { + name: "fft" + argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "fft2d" + argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "fft3d" + argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "ifft" + argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "ifft2d" + argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "ifft3d" + argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "irfft" + argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "irfft2d" + argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "irfft3d" + argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "rfft" + argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "rfft2d" + argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "rfft3d" + argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-event.pbtxt b/tensorflow/tools/api/golden/tensorflow.summary.-event.pbtxt new file mode 100644 index 
00000000000..ab3449d80f6 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.summary.-event.pbtxt @@ -0,0 +1,112 @@ +path: "tensorflow.summary.Event" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "FILE_VERSION_FIELD_NUMBER" + mtype: "" + } + member { + name: "GRAPH_DEF_FIELD_NUMBER" + mtype: "" + } + member { + name: "LOG_MESSAGE_FIELD_NUMBER" + mtype: "" + } + member { + name: "META_GRAPH_DEF_FIELD_NUMBER" + mtype: "" + } + member { + name: "SESSION_LOG_FIELD_NUMBER" + mtype: "" + } + member { + name: "STEP_FIELD_NUMBER" + mtype: "" + } + member { + name: "SUMMARY_FIELD_NUMBER" + mtype: "" + } + member { + name: "TAGGED_RUN_METADATA_FIELD_NUMBER" + mtype: "" + } + member { + name: "WALL_TIME_FIELD_NUMBER" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-file-writer-cache.pbtxt b/tensorflow/tools/api/golden/tensorflow.summary.-file-writer-cache.pbtxt new file mode 100644 index 00000000000..2a5b63dceae --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.summary.-file-writer-cache.pbtxt @@ -0,0 +1,16 @@ +path: "tensorflow.summary.FileWriterCache" +tf_class { + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + } + member_method { + name: "clear" + argspec: "args=[], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get" + argspec: "args=[\'logdir\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-file-writer.pbtxt b/tensorflow/tools/api/golden/tensorflow.summary.-file-writer.pbtxt new file mode 100644 index 00000000000..dcf747971b7 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.summary.-file-writer.pbtxt @@ -0,0 +1,50 @@ +path: "tensorflow.summary.FileWriter" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'logdir\', \'graph\', \'max_queue\', \'flush_secs\', \'graph_def\', \'filename_suffix\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'120\', \'None\', \'None\'], " + } + member_method { + name: "add_event" + argspec: "args=[\'self\', \'event\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "add_graph" + argspec: "args=[\'self\', \'graph\', \'global_step\', \'graph_def\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "add_meta_graph" + argspec: "args=[\'self\', \'meta_graph_def\', 
\'global_step\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_run_metadata" + argspec: "args=[\'self\', \'run_metadata\', \'tag\', \'global_step\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_session_log" + argspec: "args=[\'self\', \'session_log\', \'global_step\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_summary" + argspec: "args=[\'self\', \'summary\', \'global_step\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "close" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "flush" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_logdir" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "reopen" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-session-log.pbtxt b/tensorflow/tools/api/golden/tensorflow.summary.-session-log.pbtxt new file mode 100644 index 00000000000..92ca4872caf --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.summary.-session-log.pbtxt @@ -0,0 +1,108 @@ +path: "tensorflow.summary.SessionLog" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "CHECKPOINT" + mtype: "" + } + member { + name: "CHECKPOINT_PATH_FIELD_NUMBER" + mtype: "" + } + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "MSG_FIELD_NUMBER" + mtype: "" + } + member { + name: "START" + mtype: "" + } + member { + name: "STATUS_FIELD_NUMBER" + mtype: "" + } + member { + name: "STATUS_UNSPECIFIED" + mtype: "" + } + member { + name: "STOP" + mtype: "" + } + member { + name: "SessionStatus" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-summary-description.pbtxt b/tensorflow/tools/api/golden/tensorflow.summary.-summary-description.pbtxt new file mode 100644 index 00000000000..f93da2196ad --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.summary.-summary-description.pbtxt @@ -0,0 +1,80 @@ +path: "tensorflow.summary.SummaryDescription" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "TYPE_HINT_FIELD_NUMBER" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + 
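
The `tensorflow.summary.FileWriter` golden above pins the TF 1.x constructor surface (`logdir`, `graph`, `max_queue=10`, `flush_secs=120`, `graph_def`, `filename_suffix`) and the writer methods. A minimal usage sketch against that surface — the log directory and the scalar tag are hypothetical:

```python
import tensorflow as tf  # TF 1.x assumed, per the golden files in this change

x = tf.constant(3.0)
tf.summary.scalar("x", x)          # argspec: name, tensor, collections, family
merged = tf.summary.merge_all()    # default collection key: 'summaries'

with tf.Session() as sess:
    # Defaults per the golden file: max_queue=10, flush_secs=120.
    writer = tf.summary.FileWriter("/tmp/logdir", graph=sess.graph)
    writer.add_summary(sess.run(merged), global_step=0)
    writer.flush()
    writer.close()
```
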
member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-summary.-audio.pbtxt b/tensorflow/tools/api/golden/tensorflow.summary.-summary.-audio.pbtxt new file mode 100644 index 00000000000..605e305e82c --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.summary.-summary.-audio.pbtxt @@ -0,0 +1,96 @@ +path: "tensorflow.summary.Summary.Audio" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "CONTENT_TYPE_FIELD_NUMBER" + mtype: "" + } + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "ENCODED_AUDIO_STRING_FIELD_NUMBER" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "LENGTH_FRAMES_FIELD_NUMBER" + mtype: "" + } + member { + name: "NUM_CHANNELS_FIELD_NUMBER" + mtype: "" + } + member { + name: "SAMPLE_RATE_FIELD_NUMBER" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-summary.-image.pbtxt b/tensorflow/tools/api/golden/tensorflow.summary.-summary.-image.pbtxt new file mode 100644 index 00000000000..0646972196d --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.summary.-summary.-image.pbtxt @@ -0,0 +1,92 @@ +path: "tensorflow.summary.Summary.Image" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "COLORSPACE_FIELD_NUMBER" + mtype: "" + } + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "ENCODED_IMAGE_STRING_FIELD_NUMBER" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "HEIGHT_FIELD_NUMBER" + mtype: "" + } + member { + name: "WIDTH_FIELD_NUMBER" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + 
name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-summary.-value.pbtxt b/tensorflow/tools/api/golden/tensorflow.summary.-summary.-value.pbtxt new file mode 100644 index 00000000000..b319cd03d9e --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.summary.-summary.-value.pbtxt @@ -0,0 +1,112 @@ +path: "tensorflow.summary.Summary.Value" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "AUDIO_FIELD_NUMBER" + mtype: "" + } + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "HISTO_FIELD_NUMBER" + mtype: "" + } + member { + name: "IMAGE_FIELD_NUMBER" + mtype: "" + } + member { + name: "METADATA_FIELD_NUMBER" + mtype: "" + } + member { + name: "NODE_NAME_FIELD_NUMBER" + mtype: "" + } + member { + name: "OBSOLETE_OLD_STYLE_HISTOGRAM_FIELD_NUMBER" + mtype: "" + } + member { + name: "SIMPLE_VALUE_FIELD_NUMBER" + mtype: "" + } + member { + name: "TAG_FIELD_NUMBER" + mtype: "" + } + member { + name: "TENSOR_FIELD_NUMBER" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-summary.pbtxt b/tensorflow/tools/api/golden/tensorflow.summary.-summary.pbtxt new file mode 100644 index 00000000000..132ef1b7d2e --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.summary.-summary.pbtxt @@ -0,0 +1,92 @@ +path: "tensorflow.summary.Summary" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "Audio" + mtype: "" + } + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "Image" + mtype: "" + } + member { + name: "VALUE_FIELD_NUMBER" + mtype: "" + } + member { + name: "Value" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } 
+ member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-tagged-run-metadata.pbtxt b/tensorflow/tools/api/golden/tensorflow.summary.-tagged-run-metadata.pbtxt new file mode 100644 index 00000000000..4dce20819de --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.summary.-tagged-run-metadata.pbtxt @@ -0,0 +1,84 @@ +path: "tensorflow.summary.TaggedRunMetadata" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "RUN_METADATA_FIELD_NUMBER" + mtype: "" + } + member { + name: "TAG_FIELD_NUMBER" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.summary.pbtxt b/tensorflow/tools/api/golden/tensorflow.summary.pbtxt new file mode 100644 index 00000000000..19d822e61bf --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.summary.pbtxt @@ -0,0 +1,67 @@ +path: "tensorflow.summary" +tf_module { + member { + name: "Event" + mtype: "" + } + member { + name: "FileWriter" + mtype: "" + } + member { + name: "FileWriterCache" + mtype: "" + } + member { + name: "SessionLog" + mtype: "" + } + member { + name: "Summary" + mtype: "" + } + member { + name: "SummaryDescription" + mtype: "" + } + member { + name: "TaggedRunMetadata" + mtype: "" + } + member_method { + name: "audio" + argspec: "args=[\'name\', \'tensor\', \'sample_rate\', \'max_outputs\', \'collections\', \'family\'], varargs=None, keywords=None, defaults=[\'3\', \'None\', \'None\'], " + } + member_method { + name: "get_summary_description" + argspec: "args=[\'node_def\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "histogram" + argspec: "args=[\'name\', 
\'values\', \'collections\', \'family\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "image" + argspec: "args=[\'name\', \'tensor\', \'max_outputs\', \'collections\', \'family\'], varargs=None, keywords=None, defaults=[\'3\', \'None\', \'None\'], " + } + member_method { + name: "merge" + argspec: "args=[\'inputs\', \'collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "merge_all" + argspec: "args=[\'key\'], varargs=None, keywords=None, defaults=[\'summaries\'], " + } + member_method { + name: "scalar" + argspec: "args=[\'name\', \'tensor\', \'collections\', \'family\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "tensor_summary" + argspec: "args=[\'name\', \'tensor\', \'summary_description\', \'collections\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "text" + argspec: "args=[\'name\', \'tensor\', \'collections\'], varargs=None, keywords=None, defaults=[\'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.sysconfig.pbtxt b/tensorflow/tools/api/golden/tensorflow.sysconfig.pbtxt new file mode 100644 index 00000000000..02dec04b9cc --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.sysconfig.pbtxt @@ -0,0 +1,11 @@ +path: "tensorflow.sysconfig" +tf_module { + member_method { + name: "get_include" + argspec: "args=[], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_lib" + argspec: "args=[], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.test.-benchmark.pbtxt b/tensorflow/tools/api/golden/tensorflow.test.-benchmark.pbtxt new file mode 100644 index 00000000000..df528e26b60 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.test.-benchmark.pbtxt @@ -0,0 +1,21 @@ +path: "tensorflow.test.Benchmark" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + } + member_method { + name: "is_abstract" + argspec: "args=[\'cls\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "report_benchmark" + argspec: "args=[\'self\', \'iters\', \'cpu_time\', \'wall_time\', \'throughput\', \'extras\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "run_op_benchmark" + argspec: "args=[\'self\', \'sess\', \'op_or_tensor\', \'feed_dict\', \'burn_iters\', \'min_iters\', \'store_trace\', \'store_memory_usage\', \'name\', \'extras\', \'mbs\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'10\', \'False\', \'True\', \'None\', \'None\', \'0\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.test.-stub-out-for-testing.pbtxt b/tensorflow/tools/api/golden/tensorflow.test.-stub-out-for-testing.pbtxt new file mode 100644 index 00000000000..e02a0c6097c --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.test.-stub-out-for-testing.pbtxt @@ -0,0 +1,28 @@ +path: "tensorflow.test.StubOutForTesting" +tf_class { + is_instance: "" + member_method { + name: "CleanUp" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "Set" + argspec: "args=[\'self\', \'parent\', \'child_name\', \'new_child\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "SmartSet" + argspec: "args=[\'self\', \'obj\', \'attr_name\', \'new_attr\'], varargs=None, 
keywords=None, defaults=None" + } + member_method { + name: "SmartUnsetAll" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "UnsetAll" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "__init__" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.test.pbtxt b/tensorflow/tools/api/golden/tensorflow.test.pbtxt new file mode 100644 index 00000000000..2a88f26ed02 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.test.pbtxt @@ -0,0 +1,59 @@ +path: "tensorflow.test" +tf_module { + member { + name: "Benchmark" + mtype: "" + } + member { + name: "StubOutForTesting" + mtype: "" + } + member { + name: "TestCase" + mtype: "" + } + member { + name: "mock" + mtype: "" + } + member_method { + name: "assert_equal_graph_def" + argspec: "args=[\'actual\', \'expected\', \'checkpoint_v2\'], varargs=None, keywords=None, defaults=[\'False\'], " + } + member_method { + name: "compute_gradient" + argspec: "args=[\'x\', \'x_shape\', \'y\', \'y_shape\', \'x_init_value\', \'delta\', \'init_targets\', \'extra_feed_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'0.001\', \'None\', \'None\'], " + } + member_method { + name: "compute_gradient_error" + argspec: "args=[\'x\', \'x_shape\', \'y\', \'y_shape\', \'x_init_value\', \'delta\', \'init_targets\', \'extra_feed_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'0.001\', \'None\', \'None\'], " + } + member_method { + name: "create_local_cluster" + argspec: "args=[\'num_workers\', \'num_ps\', \'protocol\'], varargs=None, keywords=None, defaults=[\'grpc\'], " + } + member_method { + name: "get_temp_dir" + argspec: "args=[], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "gpu_device_name" + argspec: "args=[], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "is_built_with_cuda" + argspec: "args=[], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "is_gpu_available" + argspec: "args=[\'cuda_only\'], varargs=None, keywords=None, defaults=[\'False\'], " + } + member_method { + name: "main" + argspec: "args=[\'argv\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "test_src_dir_path" + argspec: "args=[\'relative_path\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-adadelta-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-adadelta-optimizer.pbtxt new file mode 100644 index 00000000000..8c91c5b4d9e --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-adadelta-optimizer.pbtxt @@ -0,0 +1,46 @@ +path: "tensorflow.train.AdadeltaOptimizer" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "GATE_GRAPH" + mtype: "" + } + member { + name: "GATE_NONE" + mtype: "" + } + member { + name: "GATE_OP" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'learning_rate\', \'rho\', \'epsilon\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'0.001\', \'0.95\', \'1e-08\', \'False\', \'Adadelta\'], " + } + member_method { + name: "apply_gradients" + argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "compute_gradients" + argspec: "args=[\'self\', \'loss\', \'var_list\', 
\'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], " + } + member_method { + name: "get_name" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_slot" + argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_slot_names" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "minimize" + argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-adagrad-d-a-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-adagrad-d-a-optimizer.pbtxt new file mode 100644 index 00000000000..05d38d62ccd --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-adagrad-d-a-optimizer.pbtxt @@ -0,0 +1,46 @@ +path: "tensorflow.train.AdagradDAOptimizer" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "GATE_GRAPH" + mtype: "" + } + member { + name: "GATE_NONE" + mtype: "" + } + member { + name: "GATE_OP" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'learning_rate\', \'global_step\', \'initial_gradient_squared_accumulator_value\', \'l1_regularization_strength\', \'l2_regularization_strength\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'0.1\', \'0.0\', \'0.0\', \'False\', \'AdagradDA\'], " + } + member_method { + name: "apply_gradients" + argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "compute_gradients" + argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], " + } + member_method { + name: "get_name" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_slot" + argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_slot_names" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "minimize" + argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-adagrad-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-adagrad-optimizer.pbtxt new file mode 100644 index 00000000000..19ca9f57637 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-adagrad-optimizer.pbtxt @@ -0,0 +1,46 @@ +path: "tensorflow.train.AdagradOptimizer" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "GATE_GRAPH" + mtype: "" + } + member { + name: "GATE_NONE" + mtype: "" + } + member { + name: "GATE_OP" + mtype: "" + } + member_method { + name: 
"__init__" + argspec: "args=[\'self\', \'learning_rate\', \'initial_accumulator_value\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'0.1\', \'False\', \'Adagrad\'], " + } + member_method { + name: "apply_gradients" + argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "compute_gradients" + argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], " + } + member_method { + name: "get_name" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_slot" + argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_slot_names" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "minimize" + argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-adam-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-adam-optimizer.pbtxt new file mode 100644 index 00000000000..c8144e2db78 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-adam-optimizer.pbtxt @@ -0,0 +1,46 @@ +path: "tensorflow.train.AdamOptimizer" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "GATE_GRAPH" + mtype: "" + } + member { + name: "GATE_NONE" + mtype: "" + } + member { + name: "GATE_OP" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'learning_rate\', \'beta1\', \'beta2\', \'epsilon\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-08\', \'False\', \'Adam\'], " + } + member_method { + name: "apply_gradients" + argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "compute_gradients" + argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], " + } + member_method { + name: "get_name" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_slot" + argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_slot_names" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "minimize" + argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-bytes-list.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-bytes-list.pbtxt new file mode 100644 index 00000000000..8cf52b817f3 --- /dev/null +++ 
b/tensorflow/tools/api/golden/tensorflow.train.-bytes-list.pbtxt @@ -0,0 +1,80 @@ +path: "tensorflow.train.BytesList" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "VALUE_FIELD_NUMBER" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-checkpoint-saver-hook.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-checkpoint-saver-hook.pbtxt new file mode 100644 index 00000000000..c3037baa8c9 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-checkpoint-saver-hook.pbtxt @@ -0,0 +1,30 @@ +path: "tensorflow.train.CheckpointSaverHook" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'checkpoint_dir\', \'save_secs\', \'save_steps\', \'saver\', \'checkpoint_basename\', \'scaffold\', \'listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'model.ckpt\', \'None\', \'None\'], " + } + member_method { + name: "after_create_session" + argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "after_run" + argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "before_run" + argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "begin" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "end" + argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-checkpoint-saver-listener.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-checkpoint-saver-listener.pbtxt new file mode 100644 index 00000000000..9d3688e5657 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-checkpoint-saver-listener.pbtxt @@ -0,0 +1,24 @@ +path: "tensorflow.train.CheckpointSaverListener" +tf_class { + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + } + member_method { + name: "after_save" + argspec: "args=[\'self\', \'session\', \'global_step_value\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "before_save" + argspec: "args=[\'self\', \'session\', \'global_step_value\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "begin" + argspec: "args=[\'self\'], varargs=None, keywords=None, 
defaults=None" + } + member_method { + name: "end" + argspec: "args=[\'self\', \'session\', \'global_step_value\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-chief-session-creator.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-chief-session-creator.pbtxt new file mode 100644 index 00000000000..abbe273be32 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-chief-session-creator.pbtxt @@ -0,0 +1,14 @@ +path: "tensorflow.train.ChiefSessionCreator" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'scaffold\', \'master\', \'config\', \'checkpoint_dir\', \'checkpoint_filename_with_path\'], varargs=None, keywords=None, defaults=[\'None\', \'\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "create_session" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-cluster-def.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-cluster-def.pbtxt new file mode 100644 index 00000000000..93ff856b09d --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-cluster-def.pbtxt @@ -0,0 +1,80 @@ +path: "tensorflow.train.ClusterDef" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "JOB_FIELD_NUMBER" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-cluster-spec.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-cluster-spec.pbtxt new file mode 100644 index 00000000000..1658b15a5f8 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-cluster-spec.pbtxt @@ -0,0 +1,37 @@ +path: "tensorflow.train.ClusterSpec" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "jobs" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'cluster\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "as_cluster_def" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "as_dict" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "job_tasks" + argspec: "args=[\'self\', \'job_name\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "num_tasks" + argspec: "args=[\'self\', \'job_name\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: 
"task_address" + argspec: "args=[\'self\', \'job_name\', \'task_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "task_indices" + argspec: "args=[\'self\', \'job_name\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-coordinator.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-coordinator.pbtxt new file mode 100644 index 00000000000..11277f077ee --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-coordinator.pbtxt @@ -0,0 +1,45 @@ +path: "tensorflow.train.Coordinator" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "joined" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'clean_stop_exception_types\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "clear_stop" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "join" + argspec: "args=[\'self\', \'threads\', \'stop_grace_period_secs\', \'ignore_live_threads\'], varargs=None, keywords=None, defaults=[\'None\', \'120\', \'False\'], " + } + member_method { + name: "raise_requested_exception" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "register_thread" + argspec: "args=[\'self\', \'thread\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "request_stop" + argspec: "args=[\'self\', \'ex\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "should_stop" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "stop_on_exception" + argspec: "args=[], varargs=args, keywords=kwds, defaults=None" + } + member_method { + name: "wait_for_stop" + argspec: "args=[\'self\', \'timeout\'], varargs=None, keywords=None, defaults=[\'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-example.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-example.pbtxt new file mode 100644 index 00000000000..f7215a20372 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-example.pbtxt @@ -0,0 +1,80 @@ +path: "tensorflow.train.Example" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "FEATURES_FIELD_NUMBER" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-exponential-moving-average.pbtxt 
b/tensorflow/tools/api/golden/tensorflow.train.-exponential-moving-average.pbtxt new file mode 100644 index 00000000000..737acbe07c9 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-exponential-moving-average.pbtxt @@ -0,0 +1,25 @@ +path: "tensorflow.train.ExponentialMovingAverage" +tf_class { + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'decay\', \'num_updates\', \'zero_debias\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'ExponentialMovingAverage\'], " + } + member_method { + name: "apply" + argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "average" + argspec: "args=[\'self\', \'var\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "average_name" + argspec: "args=[\'self\', \'var\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "variables_to_restore" + argspec: "args=[\'self\', \'moving_avg_variables\'], varargs=None, keywords=None, defaults=[\'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-feature-list.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-feature-list.pbtxt new file mode 100644 index 00000000000..3ad98354d69 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-feature-list.pbtxt @@ -0,0 +1,80 @@ +path: "tensorflow.train.FeatureList" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "FEATURE_FIELD_NUMBER" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-feature-lists.-feature-list-entry.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-feature-lists.-feature-list-entry.pbtxt new file mode 100644 index 00000000000..cd171f4ca3e --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-feature-lists.-feature-list-entry.pbtxt @@ -0,0 +1,84 @@ +path: "tensorflow.train.FeatureLists.FeatureListEntry" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "KEY_FIELD_NUMBER" + mtype: "" + } + member { + name: "VALUE_FIELD_NUMBER" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + 
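
The `tensorflow.train.Example` golden above, together with the `BytesList`/`FloatList`/`Int64List` wrappers and the `Feature`/`Features` messages pinned just below, covers the standard `tf.train.Example` assembly path. A minimal sketch, assuming the TF 1.x surface in this change (the feature keys are hypothetical):

```python
import tensorflow as tf  # TF 1.x assumed

# Keys ("label", "weight", "name") are made up for illustration.
example = tf.train.Example(features=tf.train.Features(feature={
    "label": tf.train.Feature(int64_list=tf.train.Int64List(value=[1])),
    "weight": tf.train.Feature(float_list=tf.train.FloatList(value=[0.5])),
    "name": tf.train.Feature(bytes_list=tf.train.BytesList(value=[b"sample"])),
}))
serialized = example.SerializeToString()
roundtrip = tf.train.Example.FromString(serialized)
```
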
member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-feature-lists.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-feature-lists.pbtxt new file mode 100644 index 00000000000..3d95017d584 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-feature-lists.pbtxt @@ -0,0 +1,84 @@ +path: "tensorflow.train.FeatureLists" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "FEATURE_LIST_FIELD_NUMBER" + mtype: "" + } + member { + name: "FeatureListEntry" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-feature.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-feature.pbtxt new file mode 100644 index 00000000000..9cca132bba9 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-feature.pbtxt @@ -0,0 +1,88 @@ +path: "tensorflow.train.Feature" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "BYTES_LIST_FIELD_NUMBER" + mtype: "" + } + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "FLOAT_LIST_FIELD_NUMBER" + mtype: "" + } + member { + name: "INT64_LIST_FIELD_NUMBER" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method 
{ + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-features.-feature-entry.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-features.-feature-entry.pbtxt new file mode 100644 index 00000000000..858aee03415 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-features.-feature-entry.pbtxt @@ -0,0 +1,84 @@ +path: "tensorflow.train.Features.FeatureEntry" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "KEY_FIELD_NUMBER" + mtype: "" + } + member { + name: "VALUE_FIELD_NUMBER" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-features.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-features.pbtxt new file mode 100644 index 00000000000..49cd12153bf --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-features.pbtxt @@ -0,0 +1,84 @@ +path: "tensorflow.train.Features" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "FEATURE_FIELD_NUMBER" + mtype: "" + } + member { + name: "FeatureEntry" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-feed-fn-hook.pbtxt 
b/tensorflow/tools/api/golden/tensorflow.train.-feed-fn-hook.pbtxt new file mode 100644 index 00000000000..7bec4d032ce --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-feed-fn-hook.pbtxt @@ -0,0 +1,30 @@ +path: "tensorflow.train.FeedFnHook" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'feed_fn\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "after_create_session" + argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "after_run" + argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "before_run" + argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "begin" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "end" + argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-final-ops-hook.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-final-ops-hook.pbtxt new file mode 100644 index 00000000000..31cf9aaeb2c --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-final-ops-hook.pbtxt @@ -0,0 +1,34 @@ +path: "tensorflow.train.FinalOpsHook" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "final_ops_values" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'final_ops\', \'final_ops_feed_dict\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "after_create_session" + argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "after_run" + argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "before_run" + argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "begin" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "end" + argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-float-list.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-float-list.pbtxt new file mode 100644 index 00000000000..e3f01334b54 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-float-list.pbtxt @@ -0,0 +1,80 @@ +path: "tensorflow.train.FloatList" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "VALUE_FIELD_NUMBER" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: 
"MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-ftrl-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-ftrl-optimizer.pbtxt new file mode 100644 index 00000000000..2dc11df57b6 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-ftrl-optimizer.pbtxt @@ -0,0 +1,46 @@ +path: "tensorflow.train.FtrlOptimizer" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "GATE_GRAPH" + mtype: "" + } + member { + name: "GATE_NONE" + mtype: "" + } + member { + name: "GATE_OP" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'learning_rate\', \'learning_rate_power\', \'initial_accumulator_value\', \'l1_regularization_strength\', \'l2_regularization_strength\', \'use_locking\', \'name\', \'accum_name\', \'linear_name\'], varargs=None, keywords=None, defaults=[\'-0.5\', \'0.1\', \'0.0\', \'0.0\', \'False\', \'Ftrl\', \'None\', \'None\'], " + } + member_method { + name: "apply_gradients" + argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "compute_gradients" + argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], " + } + member_method { + name: "get_name" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_slot" + argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_slot_names" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "minimize" + argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-global-step-waiter-hook.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-global-step-waiter-hook.pbtxt new file mode 100644 index 00000000000..147448618e2 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-global-step-waiter-hook.pbtxt @@ -0,0 +1,30 @@ +path: "tensorflow.train.GlobalStepWaiterHook" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'wait_until_step\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "after_create_session" + argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "after_run" + argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "before_run" + argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "begin" + argspec: "args=[\'self\'], varargs=None, 
keywords=None, defaults=None" + } + member_method { + name: "end" + argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-gradient-descent-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-gradient-descent-optimizer.pbtxt new file mode 100644 index 00000000000..bdd4c525685 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-gradient-descent-optimizer.pbtxt @@ -0,0 +1,46 @@ +path: "tensorflow.train.GradientDescentOptimizer" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "GATE_GRAPH" + mtype: "" + } + member { + name: "GATE_NONE" + mtype: "" + } + member { + name: "GATE_OP" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'learning_rate\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'GradientDescent\'], " + } + member_method { + name: "apply_gradients" + argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "compute_gradients" + argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], " + } + member_method { + name: "get_name" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_slot" + argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_slot_names" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "minimize" + argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-int64-list.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-int64-list.pbtxt new file mode 100644 index 00000000000..8917dc122cf --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-int64-list.pbtxt @@ -0,0 +1,80 @@ +path: "tensorflow.train.Int64List" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "VALUE_FIELD_NUMBER" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: 
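All the optimizer goldens in this batch (`FtrlOptimizer`, `GradientDescentOptimizer`, `MomentumOptimizer`, ...) share the base `Optimizer` surface: `compute_gradients`/`apply_gradients`, the `minimize` shorthand, and the slot accessors. A toy sketch against the `GradientDescentOptimizer` argspec above; the quadratic loss is invented for illustration:

```python
import tensorflow as tf

x = tf.Variable(3.0)
loss = tf.square(x - 1.0)  # toy loss, minimum at x == 1

# __init__(learning_rate, use_locking=False, name='GradientDescent'),
# as the golden records.
opt = tf.train.GradientDescentOptimizer(learning_rate=0.1)
train_op = opt.minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(100):
        sess.run(train_op)
    print(sess.run(x))  # converges toward 1.0
```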
"WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-job-def.-tasks-entry.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-job-def.-tasks-entry.pbtxt new file mode 100644 index 00000000000..ac6d81541a4 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-job-def.-tasks-entry.pbtxt @@ -0,0 +1,84 @@ +path: "tensorflow.train.JobDef.TasksEntry" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "KEY_FIELD_NUMBER" + mtype: "" + } + member { + name: "VALUE_FIELD_NUMBER" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-job-def.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-job-def.pbtxt new file mode 100644 index 00000000000..ce34537fa13 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-job-def.pbtxt @@ -0,0 +1,88 @@ +path: "tensorflow.train.JobDef" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "NAME_FIELD_NUMBER" + mtype: "" + } + member { + name: "TASKS_FIELD_NUMBER" + mtype: "" + } + member { + name: "TasksEntry" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-logging-tensor-hook.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-logging-tensor-hook.pbtxt new file mode 100644 index 00000000000..9801c05df18 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-logging-tensor-hook.pbtxt @@ -0,0 +1,30 @@ 
+path: "tensorflow.train.LoggingTensorHook" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'tensors\', \'every_n_iter\', \'every_n_secs\', \'at_end\', \'formatter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'None\'], " + } + member_method { + name: "after_create_session" + argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "after_run" + argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "before_run" + argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "begin" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "end" + argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-looper-thread.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-looper-thread.pbtxt new file mode 100644 index 00000000000..c61859004e8 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-looper-thread.pbtxt @@ -0,0 +1,73 @@ +path: "tensorflow.train.LooperThread" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "daemon" + mtype: "" + } + member { + name: "ident" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'coord\', \'timer_interval_secs\', \'target\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + } + member_method { + name: "getName" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "isAlive" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "isDaemon" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "is_alive" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "join" + argspec: "args=[\'self\', \'timeout\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "loop" + argspec: "args=[\'coord\', \'timer_interval_secs\', \'target\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "run" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "run_loop" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "setDaemon" + argspec: "args=[\'self\', \'daemonic\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "setName" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "start" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "start_loop" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "stop_loop" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-momentum-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-momentum-optimizer.pbtxt new file mode 100644 index 00000000000..7cf5488a15e --- /dev/null 
+++ b/tensorflow/tools/api/golden/tensorflow.train.-momentum-optimizer.pbtxt @@ -0,0 +1,46 @@ +path: "tensorflow.train.MomentumOptimizer" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "GATE_GRAPH" + mtype: "" + } + member { + name: "GATE_NONE" + mtype: "" + } + member { + name: "GATE_OP" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'learning_rate\', \'momentum\', \'use_locking\', \'name\', \'use_nesterov\'], varargs=None, keywords=None, defaults=[\'False\', \'Momentum\', \'False\'], " + } + member_method { + name: "apply_gradients" + argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "compute_gradients" + argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], " + } + member_method { + name: "get_name" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_slot" + argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_slot_names" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "minimize" + argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-monitored-session.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-monitored-session.pbtxt new file mode 100644 index 00000000000..3a5cc015b4d --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-monitored-session.pbtxt @@ -0,0 +1,26 @@ +path: "tensorflow.train.MonitoredSession" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "graph" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'session_creator\', \'hooks\', \'stop_grace_period_secs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'120\'], " + } + member_method { + name: "close" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "run" + argspec: "args=[\'self\', \'fetches\', \'feed_dict\', \'options\', \'run_metadata\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + } + member_method { + name: "should_stop" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-nan-loss-during-training-error.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-nan-loss-during-training-error.pbtxt new file mode 100644 index 00000000000..25fd5e75a79 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-nan-loss-during-training-error.pbtxt @@ -0,0 +1,16 @@ +path: "tensorflow.train.NanLossDuringTrainingError" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "args" + mtype: "" + } + member { + name: "message" + mtype: "" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-nan-tensor-hook.pbtxt 
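`MonitoredSession`'s golden pins a deliberately small run-loop surface: `run`, `should_stop`, `close`, plus the `graph` property. The idiomatic loop it implies, sketched with a hypothetical `train_op` and a `StopAtStepHook` (whose golden appears later in this diff):

```python
import tensorflow as tf

step = tf.train.get_or_create_global_step()
train_op = tf.assign_add(step, 1)  # hypothetical training step

# StopAtStepHook flips should_stop() once the global step reaches 100.
hooks = [tf.train.StopAtStepHook(last_step=100)]

with tf.train.MonitoredSession(hooks=hooks) as sess:
    while not sess.should_stop():
        sess.run(train_op)
```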
b/tensorflow/tools/api/golden/tensorflow.train.-nan-tensor-hook.pbtxt new file mode 100644 index 00000000000..7d1c89f9b37 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-nan-tensor-hook.pbtxt @@ -0,0 +1,30 @@ +path: "tensorflow.train.NanTensorHook" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'loss_tensor\', \'fail_on_nan_loss\'], varargs=None, keywords=None, defaults=[\'True\'], " + } + member_method { + name: "after_create_session" + argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "after_run" + argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "before_run" + argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "begin" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "end" + argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-optimizer.pbtxt new file mode 100644 index 00000000000..20b0c4d1b56 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-optimizer.pbtxt @@ -0,0 +1,45 @@ +path: "tensorflow.train.Optimizer" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "GATE_GRAPH" + mtype: "" + } + member { + name: "GATE_NONE" + mtype: "" + } + member { + name: "GATE_OP" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "apply_gradients" + argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "compute_gradients" + argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], " + } + member_method { + name: "get_name" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_slot" + argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_slot_names" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "minimize" + argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-proximal-adagrad-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-proximal-adagrad-optimizer.pbtxt new file mode 100644 index 00000000000..571d846b6c5 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-proximal-adagrad-optimizer.pbtxt @@ -0,0 +1,46 @@ +path: "tensorflow.train.ProximalAdagradOptimizer" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "GATE_GRAPH" + mtype: "" + } + member { + name: "GATE_NONE" + mtype: "" + } + member { + name: 
"GATE_OP" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'learning_rate\', \'initial_accumulator_value\', \'l1_regularization_strength\', \'l2_regularization_strength\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'0.1\', \'0.0\', \'0.0\', \'False\', \'ProximalAdagrad\'], " + } + member_method { + name: "apply_gradients" + argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "compute_gradients" + argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], " + } + member_method { + name: "get_name" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_slot" + argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_slot_names" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "minimize" + argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt new file mode 100644 index 00000000000..1feb136e7f7 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt @@ -0,0 +1,46 @@ +path: "tensorflow.train.ProximalGradientDescentOptimizer" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "GATE_GRAPH" + mtype: "" + } + member { + name: "GATE_NONE" + mtype: "" + } + member { + name: "GATE_OP" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'learning_rate\', \'l1_regularization_strength\', \'l2_regularization_strength\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'0.0\', \'False\', \'ProximalGradientDescent\'], " + } + member_method { + name: "apply_gradients" + argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "compute_gradients" + argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], " + } + member_method { + name: "get_name" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_slot" + argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_slot_names" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "minimize" + argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', 
\'None\', \'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-queue-runner.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-queue-runner.pbtxt new file mode 100644 index 00000000000..d84d0058eea --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-queue-runner.pbtxt @@ -0,0 +1,49 @@ +path: "tensorflow.train.QueueRunner" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "cancel_op" + mtype: "" + } + member { + name: "close_op" + mtype: "" + } + member { + name: "enqueue_ops" + mtype: "" + } + member { + name: "exceptions_raised" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "queue" + mtype: "" + } + member { + name: "queue_closed_exception_types" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'queue\', \'enqueue_ops\', \'close_op\', \'cancel_op\', \'queue_closed_exception_types\', \'queue_runner_def\', \'import_scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "create_threads" + argspec: "args=[\'self\', \'sess\', \'coord\', \'daemon\', \'start\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'False\'], " + } + member_method { + name: "from_proto" + argspec: "args=[\'queue_runner_def\', \'import_scope\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "to_proto" + argspec: "args=[\'self\', \'export_scope\'], varargs=None, keywords=None, defaults=[\'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-r-m-s-prop-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-r-m-s-prop-optimizer.pbtxt new file mode 100644 index 00000000000..2aa4ae6d2d2 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-r-m-s-prop-optimizer.pbtxt @@ -0,0 +1,46 @@ +path: "tensorflow.train.RMSPropOptimizer" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "GATE_GRAPH" + mtype: "" + } + member { + name: "GATE_NONE" + mtype: "" + } + member { + name: "GATE_OP" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'learning_rate\', \'decay\', \'momentum\', \'epsilon\', \'use_locking\', \'centered\', \'name\'], varargs=None, keywords=None, defaults=[\'0.9\', \'0.0\', \'1e-10\', \'False\', \'False\', \'RMSProp\'], " + } + member_method { + name: "apply_gradients" + argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "compute_gradients" + argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], " + } + member_method { + name: "get_name" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_slot" + argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_slot_names" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "minimize" + argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', 
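`QueueRunner`'s golden records the `queue`/`enqueue_ops` constructor, `create_threads`, and the proto round-trip (`to_proto`/`from_proto`). A sketch of the classic TF-1.x wiring, with a toy FIFO queue and a `Coordinator` (both invented for illustration):

```python
import tensorflow as tf

queue = tf.FIFOQueue(capacity=32, dtypes=[tf.float32])
enqueue_op = queue.enqueue(tf.random_normal([]))

# Four threads all running the same enqueue op.
qr = tf.train.QueueRunner(queue, [enqueue_op] * 4)
coord = tf.train.Coordinator()

with tf.Session() as sess:
    # create_threads(sess, coord, daemon, start), per the recorded argspec.
    threads = qr.create_threads(sess, coord=coord, start=True)
    print(sess.run(queue.dequeue_many(8)))
    coord.request_stop()
    coord.join(threads)
```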
\'None\', \'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-saver-def.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-saver-def.pbtxt new file mode 100644 index 00000000000..84498a64f5b --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-saver-def.pbtxt @@ -0,0 +1,120 @@ +path: "tensorflow.train.SaverDef" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "CheckpointFormatVersion" + mtype: "" + } + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "FILENAME_TENSOR_NAME_FIELD_NUMBER" + mtype: "" + } + member { + name: "KEEP_CHECKPOINT_EVERY_N_HOURS_FIELD_NUMBER" + mtype: "" + } + member { + name: "LEGACY" + mtype: "" + } + member { + name: "MAX_TO_KEEP_FIELD_NUMBER" + mtype: "" + } + member { + name: "RESTORE_OP_NAME_FIELD_NUMBER" + mtype: "" + } + member { + name: "SAVE_TENSOR_NAME_FIELD_NUMBER" + mtype: "" + } + member { + name: "SHARDED_FIELD_NUMBER" + mtype: "" + } + member { + name: "V1" + mtype: "" + } + member { + name: "V2" + mtype: "" + } + member { + name: "VERSION_FIELD_NUMBER" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-saver.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-saver.pbtxt new file mode 100644 index 00000000000..04c11712cd4 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-saver.pbtxt @@ -0,0 +1,53 @@ +path: "tensorflow.train.Saver" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "last_checkpoints" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'var_list\', \'reshape\', \'sharded\', \'max_to_keep\', \'keep_checkpoint_every_n_hours\', \'name\', \'restore_sequentially\', \'saver_def\', \'builder\', \'defer_build\', \'allow_empty\', \'write_version\', \'pad_step_number\', \'save_relative_paths\', \'filename\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'False\', \'5\', \'10000.0\', \'None\', \'False\', \'None\', \'None\', \'False\', \'False\', \'2\', \'False\', \'False\', \'None\'], " + } + member_method { + name: "as_saver_def" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "build" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "export_meta_graph" + argspec: "args=[\'self\', \'filename\', \'collection_list\', \'as_text\', \'export_scope\', \'clear_devices\', \'clear_extraneous_savers\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', 
\'None\', \'False\', \'False\'], " + } + member_method { + name: "from_proto" + argspec: "args=[\'saver_def\', \'import_scope\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "recover_last_checkpoints" + argspec: "args=[\'self\', \'checkpoint_paths\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "restore" + argspec: "args=[\'self\', \'sess\', \'save_path\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save" + argspec: "args=[\'self\', \'sess\', \'save_path\', \'global_step\', \'latest_filename\', \'meta_graph_suffix\', \'write_meta_graph\', \'write_state\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'meta\', \'True\', \'True\'], " + } + member_method { + name: "set_last_checkpoints" + argspec: "args=[\'self\', \'last_checkpoints\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_last_checkpoints_with_time" + argspec: "args=[\'self\', \'last_checkpoints_with_time\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "to_proto" + argspec: "args=[\'self\', \'export_scope\'], varargs=None, keywords=None, defaults=[\'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-scaffold.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-scaffold.pbtxt new file mode 100644 index 00000000000..62b956c5ef7 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-scaffold.pbtxt @@ -0,0 +1,49 @@ +path: "tensorflow.train.Scaffold" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "init_feed_dict" + mtype: "" + } + member { + name: "init_fn" + mtype: "" + } + member { + name: "init_op" + mtype: "" + } + member { + name: "local_init_op" + mtype: "" + } + member { + name: "ready_for_local_init_op" + mtype: "" + } + member { + name: "ready_op" + mtype: "" + } + member { + name: "saver" + mtype: "" + } + member { + name: "summary_op" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'init_op\', \'init_feed_dict\', \'init_fn\', \'ready_op\', \'ready_for_local_init_op\', \'local_init_op\', \'summary_op\', \'saver\', \'copy_from_scaffold\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "finalize" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_or_default" + argspec: "args=[\'arg_name\', \'collection_key\', \'default_constructor\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-second-or-step-timer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-second-or-step-timer.pbtxt new file mode 100644 index 00000000000..3c5a6ac13cc --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-second-or-step-timer.pbtxt @@ -0,0 +1,26 @@ +path: "tensorflow.train.SecondOrStepTimer" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'every_secs\', \'every_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "last_triggered_step" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "reset" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "should_trigger_for_step" + argspec: "args=[\'self\', \'step\'], 
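`Saver`'s golden carries the largest constructor in the batch (fifteen arguments, with `max_to_keep=5` and `write_version=2` among the defaults), but typical use touches only `save` and `restore`. A sketch; the `/tmp/model.ckpt` prefix is illustrative:

```python
import tensorflow as tf

v = tf.Variable(42.0, name="v")
saver = tf.train.Saver()  # all defaults, per the recorded argspec

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # save() returns the exact path prefix it wrote (with the step suffix).
    path = saver.save(sess, "/tmp/model.ckpt", global_step=0)

with tf.Session() as sess:
    saver.restore(sess, path)  # restored variables need no initializer
    print(sess.run(v))
```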
varargs=None, keywords=None, defaults=None" + } + member_method { + name: "update_last_triggered_step" + argspec: "args=[\'self\', \'step\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-sequence-example.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-sequence-example.pbtxt new file mode 100644 index 00000000000..9ab95537021 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-sequence-example.pbtxt @@ -0,0 +1,84 @@ +path: "tensorflow.train.SequenceExample" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "CONTEXT_FIELD_NUMBER" + mtype: "" + } + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "FEATURE_LISTS_FIELD_NUMBER" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-server-def.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-server-def.pbtxt new file mode 100644 index 00000000000..af0a3b73cc2 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-server-def.pbtxt @@ -0,0 +1,96 @@ +path: "tensorflow.train.ServerDef" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "CLUSTER_FIELD_NUMBER" + mtype: "" + } + member { + name: "DEFAULT_SESSION_CONFIG_FIELD_NUMBER" + mtype: "" + } + member { + name: "DESCRIPTOR" + mtype: "" + } + member { + name: "Extensions" + mtype: "" + } + member { + name: "JOB_NAME_FIELD_NUMBER" + mtype: "" + } + member { + name: "PROTOCOL_FIELD_NUMBER" + mtype: "" + } + member { + name: "TASK_INDEX_FIELD_NUMBER" + mtype: "" + } + member_method { + name: "ByteSize" + } + member_method { + name: "Clear" + } + member_method { + name: "ClearExtension" + } + member_method { + name: "ClearField" + } + member_method { + name: "CopyFrom" + } + member_method { + name: "DiscardUnknownFields" + } + member_method { + name: "FindInitializationErrors" + } + member_method { + name: "FromString" + } + member_method { + name: "HasExtension" + } + member_method { + name: "HasField" + } + member_method { + name: "IsInitialized" + } + member_method { + name: "ListFields" + } + member_method { + name: "MergeFrom" + } + member_method { + name: "MergeFromString" + } + member_method { + name: "ParseFromString" + } + member_method { + name: "RegisterExtension" + } + member_method { + name: "SerializePartialToString" + } + member_method { + name: "SerializeToString" + } + member_method { + name: "SetInParent" + } + member_method { + name: "WhichOneof" + } + member_method { + name: "__init__" + } +} diff --git 
a/tensorflow/tools/api/golden/tensorflow.train.-server.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-server.pbtxt new file mode 100644 index 00000000000..9b8f185f5b6 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-server.pbtxt @@ -0,0 +1,29 @@ +path: "tensorflow.train.Server" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "server_def" + mtype: "" + } + member { + name: "target" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'server_or_cluster_def\', \'job_name\', \'task_index\', \'protocol\', \'config\', \'start\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'True\'], " + } + member_method { + name: "create_local_server" + argspec: "args=[\'config\', \'start\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " + } + member_method { + name: "join" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "start" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-session-creator.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-session-creator.pbtxt new file mode 100644 index 00000000000..beb232715f7 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-session-creator.pbtxt @@ -0,0 +1,12 @@ +path: "tensorflow.train.SessionCreator" +tf_class { + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + } + member_method { + name: "create_session" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-session-manager.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-session-manager.pbtxt new file mode 100644 index 00000000000..cc31bb4e4b3 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-session-manager.pbtxt @@ -0,0 +1,21 @@ +path: "tensorflow.train.SessionManager" +tf_class { + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'local_init_op\', \'ready_op\', \'ready_for_local_init_op\', \'graph\', \'recovery_wait_secs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'30\'], " + } + member_method { + name: "prepare_session" + argspec: "args=[\'self\', \'master\', \'init_op\', \'saver\', \'checkpoint_dir\', \'checkpoint_filename_with_path\', \'wait_for_checkpoint\', \'max_wait_secs\', \'config\', \'init_feed_dict\', \'init_fn\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'False\', \'7200\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "recover_session" + argspec: "args=[\'self\', \'master\', \'saver\', \'checkpoint_dir\', \'checkpoint_filename_with_path\', \'wait_for_checkpoint\', \'max_wait_secs\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'7200\', \'None\'], " + } + member_method { + name: "wait_for_session" + argspec: "args=[\'self\', \'master\', \'config\', \'max_wait_secs\'], varargs=None, keywords=None, defaults=[\'None\', \'inf\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-session-run-args.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-session-run-args.pbtxt new file mode 100644 index 00000000000..442990893e3 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-session-run-args.pbtxt @@ -0,0 +1,27 @@ +path: "tensorflow.train.SessionRunArgs" +tf_class { + 
is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "feed_dict" + mtype: "" + } + member { + name: "fetches" + mtype: "" + } + member { + name: "options" + mtype: "" + } + member_method { + name: "__init__" + } + member_method { + name: "count" + } + member_method { + name: "index" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-session-run-context.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-session-run-context.pbtxt new file mode 100644 index 00000000000..d5adb15c95f --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-session-run-context.pbtxt @@ -0,0 +1,25 @@ +path: "tensorflow.train.SessionRunContext" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "original_args" + mtype: "" + } + member { + name: "session" + mtype: "" + } + member { + name: "stop_requested" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'original_args\', \'session\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "request_stop" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-session-run-hook.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-session-run-hook.pbtxt new file mode 100644 index 00000000000..db1aa24acf0 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-session-run-hook.pbtxt @@ -0,0 +1,28 @@ +path: "tensorflow.train.SessionRunHook" +tf_class { + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + } + member_method { + name: "after_create_session" + argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "after_run" + argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "before_run" + argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "begin" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "end" + argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-session-run-values.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-session-run-values.pbtxt new file mode 100644 index 00000000000..0b401d59c40 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-session-run-values.pbtxt @@ -0,0 +1,27 @@ +path: "tensorflow.train.SessionRunValues" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "options" + mtype: "" + } + member { + name: "results" + mtype: "" + } + member { + name: "run_metadata" + mtype: "" + } + member_method { + name: "__init__" + } + member_method { + name: "count" + } + member_method { + name: "index" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-singular-monitored-session.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-singular-monitored-session.pbtxt new file mode 100644 index 00000000000..62bfdab40bb --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-singular-monitored-session.pbtxt @@ -0,0 +1,30 @@ +path: "tensorflow.train.SingularMonitoredSession" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "graph" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'hooks\', \'scaffold\', \'master\', \'config\', 
\'checkpoint_dir\', \'stop_grace_period_secs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'\', \'None\', \'None\', \'120\'], " + } + member_method { + name: "close" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "raw_session" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "run" + argspec: "args=[\'self\', \'fetches\', \'feed_dict\', \'options\', \'run_metadata\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + } + member_method { + name: "should_stop" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-step-counter-hook.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-step-counter-hook.pbtxt new file mode 100644 index 00000000000..13261f6dde1 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-step-counter-hook.pbtxt @@ -0,0 +1,30 @@ +path: "tensorflow.train.StepCounterHook" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'every_n_steps\', \'every_n_secs\', \'output_dir\', \'summary_writer\'], varargs=None, keywords=None, defaults=[\'100\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "after_create_session" + argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "after_run" + argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "before_run" + argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "begin" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "end" + argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-stop-at-step-hook.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-stop-at-step-hook.pbtxt new file mode 100644 index 00000000000..e388599b0bf --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-stop-at-step-hook.pbtxt @@ -0,0 +1,30 @@ +path: "tensorflow.train.StopAtStepHook" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'num_steps\', \'last_step\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "after_create_session" + argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "after_run" + argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "before_run" + argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "begin" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "end" + argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-summary-saver-hook.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-summary-saver-hook.pbtxt new file mode 100644 index 00000000000..697c3667b09 --- /dev/null +++ 
b/tensorflow/tools/api/golden/tensorflow.train.-summary-saver-hook.pbtxt @@ -0,0 +1,30 @@ +path: "tensorflow.train.SummarySaverHook" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'save_steps\', \'save_secs\', \'output_dir\', \'summary_writer\', \'scaffold\', \'summary_op\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "after_create_session" + argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "after_run" + argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "before_run" + argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "begin" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "end" + argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=[\'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-supervisor.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-supervisor.pbtxt new file mode 100644 index 00000000000..cc9bd5c136b --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-supervisor.pbtxt @@ -0,0 +1,153 @@ +path: "tensorflow.train.Supervisor" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "USE_DEFAULT" + mtype: "" + } + member { + name: "coord" + mtype: "" + } + member { + name: "global_step" + mtype: "" + } + member { + name: "init_feed_dict" + mtype: "" + } + member { + name: "init_op" + mtype: "" + } + member { + name: "is_chief" + mtype: "" + } + member { + name: "ready_for_local_init_op" + mtype: "" + } + member { + name: "ready_op" + mtype: "" + } + member { + name: "save_model_secs" + mtype: "" + } + member { + name: "save_path" + mtype: "" + } + member { + name: "save_summaries_secs" + mtype: "" + } + member { + name: "saver" + mtype: "" + } + member { + name: "session_manager" + mtype: "" + } + member { + name: "summary_op" + mtype: "" + } + member { + name: "summary_writer" + mtype: "" + } + member_method { + name: "Loop" + argspec: "args=[\'self\', \'timer_interval_secs\', \'target\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "PrepareSession" + argspec: "args=[\'self\', \'master\', \'config\', \'wait_for_checkpoint\', \'max_wait_secs\', \'start_standard_services\'], varargs=None, keywords=None, defaults=[\'\', \'None\', \'False\', \'7200\', \'True\'], " + } + member_method { + name: "RequestStop" + argspec: "args=[\'self\', \'ex\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "ShouldStop" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "StartQueueRunners" + argspec: "args=[\'self\', \'sess\', \'queue_runners\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "StartStandardServices" + argspec: "args=[\'self\', \'sess\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "Stop" + argspec: "args=[\'self\', \'threads\', \'close_summary_writer\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " + } + member_method { + name: "StopOnException" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + 
member_method { + name: "SummaryComputed" + argspec: "args=[\'self\', \'sess\', \'summary\', \'global_step\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "WaitForStop" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'graph\', \'ready_op\', \'ready_for_local_init_op\', \'is_chief\', \'init_op\', \'init_feed_dict\', \'local_init_op\', \'logdir\', \'summary_op\', \'saver\', \'global_step\', \'save_summaries_secs\', \'save_model_secs\', \'recovery_wait_secs\', \'stop_grace_secs\', \'checkpoint_basename\', \'session_manager\', \'summary_writer\', \'init_fn\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'0\', \'True\', \'0\', \'None\', \'0\', \'None\', \'0\', \'0\', \'0\', \'120\', \'600\', \'30\', \'120\', \'model.ckpt\', \'None\', \'0\', \'None\'], " + } + member_method { + name: "loop" + argspec: "args=[\'self\', \'timer_interval_secs\', \'target\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "managed_session" + argspec: "args=[], varargs=args, keywords=kwds, defaults=None" + } + member_method { + name: "prepare_or_wait_for_session" + argspec: "args=[\'self\', \'master\', \'config\', \'wait_for_checkpoint\', \'max_wait_secs\', \'start_standard_services\'], varargs=None, keywords=None, defaults=[\'\', \'None\', \'False\', \'7200\', \'True\'], " + } + member_method { + name: "request_stop" + argspec: "args=[\'self\', \'ex\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "should_stop" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "start_queue_runners" + argspec: "args=[\'self\', \'sess\', \'queue_runners\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "start_standard_services" + argspec: "args=[\'self\', \'sess\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "stop" + argspec: "args=[\'self\', \'threads\', \'close_summary_writer\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " + } + member_method { + name: "stop_on_exception" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "summary_computed" + argspec: "args=[\'self\', \'sess\', \'summary\', \'global_step\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "wait_for_stop" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-sync-replicas-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-sync-replicas-optimizer.pbtxt new file mode 100644 index 00000000000..915d8501af0 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-sync-replicas-optimizer.pbtxt @@ -0,0 +1,58 @@ +path: "tensorflow.train.SyncReplicasOptimizer" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "GATE_GRAPH" + mtype: "" + } + member { + name: "GATE_NONE" + mtype: "" + } + member { + name: "GATE_OP" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'opt\', \'replicas_to_aggregate\', \'total_num_replicas\', \'variable_averages\', \'variables_to_average\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'sync_replicas\'], " + } + member_method { + name: 
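`Supervisor` (largely superseded by `MonitoredTrainingSession`, but still part of the recorded surface) centers on `managed_session`, whose argspec above shows only `*args`/`**kwds` because it is a context manager. A sketch; the logdir and step limit are illustrative:

```python
import tensorflow as tf

step = tf.train.get_or_create_global_step()
train_op = tf.assign_add(step, 1)  # stand-in training step

# The Supervisor checkpoints and writes summaries under logdir.
sv = tf.train.Supervisor(logdir="/tmp/train_logs")
with sv.managed_session() as sess:
    while not sv.should_stop():
        if sess.run(step) >= 100:
            sv.request_stop()
        else:
            sess.run(train_op)
```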
"apply_gradients" + argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "compute_gradients" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "get_chief_queue_runner" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_init_tokens_op" + argspec: "args=[\'self\', \'num_tokens\'], varargs=None, keywords=None, defaults=[\'-1\'], " + } + member_method { + name: "get_name" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_slot" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "get_slot_names" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "make_session_run_hook" + argspec: "args=[\'self\', \'is_chief\', \'num_tokens\'], varargs=None, keywords=None, defaults=[\'-1\'], " + } + member_method { + name: "minimize" + argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.-worker-session-creator.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-worker-session-creator.pbtxt new file mode 100644 index 00000000000..140407651a9 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.-worker-session-creator.pbtxt @@ -0,0 +1,14 @@ +path: "tensorflow.train.WorkerSessionCreator" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'scaffold\', \'master\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'\', \'None\'], " + } + member_method { + name: "create_session" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.pbtxt new file mode 100644 index 00000000000..58fd5760c11 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.pbtxt @@ -0,0 +1,407 @@ +path: "tensorflow.train" +tf_module { + member { + name: "AdadeltaOptimizer" + mtype: "" + } + member { + name: "AdagradDAOptimizer" + mtype: "" + } + member { + name: "AdagradOptimizer" + mtype: "" + } + member { + name: "AdamOptimizer" + mtype: "" + } + member { + name: "BytesList" + mtype: "" + } + member { + name: "CheckpointSaverHook" + mtype: "" + } + member { + name: "CheckpointSaverListener" + mtype: "" + } + member { + name: "ChiefSessionCreator" + mtype: "" + } + member { + name: "ClusterDef" + mtype: "" + } + member { + name: "ClusterSpec" + mtype: "" + } + member { + name: "Coordinator" + mtype: "" + } + member { + name: "Example" + mtype: "" + } + member { + name: "ExponentialMovingAverage" + mtype: "" + } + member { + name: "Feature" + mtype: "" + } + member { + name: "FeatureList" + mtype: "" + } + member { + name: "FeatureLists" + mtype: "" + } + member { + name: "Features" + mtype: "" + } + member { + name: "FeedFnHook" + mtype: "" + } + member { + name: "FinalOpsHook" + mtype: "" + } + member { + name: "FloatList" + mtype: "" + } + member { + name: "FtrlOptimizer" + mtype: "" + } + member { + name: 
"GlobalStepWaiterHook" + mtype: "" + } + member { + name: "GradientDescentOptimizer" + mtype: "" + } + member { + name: "Int64List" + mtype: "" + } + member { + name: "JobDef" + mtype: "" + } + member { + name: "LoggingTensorHook" + mtype: "" + } + member { + name: "LooperThread" + mtype: "" + } + member { + name: "MomentumOptimizer" + mtype: "" + } + member { + name: "MonitoredSession" + mtype: "" + } + member { + name: "NanLossDuringTrainingError" + mtype: "" + } + member { + name: "NanTensorHook" + mtype: "" + } + member { + name: "Optimizer" + mtype: "" + } + member { + name: "ProximalAdagradOptimizer" + mtype: "" + } + member { + name: "ProximalGradientDescentOptimizer" + mtype: "" + } + member { + name: "QueueRunner" + mtype: "" + } + member { + name: "RMSPropOptimizer" + mtype: "" + } + member { + name: "Saver" + mtype: "" + } + member { + name: "SaverDef" + mtype: "" + } + member { + name: "Scaffold" + mtype: "" + } + member { + name: "SecondOrStepTimer" + mtype: "" + } + member { + name: "SequenceExample" + mtype: "" + } + member { + name: "Server" + mtype: "" + } + member { + name: "ServerDef" + mtype: "" + } + member { + name: "SessionCreator" + mtype: "" + } + member { + name: "SessionManager" + mtype: "" + } + member { + name: "SessionRunArgs" + mtype: "" + } + member { + name: "SessionRunContext" + mtype: "" + } + member { + name: "SessionRunHook" + mtype: "" + } + member { + name: "SessionRunValues" + mtype: "" + } + member { + name: "SingularMonitoredSession" + mtype: "" + } + member { + name: "StepCounterHook" + mtype: "" + } + member { + name: "StopAtStepHook" + mtype: "" + } + member { + name: "SummarySaverHook" + mtype: "" + } + member { + name: "Supervisor" + mtype: "" + } + member { + name: "SyncReplicasOptimizer" + mtype: "" + } + member { + name: "WorkerSessionCreator" + mtype: "" + } + member { + name: "queue_runner" + mtype: "" + } + member_method { + name: "MonitoredTrainingSession" + argspec: "args=[\'master\', \'is_chief\', \'checkpoint_dir\', \'scaffold\', \'hooks\', \'chief_only_hooks\', \'save_checkpoint_secs\', \'save_summaries_steps\', \'save_summaries_secs\', \'config\', \'stop_grace_period_secs\', \'log_step_count_steps\'], varargs=None, keywords=None, defaults=[\'\', \'True\', \'None\', \'None\', \'None\', \'None\', \'600\', \'100\', \'None\', \'None\', \'120\', \'100\'], " + } + member_method { + name: "NewCheckpointReader" + argspec: "args=[\'filepattern\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "add_queue_runner" + argspec: "args=[\'qr\', \'collection\'], varargs=None, keywords=None, defaults=[\'queue_runners\'], " + } + member_method { + name: "assert_global_step" + argspec: "args=[\'global_step_tensor\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "basic_train_loop" + argspec: "args=[\'supervisor\', \'train_step_fn\', \'args\', \'kwargs\', \'master\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'\'], " + } + member_method { + name: "batch" + argspec: "args=[\'tensors\', \'batch_size\', \'num_threads\', \'capacity\', \'enqueue_many\', \'shapes\', \'dynamic_pad\', \'allow_smaller_final_batch\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'32\', \'False\', \'None\', \'False\', \'False\', \'None\', \'None\'], " + } + member_method { + name: "batch_join" + argspec: "args=[\'tensors_list\', \'batch_size\', \'capacity\', \'enqueue_many\', \'shapes\', \'dynamic_pad\', \'allow_smaller_final_batch\', \'shared_name\', \'name\'], varargs=None, 
keywords=None, defaults=[\'32\', \'False\', \'None\', \'False\', \'False\', \'None\', \'None\'], " + } + member_method { + name: "checkpoint_exists" + argspec: "args=[\'checkpoint_prefix\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "create_global_step" + argspec: "args=[\'graph\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "do_quantize_training_on_graphdef" + argspec: "args=[\'input_graph\', \'num_bits\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "exponential_decay" + argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'decay_rate\', \'staircase\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], " + } + member_method { + name: "export_meta_graph" + argspec: "args=[\'filename\', \'meta_info_def\', \'graph_def\', \'saver_def\', \'collection_list\', \'as_text\', \'graph\', \'export_scope\', \'clear_devices\', \'clear_extraneous_savers\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'False\', \'None\', \'None\', \'False\', \'False\'], " + } + member_method { + name: "generate_checkpoint_state_proto" + argspec: "args=[\'save_dir\', \'model_checkpoint_path\', \'all_model_checkpoint_paths\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "get_checkpoint_mtimes" + argspec: "args=[\'checkpoint_prefixes\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_checkpoint_state" + argspec: "args=[\'checkpoint_dir\', \'latest_filename\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "get_global_step" + argspec: "args=[\'graph\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "get_or_create_global_step" + argspec: "args=[\'graph\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "global_step" + argspec: "args=[\'sess\', \'global_step_tensor\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "import_meta_graph" + argspec: "args=[\'meta_graph_or_file\', \'clear_devices\', \'import_scope\'], varargs=None, keywords=kwargs, defaults=[\'False\', \'None\'], " + } + member_method { + name: "input_producer" + argspec: "args=[\'input_tensor\', \'element_shape\', \'num_epochs\', \'shuffle\', \'seed\', \'capacity\', \'shared_name\', \'summary_name\', \'name\', \'cancel_op\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\', \'None\', \'32\', \'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "inverse_time_decay" + argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'decay_rate\', \'staircase\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], " + } + member_method { + name: "latest_checkpoint" + argspec: "args=[\'checkpoint_dir\', \'latest_filename\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "limit_epochs" + argspec: "args=[\'tensor\', \'num_epochs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "match_filenames_once" + argspec: "args=[\'pattern\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "maybe_batch" + argspec: "args=[\'tensors\', \'keep_input\', \'batch_size\', \'num_threads\', \'capacity\', \'enqueue_many\', \'shapes\', \'dynamic_pad\', \'allow_smaller_final_batch\', \'shared_name\', \'name\'], 
varargs=None, keywords=None, defaults=[\'1\', \'32\', \'False\', \'None\', \'False\', \'False\', \'None\', \'None\'], " + } + member_method { + name: "maybe_batch_join" + argspec: "args=[\'tensors_list\', \'keep_input\', \'batch_size\', \'capacity\', \'enqueue_many\', \'shapes\', \'dynamic_pad\', \'allow_smaller_final_batch\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'32\', \'False\', \'None\', \'False\', \'False\', \'None\', \'None\'], " + } + member_method { + name: "maybe_shuffle_batch" + argspec: "args=[\'tensors\', \'batch_size\', \'capacity\', \'min_after_dequeue\', \'keep_input\', \'num_threads\', \'seed\', \'enqueue_many\', \'shapes\', \'allow_smaller_final_batch\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'None\', \'False\', \'None\', \'False\', \'None\', \'None\'], " + } + member_method { + name: "maybe_shuffle_batch_join" + argspec: "args=[\'tensors_list\', \'batch_size\', \'capacity\', \'min_after_dequeue\', \'keep_input\', \'seed\', \'enqueue_many\', \'shapes\', \'allow_smaller_final_batch\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\', \'None\', \'None\'], " + } + member_method { + name: "natural_exp_decay" + argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'decay_rate\', \'staircase\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], " + } + member_method { + name: "piecewise_constant" + argspec: "args=[\'x\', \'boundaries\', \'values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "polynomial_decay" + argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'end_learning_rate\', \'power\', \'cycle\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0001\', \'1.0\', \'False\', \'None\'], " + } + member_method { + name: "range_input_producer" + argspec: "args=[\'limit\', \'num_epochs\', \'shuffle\', \'seed\', \'capacity\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'None\', \'32\', \'None\', \'None\'], " + } + member_method { + name: "replica_device_setter" + argspec: "args=[\'ps_tasks\', \'ps_device\', \'worker_device\', \'merge_devices\', \'cluster\', \'ps_ops\', \'ps_strategy\'], varargs=None, keywords=None, defaults=[\'0\', \'/job:ps\', \'/job:worker\', \'True\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "sdca_fprint" + argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "sdca_optimizer" + argspec: "args=[\'sparse_example_indices\', \'sparse_feature_indices\', \'sparse_feature_values\', \'dense_features\', \'example_weights\', \'example_labels\', \'sparse_indices\', \'sparse_weights\', \'dense_weights\', \'example_state_data\', \'loss_type\', \'l1\', \'l2\', \'num_loss_partitions\', \'num_inner_iterations\', \'adaptative\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "sdca_shrink_l1" + argspec: "args=[\'weights\', \'l1\', \'l2\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "shuffle_batch" + argspec: "args=[\'tensors\', \'batch_size\', \'capacity\', \'min_after_dequeue\', \'num_threads\', \'seed\', \'enqueue_many\', \'shapes\', \'allow_smaller_final_batch\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'None\', \'False\', \'None\', \'False\', \'None\', \'None\'], " + } + 
member_method { + name: "shuffle_batch_join" + argspec: "args=[\'tensors_list\', \'batch_size\', \'capacity\', \'min_after_dequeue\', \'seed\', \'enqueue_many\', \'shapes\', \'allow_smaller_final_batch\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\', \'None\', \'None\'], " + } + member_method { + name: "slice_input_producer" + argspec: "args=[\'tensor_list\', \'num_epochs\', \'shuffle\', \'seed\', \'capacity\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'None\', \'32\', \'None\', \'None\'], " + } + member_method { + name: "start_queue_runners" + argspec: "args=[\'sess\', \'coord\', \'daemon\', \'start\', \'collection\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\', \'True\', \'queue_runners\'], " + } + member_method { + name: "string_input_producer" + argspec: "args=[\'string_tensor\', \'num_epochs\', \'shuffle\', \'seed\', \'capacity\', \'shared_name\', \'name\', \'cancel_op\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'None\', \'32\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "summary_iterator" + argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "update_checkpoint_state" + argspec: "args=[\'save_dir\', \'model_checkpoint_path\', \'all_model_checkpoint_paths\', \'latest_filename\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "write_graph" + argspec: "args=[\'graph_or_graph_def\', \'logdir\', \'name\', \'as_text\'], varargs=None, keywords=None, defaults=[\'True\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.queue_runner.-queue-runner.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.queue_runner.-queue-runner.pbtxt new file mode 100644 index 00000000000..23d402de308 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.queue_runner.-queue-runner.pbtxt @@ -0,0 +1,49 @@ +path: "tensorflow.train.queue_runner.QueueRunner" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "cancel_op" + mtype: "" + } + member { + name: "close_op" + mtype: "" + } + member { + name: "enqueue_ops" + mtype: "" + } + member { + name: "exceptions_raised" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "queue" + mtype: "" + } + member { + name: "queue_closed_exception_types" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'queue\', \'enqueue_ops\', \'close_op\', \'cancel_op\', \'queue_closed_exception_types\', \'queue_runner_def\', \'import_scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "create_threads" + argspec: "args=[\'self\', \'sess\', \'coord\', \'daemon\', \'start\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'False\'], " + } + member_method { + name: "from_proto" + argspec: "args=[\'queue_runner_def\', \'import_scope\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "to_proto" + argspec: "args=[\'self\', \'export_scope\'], varargs=None, keywords=None, defaults=[\'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.train.queue_runner.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.queue_runner.pbtxt new file mode 100644 index 00000000000..6e2d0430496 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.train.queue_runner.pbtxt @@ -0,0 +1,15 @@ 
+path: "tensorflow.train.queue_runner" +tf_module { + member { + name: "QueueRunner" + mtype: "" + } + member_method { + name: "add_queue_runner" + argspec: "args=[\'qr\', \'collection\'], varargs=None, keywords=None, defaults=[\'queue_runners\'], " + } + member_method { + name: "start_queue_runners" + argspec: "args=[\'sess\', \'coord\', \'daemon\', \'start\', \'collection\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\', \'True\', \'queue_runners\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.truncated_normal_initializer.pbtxt b/tensorflow/tools/api/golden/tensorflow.truncated_normal_initializer.pbtxt new file mode 100644 index 00000000000..c1e1c230a9f --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.truncated_normal_initializer.pbtxt @@ -0,0 +1,18 @@ +path: "tensorflow.truncated_normal_initializer" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'mean\', \'stddev\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \'None\', \"\"], " + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.uniform_unit_scaling_initializer.pbtxt b/tensorflow/tools/api/golden/tensorflow.uniform_unit_scaling_initializer.pbtxt new file mode 100644 index 00000000000..e1b18dc92fb --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.uniform_unit_scaling_initializer.pbtxt @@ -0,0 +1,18 @@ +path: "tensorflow.uniform_unit_scaling_initializer" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'factor\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\', \"\"], " + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.zeros_initializer.pbtxt b/tensorflow/tools/api/golden/tensorflow.zeros_initializer.pbtxt new file mode 100644 index 00000000000..e229b02ceec --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.zeros_initializer.pbtxt @@ -0,0 +1,18 @@ +path: "tensorflow.zeros_initializer" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'dtype\'], varargs=None, keywords=None, defaults=[\"\"], " + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/lib/BUILD b/tensorflow/tools/api/lib/BUILD new file mode 100644 index 00000000000..cdfa0e7be52 --- /dev/null +++ b/tensorflow/tools/api/lib/BUILD @@ -0,0 +1,39 @@ +# Helper libraries for TensorFlow API compatibility test. 
+ +package( + default_visibility = ["//tensorflow/tools/api:__subpackages__"], +) + +licenses(["notice"]) # Apache 2.0 + +load( + "//tensorflow/core:platform/default/build_config.bzl", + "tf_proto_library", +) + +tf_proto_library( + name = "api_objects_proto", + srcs = ["api_objects.proto"], +) + +py_library( + name = "python_object_to_proto_visitor", + srcs = ["python_object_to_proto_visitor.py"], + srcs_version = "PY2AND3", + deps = [ + ":api_objects_proto_py", + "//tensorflow/tools/common:traverse", + ], +) + +filegroup( + name = "all_files", + srcs = glob( + ["**/*"], + exclude = [ + "**/METADATA", + "**/OWNERS", + ], + ), + visibility = ["//tensorflow:__subpackages__"], +) diff --git a/tensorflow/tools/api/lib/api_objects.proto b/tensorflow/tools/api/lib/api_objects.proto new file mode 100644 index 00000000000..0966a5f1d53 --- /dev/null +++ b/tensorflow/tools/api/lib/api_objects.proto @@ -0,0 +1,31 @@ +syntax = "proto2"; + +package third_party.tensorflow.tools.api; + +message TFAPIMember { + optional string name = 1; + optional string mtype = 2; +}; + +message TFAPIMethod { + optional string name = 1; + optional string path = 2; + optional string argspec = 3; +}; + +message TFAPIModule { + repeated TFAPIMember member = 1; + repeated TFAPIMethod member_method = 2; +}; + +message TFAPIClass { + repeated string is_instance = 1; + repeated TFAPIMember member = 2; + repeated TFAPIMethod member_method = 3; +}; + +message TFAPIObject { + optional string path = 1; + optional TFAPIModule tf_module = 2; + optional TFAPIClass tf_class = 3; +}; diff --git a/tensorflow/tools/api/lib/python_object_to_proto_visitor.py b/tensorflow/tools/api/lib/python_object_to_proto_visitor.py new file mode 100644 index 00000000000..43ba52f9834 --- /dev/null +++ b/tensorflow/tools/api/lib/python_object_to_proto_visitor.py @@ -0,0 +1,173 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ============================================================================== +"""A visitor class that generates protobufs for each python object.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.util import tf_decorator +from tensorflow.python.util import tf_inspect +from tensorflow.tools.api.lib import api_objects_pb2 + +# The following objects need to be handled individually. +_CORNER_CASES = { + '': {'tools': {}}, + 'test.TestCase': {}, + 'test.TestCase.failureException': {}, +} + + +def _SanitizedArgSpec(obj): + """Get an ArgSpec string that is free of addresses. + + We have callables as function arg defaults. This results in addresses in + getargspec output. This function returns a sanitized string representation + of the argspec. + + Args: + obj: A python routine for us to create the sanitized argspec of. + + Returns: + string, a string representation of the argspec.
+ """ + output_string = '' + unsanitized_arg_spec = tf_inspect.getargspec(obj) + + for clean_attr in ('args', 'varargs', 'keywords'): + output_string += '%s=%s, ' % (clean_attr, + getattr(unsanitized_arg_spec, clean_attr)) + + if unsanitized_arg_spec.defaults: + sanitized_defaults = [] + for val in unsanitized_arg_spec.defaults: + str_val = str(val) + # Sanitize argspecs that have hex code in them. + if ' at 0x' in str_val: + sanitized_defaults.append('%s instance>' % str_val.split(' at ')[0]) + else: + sanitized_defaults.append(str_val) + + output_string += 'defaults=%s, ' % sanitized_defaults + + else: + output_string += 'defaults=None' + + return output_string + + +def _SanitizedMRO(obj): + """Get a list of superclasses with minimal amount of non-TF classes. + + Based on many parameters like python version, OS, protobuf implementation + or changes in google core libraries the list of superclasses of a class + can change. We only return the first non-TF class to be robust to non API + affecting changes. The Method Resolution Order returned by `tf_inspect.getmro` + is still maintained in the return value. + + Args: + obj: A python routine for us the create the sanitized arspec of. + + Returns: + list of strings, string representation of the class names. + """ + return_list = [] + for cls in tf_inspect.getmro(obj): + str_repr = str(cls) + return_list.append(str_repr) + if 'tensorflow' not in str_repr: + break + + # Hack - tensorflow.test.StubOutForTesting may or may not be type + # depending on the environment. To avoid inconsistency, break after we add + # StubOutForTesting to the return_list. + if 'StubOutForTesting' in str_repr: + break + + return return_list + + +class PythonObjectToProtoVisitor(object): + """A visitor that summarizes given python objects as protobufs.""" + + def __init__(self): + # A dict to store all protocol buffers. + # Keyed by "path" to the object. + self._protos = {} + + def GetProtos(self): + """Return the list of protos stored.""" + return self._protos + + def __call__(self, path, parent, children): + # The path to the object. + lib_path = 'tensorflow.%s' % path if path else 'tensorflow' + + # A small helper method to construct members(children) protos. + def _AddMember(member_name, member_obj, proto): + """Add the child object to the object being constructed.""" + _, member_obj = tf_decorator.unwrap(member_obj) + if member_name == '__init__' or not member_name.startswith('_'): + if tf_inspect.isroutine(member_obj): + new_method = proto.member_method.add() + new_method.name = member_name + # If member_obj is a python builtin, there is no way to get its + # argspec, because it is implemented on the C side. It also has no + # func_code. + if getattr(member_obj, 'func_code', None): + new_method.argspec = _SanitizedArgSpec(member_obj) + else: + new_member = proto.member.add() + new_member.name = member_name + new_member.mtype = str(type(member_obj)) + + parent_corner_cases = _CORNER_CASES.get(path, {}) + + if path not in _CORNER_CASES or parent_corner_cases: + # Decide if we have a module or a class. + if tf_inspect.ismodule(parent): + # Create a module object. + module_obj = api_objects_pb2.TFAPIModule() + for name, child in children: + if name in parent_corner_cases: + # If we have an empty entry, skip this object. + if parent_corner_cases[name]: + module_obj.member.add(**(parent_corner_cases[name])) + else: + _AddMember(name, child, module_obj) + + # Store the constructed module object. 
+ self._protos[lib_path] = api_objects_pb2.TFAPIObject( + path=lib_path, tf_module=module_obj) + elif tf_inspect.isclass(parent): + # Construct a class. + class_obj = api_objects_pb2.TFAPIClass() + class_obj.is_instance.extend(_SanitizedMRO(parent)) + for name, child in children: + if name in parent_corner_cases: + # If we have an empty entry, skip this object. + if parent_corner_cases[name]: + class_obj.member.add(**(parent_corner_cases[name])) + else: + _AddMember(name, child, class_obj) + + # Store the constructed class object. + self._protos[lib_path] = api_objects_pb2.TFAPIObject( + path=lib_path, tf_class=class_obj) + else: + logging.error('Illegal call to ApiProtoDump::_py_obj_to_proto. ' + 'Object is neither a module nor a class: %s', path) diff --git a/tensorflow/tools/api/tests/API_UPDATE_WARNING.txt b/tensorflow/tools/api/tests/API_UPDATE_WARNING.txt new file mode 100644 index 00000000000..54b0cfcb3c1 --- /dev/null +++ b/tensorflow/tools/api/tests/API_UPDATE_WARNING.txt @@ -0,0 +1,7 @@ +Golden file update requested! +All test failures have been skipped, see the logs for detected diffs. +This test is now going to write new golden files. +Make sure to package the updates together with your change. + +You will need an explicit API approval. This may take longer than a normal +review. diff --git a/tensorflow/tools/api/tests/BUILD b/tensorflow/tools/api/tests/BUILD new file mode 100644 index 00000000000..8421d8fce28 --- /dev/null +++ b/tensorflow/tools/api/tests/BUILD @@ -0,0 +1,44 @@ +# TensorFlow API backwards compatibility tests. + +package( + default_visibility = ["//tensorflow/tools/api:__subpackages__"], +) + +licenses(["notice"]) # Apache 2.0 + +exports_files([ + "README.txt", + "API_UPDATE_WARNING.txt", +]) + +py_test( + name = "api_compatibility_test", + size = "small", + srcs = ["api_compatibility_test.py"], + data = [ + "//tensorflow/tools/api/golden:api_golden", + "//tensorflow/tools/api/tests:API_UPDATE_WARNING.txt", + "//tensorflow/tools/api/tests:README.txt", + ], + srcs_version = "PY2AND3", + deps = [ + "//tensorflow:tensorflow_py", + "//tensorflow/python:platform", + "//tensorflow/tools/api/lib:python_object_to_proto_visitor", + "//tensorflow/tools/common:public_api", + "//tensorflow/tools/common:traverse", + "@protobuf//:protobuf_python", + ], +) + +filegroup( + name = "all_files", + srcs = glob( + ["**/*"], + exclude = [ + "**/METADATA", + "**/OWNERS", + ], + ), + visibility = ["//tensorflow:__subpackages__"], +) diff --git a/tensorflow/tools/api/tests/README.txt b/tensorflow/tools/api/tests/README.txt new file mode 100644 index 00000000000..3463eeec6fe --- /dev/null +++ b/tensorflow/tools/api/tests/README.txt @@ -0,0 +1,13 @@ +TensorFlow API backwards compatibility test +This test ensures all changes to the public API of TensorFlow are intended. + +If this test fails, it means a change has been made to the public API. Backwards +incompatible changes are not allowed. You can run the test as follows to update +test goldens and package them with your change. + + $ bazel build tensorflow/tools/api/tests:api_compatibility_test + $ bazel-bin/tensorflow/tools/api/tests/api_compatibility_test \ + --update_goldens True + +You will need an API approval to make changes to the public TensorFlow API. This +includes additions to the API.
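To make the golden-file round trip described in this README concrete, here is a minimal sketch, assuming the `api_objects_pb2` module generated from `api_objects.proto` above; the helpers `key_to_file_path` and `read_golden` are hypothetical stand-ins that mirror the test's private logic, not part of the change itself:

```python
# Illustrative sketch only: map an API key to its golden file and parse it back.
import os
import re

from google.protobuf import text_format

# Assumption: this module was generated from api_objects.proto above.
from tensorflow.tools.api.lib import api_objects_pb2

_API_GOLDEN_FOLDER = 'tensorflow/tools/api/golden'


def key_to_file_path(key):
  # Each CamelCase capital becomes a dash plus its lowercase form, mirroring
  # the test: 'tensorflow.train.AdamOptimizer' ->
  # 'tensorflow/tools/api/golden/tensorflow.train.-adam-optimizer.pbtxt'
  dashed = re.sub('([A-Z]{1})', lambda m: '-%s' % m.group(0).lower(), key)
  return os.path.join(_API_GOLDEN_FOLDER, '%s.pbtxt' % dashed)


def read_golden(filename):
  # Goldens are text-format protos; Merge() fills a fresh TFAPIObject.
  proto = api_objects_pb2.TFAPIObject()
  with open(filename) as f:
    text_format.Merge(f.read(), proto)
  return proto


if __name__ == '__main__':
  path = key_to_file_path('tensorflow.train.AdamOptimizer')
  obj = read_golden(path)
  print(obj.path, len(obj.tf_class.member_method))
```

A diff of the current API against these parsed protos is exactly what the compatibility test that follows automates.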
diff --git a/tensorflow/tools/api/tests/api_compatibility_test.py b/tensorflow/tools/api/tests/api_compatibility_test.py new file mode 100644 index 00000000000..1ffa8fc26c0 --- /dev/null +++ b/tensorflow/tools/api/tests/api_compatibility_test.py @@ -0,0 +1,242 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ============================================================================== +"""TensorFlow API compatibility tests. + +This test ensures all changes to the public API of TensorFlow are intended. + +If this test fails, it means a change has been made to the public API. Backwards +incompatible changes are not allowed. You can run the test with +"--update_goldens" flag set to "True" to update goldens when making changes to +the public TF python API. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import os +import re +import sys +import unittest + +import tensorflow as tf + +from google.protobuf import text_format + +from tensorflow.python.lib.io import file_io +from tensorflow.python.platform import resource_loader +from tensorflow.python.platform import test +from tensorflow.python.platform import tf_logging as logging +from tensorflow.tools.api.lib import api_objects_pb2 +from tensorflow.tools.api.lib import python_object_to_proto_visitor +from tensorflow.tools.common import public_api +from tensorflow.tools.common import traverse + +# FLAGS defined at the bottom: +FLAGS = None +# DEFINE_boolean, update_goldens, default False: +_UPDATE_GOLDENS_HELP = """ + Update stored golden files if API is updated. WARNING: All API changes + have to be authorized by TensorFlow leads. +""" + +# DEFINE_boolean, verbose_diffs, default False: +_VERBOSE_DIFFS_HELP = """ + If set to true, print line by line diffs on all libraries. If set to + false, only print which libraries have differences. 
+""" + +_API_GOLDEN_FOLDER = 'tensorflow/tools/api/golden' +_TEST_README_FILE = 'tensorflow/tools/api/tests/README.txt' +_UPDATE_WARNING_FILE = 'tensorflow/tools/api/tests/API_UPDATE_WARNING.txt' + + +def _KeyToFilePath(key): + """From a given key, construct a filepath.""" + def _ReplaceCapsWithDash(matchobj): + match = matchobj.group(0) + return '-%s' % (match.lower()) + + case_insensitive_key = re.sub('([A-Z]{1})', _ReplaceCapsWithDash, key) + return os.path.join(_API_GOLDEN_FOLDER, '%s.pbtxt' % case_insensitive_key) + + +def _FileNameToKey(filename): + """From a given filename, construct a key we use for api objects.""" + def _ReplaceDashWithCaps(matchobj): + match = matchobj.group(0) + return match[1].upper() + + base_filename = os.path.basename(filename) + base_filename_without_ext = os.path.splitext(base_filename)[0] + api_object_key = re.sub( + '((-[a-z]){1})', _ReplaceDashWithCaps, base_filename_without_ext) + return api_object_key + + +class ApiCompatibilityTest(test.TestCase): + + def __init__(self, *args, **kwargs): + super(ApiCompatibilityTest, self).__init__(*args, **kwargs) + + golden_update_warning_filename = os.path.join( + resource_loader.get_root_dir_with_all_resources(), + _UPDATE_WARNING_FILE) + self._update_golden_warning = file_io.read_file_to_string( + golden_update_warning_filename) + + test_readme_filename = os.path.join( + resource_loader.get_root_dir_with_all_resources(), + _TEST_README_FILE) + self._test_readme_message = file_io.read_file_to_string( + test_readme_filename) + + def _AssertProtoDictEquals(self, + expected_dict, + actual_dict, + verbose=False, + update_goldens=False): + """Diff given dicts of protobufs and report differences a readable way. + + Args: + expected_dict: a dict of TFAPIObject protos constructed from golden + files. + actual_dict: a ict of TFAPIObject protos constructed by reading from the + TF package linked to the test. + verbose: Whether to log the full diffs, or simply report which files were + different. + update_goldens: Whether to update goldens when there are diffs found. + """ + diffs = [] + verbose_diffs = [] + + expected_keys = set(expected_dict.keys()) + actual_keys = set(actual_dict.keys()) + only_in_expected = expected_keys - actual_keys + only_in_actual = actual_keys - expected_keys + all_keys = expected_keys | actual_keys + + # This will be populated below. + updated_keys = [] + + for key in all_keys: + diff_message = '' + verbose_diff_message = '' + # First check if the key is not found in one or the other. + if key in only_in_expected: + diff_message = 'Object %s expected but not found (removed).' % key + verbose_diff_message = diff_message + elif key in only_in_actual: + diff_message = 'New object %s found (added).' % key + verbose_diff_message = diff_message + else: + # Now we can run an actual proto diff. + try: + self.assertProtoEquals(expected_dict[key], actual_dict[key]) + except AssertionError as e: + updated_keys.append(key) + diff_message = 'Change detected in python object: %s.' % key + verbose_diff_message = str(e) + + # All difference cases covered above. If any difference found, add to the + # list. + if diff_message: + diffs.append(diff_message) + verbose_diffs.append(verbose_diff_message) + + # If diffs are found, handle them based on flags. 
+ if diffs: + diff_count = len(diffs) + logging.error(self._test_readme_message) + logging.error('%d differences found between API and golden.', diff_count) + messages = verbose_diffs if verbose else diffs + for i in range(diff_count): + logging.error('Issue %d\t: %s', i + 1, messages[i]) + + if update_goldens: + # Write files if requested. + logging.warning(self._update_golden_warning) + + # If the keys are only in expected, some objects are deleted. + # Remove files. + for key in only_in_expected: + filepath = _KeyToFilePath(key) + file_io.delete_file(filepath) + + # If the files are only in actual (current library), these are new + # modules. Write them to files. Also record all updates in files. + for key in only_in_actual | set(updated_keys): + filepath = _KeyToFilePath(key) + file_io.write_string_to_file( + filepath, text_format.MessageToString(actual_dict[key])) + else: + # Fail if we cannot fix the test by updating goldens. + self.fail('%d differences found between API and golden.' % diff_count) + + else: + logging.info('No differences found between API and golden.') + + @unittest.skipUnless( + sys.version_info.major == 2 and os.uname()[0] == 'Linux', + 'API compatibility test goldens are generated using python2 on Linux.') + def testAPIBackwardsCompatibility(self): + # Extract all API information. + visitor = python_object_to_proto_visitor.PythonObjectToProtoVisitor() + + public_api_visitor = public_api.PublicAPIVisitor(visitor) + public_api_visitor.do_not_descend_map['tf'].append('contrib') + traverse.traverse(tf, public_api_visitor) + + proto_dict = visitor.GetProtos() + + # Read all golden files. + expression = os.path.join( + resource_loader.get_root_dir_with_all_resources(), + _KeyToFilePath('*')) + golden_file_list = file_io.get_matching_files(expression) + + def _ReadFileToProto(filename): + """Read a filename, create a protobuf from its contents.""" + ret_val = api_objects_pb2.TFAPIObject() + text_format.Merge(file_io.read_file_to_string(filename), ret_val) + return ret_val + + golden_proto_dict = { + _FileNameToKey(filename): _ReadFileToProto(filename) + for filename in golden_file_list + } + + # Diff them. If the test is run to update goldens, only report diffs + # but do not fail. + self._AssertProtoDictEquals( + golden_proto_dict, + proto_dict, + verbose=FLAGS.verbose_diffs, + update_goldens=FLAGS.update_goldens) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--update_goldens', type=bool, default=False, help=_UPDATE_GOLDENS_HELP) + parser.add_argument( + '--verbose_diffs', type=bool, default=False, help=_VERBOSE_DIFFS_HELP) + FLAGS, unparsed = parser.parse_known_args() + + # Now update argv, so that unittest library does not get confused.
+ sys.argv = [sys.argv[0]] + unparsed + test.main() diff --git a/tensorflow/tools/benchmark/BUILD b/tensorflow/tools/benchmark/BUILD index 3b1901fd567..a2ffca97ecb 100644 --- a/tensorflow/tools/benchmark/BUILD +++ b/tensorflow/tools/benchmark/BUILD @@ -34,6 +34,7 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:framework", "//tensorflow/core:framework_internal", + "//tensorflow/core:framework_lite", "//tensorflow/core:protos_all_cc", "//tensorflow/core:tensorflow", "//tensorflow/core:test", diff --git a/tensorflow/tools/benchmark/README.md b/tensorflow/tools/benchmark/README.md index 5cb1aa6cf85..fd1bebe8359 100644 --- a/tensorflow/tools/benchmark/README.md +++ b/tensorflow/tools/benchmark/README.md @@ -9,6 +9,8 @@ both on desktop machines and on Android. ### On Android: +(0) Refer to https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android to edit the `WORKSPACE` to configure the android NDK/SDK. + (1) build for your specific platform, e.g.: ```bash $bazel build -c opt \ diff --git a/tensorflow/tools/benchmark/benchmark_model.cc b/tensorflow/tools/benchmark/benchmark_model.cc index 180600e3b45..dfad11adf0b 100644 --- a/tensorflow/tools/benchmark/benchmark_model.cc +++ b/tensorflow/tools/benchmark/benchmark_model.cc @@ -27,6 +27,7 @@ limitations under the License. #include #include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/step_stats.pb.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/graph.h" @@ -36,6 +37,7 @@ limitations under the License. #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/init_main.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/platform.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/public/session.h" #include "tensorflow/core/util/command_line_flags.h" @@ -109,8 +111,21 @@ void CreateTensorsFromInputInfo( InitializeTensor(input.initialization_values, &input_tensor); break; } + case DT_BOOL: { + InitializeTensor(input.initialization_values, &input_tensor); + break; + } + case DT_STRING: { + if (!input.initialization_values.empty()) { + LOG(FATAL) << "Initialization values are not supported for strings"; + } + auto type_tensor = input_tensor.flat(); + type_tensor = type_tensor.constant(""); + break; + } default: - LOG(FATAL) << "Unsupported input type: " << input.data_type; + LOG(FATAL) << "Unsupported input type: " + << DataTypeString(input.data_type); } input_tensors->push_back({input.name, input_tensor}); } @@ -195,7 +210,7 @@ Status CalculateFlops(const GraphDef& graph, Status RunBenchmark(const std::vector& inputs, const std::vector& outputs, Session* session, - StatSummarizer* stats) { + StatSummarizer* stats, int64* inference_time_us) { std::vector > input_tensors; CreateTensorsFromInputInfo(inputs, &input_tensors); @@ -204,21 +219,27 @@ Status RunBenchmark(const std::vector& inputs, tensorflow::Status s; RunOptions run_options; - run_options.set_trace_level(RunOptions::FULL_TRACE); - RunMetadata run_metadata; + if (stats != nullptr) { + run_options.set_trace_level(RunOptions::FULL_TRACE); + } + RunMetadata run_metadata; + const int64 start_time = Env::Default()->NowMicros(); s = session->Run(run_options, input_tensors, outputs, {}, &output_tensors, &run_metadata); + const int64 end_time = Env::Default()->NowMicros(); + *inference_time_us = end_time - start_time; if (!s.ok()) { LOG(ERROR) << "Error during inference: " << s; + return 
s; } - assert(run_metadata.has_step_stats()); - - const StepStats& step_stats = run_metadata.step_stats(); - - stats->ProcessStepStats(step_stats); + if (stats != nullptr) { + assert(run_metadata.has_step_stats()); + const StepStats& step_stats = run_metadata.step_stats(); + stats->ProcessStepStats(step_stats); + } return s; } @@ -226,15 +247,24 @@ Status RunBenchmark(const std::vector& inputs, Status TimeMultipleRuns(double sleep_seconds, int num_runs, const std::vector& inputs, const std::vector& outputs, Session* session, - StatSummarizer* stats) { + StatSummarizer* stats, int64* total_time_us) { // Convert the run_delay string into a timespec. timespec req; req.tv_sec = static_cast(sleep_seconds); req.tv_nsec = (sleep_seconds - req.tv_sec) * 1000000000; - LOG(INFO) << "Running benchmark"; + *total_time_us = 0; + + LOG(INFO) << "Running benchmark for " << num_runs << " iterations " + << (stats != nullptr ? "with" : "without") + << " detailed stat logging:"; + + Stat stat; for (int i = 0; i < num_runs; ++i) { - Status run_status = RunBenchmark(inputs, outputs, session, stats); + int64 time; + Status run_status = RunBenchmark(inputs, outputs, session, stats, &time); + stat.UpdateStat(time); + *total_time_us += time; if (!run_status.ok()) { LOG(INFO) << "Failed on run " << i; return run_status; @@ -244,9 +274,16 @@ Status TimeMultipleRuns(double sleep_seconds, int num_runs, // This can be helpful to determine the effect of mobile processor // scaling and thermal throttling. if (sleep_seconds > 0.0) { +#ifdef PLATFORM_WINDOWS + Sleep(sleep_seconds * 1000); +#else nanosleep(&req, nullptr); +#endif } } + std::stringstream stream; + stat.OutputToStream(&stream); + LOG(INFO) << stream.str() << std::endl; return Status::OK(); } @@ -273,6 +310,7 @@ int Main(int argc, char** argv) { bool show_type = true; bool show_summary = true; bool show_flops = false; + int warmup_runs = 2; std::vector flag_list = { Flag("graph", &graph, "graph file name"), @@ -297,10 +335,11 @@ int Main(int argc, char** argv) { Flag("show_memory", &show_memory, "whether to list stats by memory used"), Flag("memory_limit", &memory_limit, "how many items to show by memory used"), - Flag("show_type", &show_time, "whether to list stats by op type"), - Flag("show_summary", &show_time, + Flag("show_type", &show_type, "whether to list stats by op type"), + Flag("show_summary", &show_summary, "whether to show a summary of the stats"), Flag("show_flops", &show_flops, "whether to estimate the model's FLOPs"), + Flag("warmup_runs", &warmup_runs, "how many runs to initialize model"), }; string usage = Flags::Usage(argv[0], flag_list); const bool parse_result = Flags::Parse(&argc, argv, flag_list); @@ -351,6 +390,7 @@ int Main(int argc, char** argv) { LOG(INFO) << "Benchmark name: [" << benchmark_name << "]"; LOG(INFO) << "Output prefix: [" << output_prefix << "]"; LOG(INFO) << "Show sizes: [" << show_sizes << "]"; + LOG(INFO) << "Warmup runs: [" << warmup_runs << "]"; std::unique_ptr session; std::unique_ptr stats; @@ -383,6 +423,12 @@ int Main(int argc, char** argv) { CHECK(str_util::SplitAndParseAsInts(input_layer_shapes[n], ',', &sizes)) << "Incorrect size string specified: " << input_layer_shapes[n]; for (int i = 0; i < sizes.size(); ++i) { + int32 size = sizes[i]; + if (size == -1) { + LOG(ERROR) << "Any unknown sizes in the shapes (-1's) must be replaced" + << " with the size you want to benchmark with."; + return -1; + } input.shape.AddDim(sizes[i]); } input.name = input_layers[n]; @@ -395,18 +441,48 @@ int Main(int argc, char** 
argv) { inputs.push_back(input); } - const int64 start_time = Env::Default()->NowMicros(); - Status time_status = - TimeMultipleRuns(sleep_seconds, num_runs, inputs, output_layers, - session.get(), stats.get()); - const int64 end_time = Env::Default()->NowMicros(); - const double wall_time = (end_time - start_time) / 1000000.0; + // If requested, run through the graph first to preinitialize everything + // before the benchmarking runs. + int64 warmup_time_us = 0; + if (warmup_runs > 0) { + Status warmup_time_status = + TimeMultipleRuns(sleep_seconds, warmup_runs, inputs, output_layers, + session.get(), nullptr, &warmup_time_us); + if (!warmup_time_status.ok()) { + LOG(ERROR) << "Timing failed with " << warmup_time_status; + return -1; + } + } - if (!time_status.ok()) { - LOG(ERROR) << "Timing failed with " << time_status; + // Capture overall inference time without stat logging overhead. This is the + // timing data that can be compared to other libraries. + int64 no_stat_time_us = 0; + Status no_stat_time_status = + TimeMultipleRuns(sleep_seconds, num_runs, inputs, output_layers, + session.get(), nullptr, &no_stat_time_us); + const double no_stat_wall_time = no_stat_time_us / 1000000.0; + if (!no_stat_time_status.ok()) { + LOG(ERROR) << "Timing failed with " << no_stat_time_status; return -1; } + // Run again to gather detailed log stats to get a better idea of where + // relative time is going within the graph. + int64 stat_time_us = 0; + Status stat_time_status = + TimeMultipleRuns(sleep_seconds, num_runs, inputs, output_layers, + session.get(), stats.get(), &stat_time_us); + if (!stat_time_status.ok()) { + LOG(ERROR) << "Timing failed with " << stat_time_status; + return -1; + } + + LOG(INFO) << "Average inference timings in us: " + << "Warmup: " + << (warmup_runs > 0 ? warmup_time_us / warmup_runs : 0) << ", " + << "no stats: " << no_stat_time_us / num_runs << ", " + << "with stats: " << stat_time_us / num_runs; + stats->PrintStepStats(); if (show_sizes) { @@ -437,7 +513,7 @@ int Main(int argc, char** argv) { pretty_flops = strings::StrCat(rounded_flops, " billion FLOPs"); } LOG(INFO) << "FLOPs estimate: " << strings::HumanReadableNum(total_flops); - const double mean_run_time = wall_time / num_runs; + const double mean_run_time = no_stat_wall_time / num_runs; LOG(INFO) << "FLOPs/second: " << strings::HumanReadableNum( static_cast(total_flops / mean_run_time)); @@ -448,15 +524,38 @@ int Main(int argc, char** argv) { int64 total_size = inputs[0].shape.num_elements(); // Throughput in MB/s - const double throughput = DataTypeSize(inputs[0].data_type) * total_size * - num_runs / static_cast(wall_time) / - (1024 * 1024); + const double throughput = + DataTypeSize(inputs[0].data_type) * total_size * num_runs / + static_cast(no_stat_wall_time) / (1024 * 1024); // Report the stats.
TestReporter reporter(output_prefix, benchmark_name); - reporter.Initialize(); - reporter.Benchmark(num_runs, -1.0, wall_time, throughput); - reporter.Close(); + TF_QCHECK_OK(reporter.Initialize()); + TF_QCHECK_OK( + reporter.Benchmark(num_runs, -1.0, no_stat_wall_time, throughput)); + TF_QCHECK_OK(reporter.Close()); + + std::map node_type_map_count; + std::map node_type_map_time; + std::map node_type_map_memory; + std::map node_type_map_times_called; + + int64 accumulated_us; + stats->ComputeStatsByType(&node_type_map_count, &node_type_map_time, + &node_type_map_memory, + &node_type_map_times_called, &accumulated_us); + for (const auto& time : node_type_map_time) { + std::stringstream stream; + stream << benchmark_name << "_" << time.first; + TestReporter node_reporter(output_prefix, stream.str()); + + LOG(INFO) << "Outputting: [" << time.first << "]"; + + TF_QCHECK_OK(node_reporter.Initialize()); + TF_QCHECK_OK(node_reporter.Benchmark( + num_runs, -1.0, (time.second * num_runs) / 1000000.0f, -1.0)); + TF_QCHECK_OK(node_reporter.Close()); + } } return 0; diff --git a/tensorflow/tools/benchmark/benchmark_model.h b/tensorflow/tools/benchmark/benchmark_model.h index d2757e94fa6..b9c0a488a4b 100644 --- a/tensorflow/tools/benchmark/benchmark_model.h +++ b/tensorflow/tools/benchmark/benchmark_model.h @@ -38,13 +38,13 @@ Status InitializeSession(int num_threads, const string& graph, // Does a single run of the model that's been loaded into the given session. Status RunBenchmark(const std::vector& inputs, const std::vector& outputs, Session* session, - StatSummarizer* stats); + StatSummarizer* stats, int64* inference_time_us); // Runs the model multiple times, keeping track of timing information. Status TimeMultipleRuns(double sleep_seconds, int num_runs, const std::vector& inputs, const std::vector& outputs, Session* session, - StatSummarizer* stats); + StatSummarizer* stats, int64* total_time_us); // Handles all setup and argument parsing. int Main(int argc, char** argv); diff --git a/tensorflow/tools/benchmark/benchmark_model_test.cc b/tensorflow/tools/benchmark/benchmark_model_test.cc index 9e0a3bd9400..ee7f24c0cf7 100644 --- a/tensorflow/tools/benchmark/benchmark_model_test.cc +++ b/tensorflow/tools/benchmark/benchmark_model_test.cc @@ -61,8 +61,9 @@ TEST(BenchmarkModelTest, InitializeAndRun) { &loaded_graph_def)); std::unique_ptr stats; stats.reset(new tensorflow::StatSummarizer(*(loaded_graph_def.get()))); + int64 time; TF_ASSERT_OK(benchmark_model::TimeMultipleRuns( - 0.0, 10, {input}, {output_name}, session.get(), stats.get())); + 0.0, 10, {input}, {output_name}, session.get(), stats.get(), &time)); } } // namespace diff --git a/tensorflow/tools/ci_build/Dockerfile.android b/tensorflow/tools/ci_build/Dockerfile.android index 1d888e4eaed..c6679f78826 100644 --- a/tensorflow/tools/ci_build/Dockerfile.android +++ b/tensorflow/tools/ci_build/Dockerfile.android @@ -6,21 +6,16 @@ MAINTAINER Jan Prach COPY install/*.sh /install/ RUN /install/install_bootstrap_deb_packages.sh RUN add-apt-repository -y ppa:openjdk-r/ppa && \ - add-apt-repository -y ppa:mc3man/trusty-media && \ add-apt-repository -y ppa:george-edison55/cmake-3.x RUN /install/install_deb_packages.sh RUN /install/install_bazel.sh -# Set up bazelrc. -COPY install/.bazelrc /root/.bazelrc -ENV BAZELRC /root/.bazelrc +# Set up the master bazelrc configuration file. +COPY install/.bazelrc /etc/bazel.bazelrc # Install extra libraries for android sdk.
-# (see http://stackoverflow.com/questions/33427893/can-not-run-android-sdk-build-tools-23-0-2-aapt) RUN apt-get update && apt-get install -y \ python-numpy \ - lib32stdc++6 \ - lib32z1 \ && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* @@ -31,28 +26,28 @@ ENV ANDROID_DEV_HOME /android RUN mkdir -p ${ANDROID_DEV_HOME} # Install Android SDK. -ENV ANDROID_SDK_FILENAME android-sdk_r24.4.1-linux.tgz -ENV ANDROID_SDK_URL http://dl.google.com/android/${ANDROID_SDK_FILENAME} +ENV ANDROID_SDK_FILENAME tools_r25.2.5-linux.zip +ENV ANDROID_SDK_URL https://dl.google.com/android/repository/${ANDROID_SDK_FILENAME} ENV ANDROID_API_LEVEL 23 -ENV ANDROID_BUILD_TOOLS_VERSION 23.0.2 +# Build Tools Version liable to change. +ENV ANDROID_BUILD_TOOLS_VERSION 25.0.2 ENV ANDROID_SDK_HOME ${ANDROID_DEV_HOME}/sdk ENV PATH ${PATH}:${ANDROID_SDK_HOME}/tools:${ANDROID_SDK_HOME}/platform-tools RUN cd ${ANDROID_DEV_HOME} && \ wget -q ${ANDROID_SDK_URL} && \ - tar -xzf ${ANDROID_SDK_FILENAME} && \ + unzip ${ANDROID_SDK_FILENAME} -d android-sdk-linux && \ rm ${ANDROID_SDK_FILENAME} && \ bash -c "ln -s ${ANDROID_DEV_HOME}/android-sdk-* ${ANDROID_SDK_HOME}" && \ echo y | android update sdk --no-ui -a --filter tools,platform-tools,android-${ANDROID_API_LEVEL},build-tools-${ANDROID_BUILD_TOOLS_VERSION} # Install Android NDK. -ENV ANDROID_NDK_FILENAME android-ndk-r10e-linux-x86_64.bin -ENV ANDROID_NDK_URL http://dl.google.com/android/ndk/${ANDROID_NDK_FILENAME} +ENV ANDROID_NDK_FILENAME android-ndk-r12b-linux-x86_64.zip +ENV ANDROID_NDK_URL https://dl.google.com/android/repository/${ANDROID_NDK_FILENAME} ENV ANDROID_NDK_HOME ${ANDROID_DEV_HOME}/ndk ENV PATH ${PATH}:${ANDROID_NDK_HOME} RUN cd ${ANDROID_DEV_HOME} && \ wget -q ${ANDROID_NDK_URL} && \ - chmod +x ${ANDROID_NDK_FILENAME} && \ - ./${ANDROID_NDK_FILENAME} -o${ANDROID_DEV_HOME} && \ + unzip ${ANDROID_NDK_FILENAME} -d ${ANDROID_DEV_HOME} && \ rm ${ANDROID_NDK_FILENAME} && \ bash -c "ln -s ${ANDROID_DEV_HOME}/android-ndk-* ${ANDROID_NDK_HOME}" diff --git a/tensorflow/tools/ci_build/Dockerfile.cmake b/tensorflow/tools/ci_build/Dockerfile.cmake index 8a28fe6cdf9..9013dc012d9 100644 --- a/tensorflow/tools/ci_build/Dockerfile.cmake +++ b/tensorflow/tools/ci_build/Dockerfile.cmake @@ -1,3 +1,17 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== FROM ubuntu:16.04 MAINTAINER Shanqing Cai @@ -7,9 +21,10 @@ COPY install/*.sh /install/ RUN /install/install_bootstrap_deb_packages.sh RUN /install/install_deb_packages.sh +RUN apt-get update +RUN apt-get install -y --no-install-recommends python-pip RUN pip install --upgrade numpy # Install golang RUN add-apt-repository -y ppa:ubuntu-lxc/lxd-stable -RUN apt-get update RUN apt-get install -y golang diff --git a/tensorflow/tools/ci_build/Dockerfile.cpu b/tensorflow/tools/ci_build/Dockerfile.cpu index 86ead3dd4df..206108930a1 100644 --- a/tensorflow/tools/ci_build/Dockerfile.cpu +++ b/tensorflow/tools/ci_build/Dockerfile.cpu @@ -6,7 +6,6 @@ MAINTAINER Jan Prach COPY install/*.sh /install/ RUN /install/install_bootstrap_deb_packages.sh RUN add-apt-repository -y ppa:openjdk-r/ppa && \ - add-apt-repository -y ppa:mc3man/trusty-media && \ add-apt-repository -y ppa:george-edison55/cmake-3.x RUN /install/install_deb_packages.sh RUN /install/install_pip_packages.sh @@ -14,7 +13,7 @@ RUN /install/install_bazel.sh RUN /install/install_proto3.sh RUN /install/install_buildifier.sh RUN /install/install_auditwheel.sh +RUN /install/install_golang.sh -# Set up bazelrc. -COPY install/.bazelrc /root/.bazelrc -ENV BAZELRC /root/.bazelrc +# Set up the master bazelrc configuration file. +COPY install/.bazelrc /etc/bazel.bazelrc diff --git a/tensorflow/tools/ci_build/Dockerfile.debian.jessie.cpu b/tensorflow/tools/ci_build/Dockerfile.debian.jessie.cpu index fa74320b1e5..b914f51918c 100644 --- a/tensorflow/tools/ci_build/Dockerfile.debian.jessie.cpu +++ b/tensorflow/tools/ci_build/Dockerfile.debian.jessie.cpu @@ -5,14 +5,22 @@ MAINTAINER Jan Prach # Copy and run the install scripts. COPY install/*.sh /install/ RUN /install/install_bootstrap_deb_packages.sh -RUN echo "deb http://http.debian.net/debian jessie-backports main" | tee -a /etc/apt/sources.list +RUN echo "deb http://http.debian.net/debian jessie-backports main" | \ + tee -a /etc/apt/sources.list +# Workaround bug in Jessie backport repository deb packages +# http://serverfault.com/questions/830636/cannot-install-openjdk-8-jre-headless-on-debian-jessie +RUN apt-get update && \ + apt-get install -y --no-install-recommends -t jessie-backports \ + openjdk-8-jre-headless ca-certificates-java && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* RUN /install/install_deb_packages.sh RUN /install/install_pip_packages.sh RUN /install/install_bazel.sh +RUN /install/install_golang.sh # Fix a virtualenv install issue specific to Debian Jessie. RUN pip install --upgrade virtualenv -# Set up bazelrc. -COPY install/.bazelrc /root/.bazelrc -ENV BAZELRC /root/.bazelrc +# Set up the master bazelrc configuration file. +COPY install/.bazelrc /etc/bazel.bazelrc diff --git a/tensorflow/tools/ci_build/Dockerfile.gpu b/tensorflow/tools/ci_build/Dockerfile.gpu index 4d7f6ef95da..5d18295f68d 100644 --- a/tensorflow/tools/ci_build/Dockerfile.gpu +++ b/tensorflow/tools/ci_build/Dockerfile.gpu @@ -1,20 +1,25 @@ -FROM nvidia/cuda:8.0-cudnn5-devel +FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu14.04 MAINTAINER Jan Prach +# In the Ubuntu 14.04 images, cudnn is placed in system paths. Move them to +# /usr/local/cuda +RUN cp -P /usr/include/cudnn.h /usr/local/cuda/include +RUN cp -P /usr/lib/x86_64-linux-gnu/libcudnn* /usr/local/cuda/lib64 + # Copy and run the install scripts. 
COPY install/*.sh /install/ +ARG DEBIAN_FRONTEND=noninteractive RUN /install/install_bootstrap_deb_packages.sh RUN add-apt-repository -y ppa:openjdk-r/ppa && \ - add-apt-repository -y ppa:mc3man/trusty-media && \ add-apt-repository -y ppa:george-edison55/cmake-3.x RUN /install/install_deb_packages.sh RUN /install/install_pip_packages.sh RUN /install/install_bazel.sh +RUN /install/install_golang.sh -# Set up bazelrc. -COPY install/.bazelrc /root/.bazelrc -ENV BAZELRC /root/.bazelrc +# Set up the master bazelrc configuration file. +COPY install/.bazelrc /etc/bazel.bazelrc ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH # Configure the build for our CUDA configuration. diff --git a/tensorflow/tools/ci_build/Dockerfile.gpu_clang b/tensorflow/tools/ci_build/Dockerfile.gpu_clang new file mode 100644 index 00000000000..c4342d17f5f --- /dev/null +++ b/tensorflow/tools/ci_build/Dockerfile.gpu_clang @@ -0,0 +1,36 @@ +FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu14.04 + +MAINTAINER Ilya Biryukov + +# In the Ubuntu 14.04 images, cudnn is placed in system paths. Move them to +# /usr/local/cuda +RUN cp /usr/include/cudnn.h /usr/local/cuda/include +RUN cp /usr/lib/x86_64-linux-gnu/libcudnn* /usr/local/cuda/lib64 + +# Copy and run the install scripts. +COPY install/*.sh /install/ +RUN /install/install_bootstrap_deb_packages.sh +RUN add-apt-repository -y ppa:openjdk-r/ppa + +# LLVM requires cmake version 3.4.3, but ppa:george-edison55/cmake-3.x only +# provides version 3.2.2. +# So we skip it in `install_deb_packages.sh`, and later install it from +# https://cmake.org in `install_cmake_for_clang.sh`. +RUN /install/install_deb_packages.sh --without_cmake +RUN /install/install_pip_packages.sh +RUN /install/install_bazel.sh +RUN /install/install_golang.sh + +# Install cmake and build clang +RUN /install/install_cmake_for_clang.sh +RUN /install/build_and_install_clang.sh + +# Set up the master bazelrc configuration file. +COPY install/.bazelrc /etc/bazel.bazelrc +ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH + +# Configure the build for our CUDA configuration. +ENV TF_NEED_CUDA 1 +ENV TF_CUDA_CLANG 1 +ENV CLANG_CUDA_COMPILER_PATH /usr/local/bin/clang +ENV TF_CUDA_COMPUTE_CAPABILITIES 3.0 diff --git a/tensorflow/tools/ci_build/Dockerfile.hadoop b/tensorflow/tools/ci_build/Dockerfile.hadoop index 8a97a4b466c..489493c26e4 100644 --- a/tensorflow/tools/ci_build/Dockerfile.hadoop +++ b/tensorflow/tools/ci_build/Dockerfile.hadoop @@ -6,7 +6,6 @@ MAINTAINER Jonathan Hseu COPY install/*.sh /install/ RUN /install/install_bootstrap_deb_packages.sh RUN add-apt-repository -y ppa:openjdk-r/ppa && \ - add-apt-repository -y ppa:mc3man/trusty-media && \ add-apt-repository -y ppa:george-edison55/cmake-3.x RUN /install/install_deb_packages.sh RUN /install/install_pip_packages.sh @@ -15,6 +14,5 @@ RUN /install/install_proto3.sh RUN /install/install_buildifier.sh RUN /install/install_hadoop.sh -# Set up bazelrc. -COPY install/.bazelrc /root/.bazelrc -ENV BAZELRC /root/.bazelrc +# Set up the master bazelrc configuration file. 
+COPY install/.bazelrc /etc/bazel.bazelrc diff --git a/tensorflow/tools/ci_build/Dockerfile.tensorboard b/tensorflow/tools/ci_build/Dockerfile.tensorboard index 0ce2ab3aa54..9795872e2c4 100644 --- a/tensorflow/tools/ci_build/Dockerfile.tensorboard +++ b/tensorflow/tools/ci_build/Dockerfile.tensorboard @@ -6,7 +6,6 @@ MAINTAINER Jan Prach COPY install/*.sh /install/ RUN /install/install_bootstrap_deb_packages.sh RUN add-apt-repository -y ppa:openjdk-r/ppa && \ - add-apt-repository -y ppa:mc3man/trusty-media && \ add-apt-repository -y ppa:george-edison55/cmake-3.x RUN /install/install_deb_packages.sh RUN /install/install_tensorboard_packages.sh diff --git a/tensorflow/tools/ci_build/README.md b/tensorflow/tools/ci_build/README.md index 5c90fceaf70..ad83669950f 100644 --- a/tensorflow/tools/ci_build/README.md +++ b/tensorflow/tools/ci_build/README.md @@ -13,28 +13,32 @@ run continuous integration [ci.tensorflow.org](https://ci.tensorflow.org). You can run all the jobs **without docker** if you are on mac or on linux and you just don't want docker. Just install all the dependencies from - [os_setup.md](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/g3doc/get_started/os_setup.md). + [Installing TensorFlow](https://www.tensorflow.org/install/). Then run any of the one liners below without the `tensorflow/tools/ci_build/ci_build.sh` in them. 2. Clone tensorflow repository. ```bash -git clone https://github.com/tensorflow/tensorflow.git -``` + git clone https://github.com/tensorflow/tensorflow.git + ``` 3. Go to tensorflow directory ```bash -cd tensorflow -``` + cd tensorflow + ``` 4. Build what you want, for example ```bash -tensorflow/tools/ci_build/ci_build.sh CPU bazel test //tensorflow/... -``` - + tensorflow/tools/ci_build/ci_build.sh CPU bazel test //tensorflow/... + ``` + If you are using the Docker image on Windows or OS X, the Docker VM's default + memory limit may be too low to build TensorFlow. This can result in + strange-looking errors, e.g. the compilation may fail with `gcc: internal + compiler error: Killed (program cc1plus)`. Try increasing the memory limit in + the Docker preferences. ## Jobs @@ -53,10 +57,10 @@ tensorflow/tools/ci_build/ci_build.sh CPU bazel test //tensorflow/... tensorflow/tools/ci_build/ci_build.sh GPU bazel test -c opt --config=cuda //tensorflow/... # build pip with gpu support -tensorflow/tools/ci_build/ci_build.sh GPU tensorflow/tools/ci_build/builds/pip.sh GPU +tensorflow/tools/ci_build/ci_build.sh GPU tensorflow/tools/ci_build/builds/pip.sh GPU -c opt --config=cuda # build and run gpu tests using python 3 -CI_DOCKER_EXTRA_PARAMS="-e CI_BUILD_PYTHON=python3" tensorflow/tools/ci_build/ci_build.sh GPU tensorflow/tools/ci_build/builds/pip.sh GPU +CI_DOCKER_EXTRA_PARAMS="-e CI_BUILD_PYTHON=python3" tensorflow/tools/ci_build/ci_build.sh GPU tensorflow/tools/ci_build/builds/pip.sh GPU -c opt --config=cuda # build android example app tensorflow/tools/ci_build/ci_build.sh ANDROID tensorflow/tools/ci_build/builds/android.sh diff --git a/tensorflow/tools/ci_build/builds/android_full.sh b/tensorflow/tools/ci_build/builds/android_full.sh index 3282efa8d2d..63250e0a4da 100755 --- a/tensorflow/tools/ci_build/builds/android_full.sh +++ b/tensorflow/tools/ci_build/builds/android_full.sh @@ -31,8 +31,10 @@ configure_android_workspace CPUS=armeabi-v7a,arm64-v8a,x86,x86_64 OUT_DIR="$(pwd)/out/" +AAR_LIB_TMP="$(pwd)/aar_libs" rm -rf ${OUT_DIR} +rm -rf ${AAR_LIB_TMP} # Build all relevant native libraries for each architecture. 
for CPU in ${CPUS//,/ } @@ -50,6 +52,9 @@ do copy_lib bazel-bin/tensorflow/contrib/android/libtensorflow_inference.so copy_lib bazel-bin/tensorflow/examples/android/libtensorflow_demo.so copy_lib bazel-bin/tensorflow/tools/benchmark/benchmark_model + + mkdir -p ${AAR_LIB_TMP}/jni/${CPU} + cp bazel-bin/tensorflow/contrib/android/libtensorflow_inference.so ${AAR_LIB_TMP}/jni/${CPU} done # Build Jar and also demo containing native libs for all architectures. @@ -60,15 +65,41 @@ echo "========== Building TensorFlow Android Jar and Demo ==========" bazel --bazelrc=/dev/null build -c opt --fat_apk_cpu=${CPUS} \ --spawn_strategy=sandboxed --genrule_strategy=sandboxed \ //tensorflow/contrib/android:android_tensorflow_inference_java \ + //tensorflow/contrib/android:android_tensorflow_inference_java.aar \ //tensorflow/examples/android:tensorflow_demo -echo "Copying demo and Jar to ${OUT_DIR}" +echo "Copying demo, AAR and Jar to ${OUT_DIR}" cp bazel-bin/tensorflow/examples/android/tensorflow_demo.apk \ bazel-bin/tensorflow/contrib/android/libandroid_tensorflow_inference_java.jar ${OUT_DIR} -echo "========== Makefile Build Test ==========" +cp bazel-bin/tensorflow/contrib/android/android_tensorflow_inference_java.aar \ + ${OUT_DIR}/tensorflow.aar + +# TODO(andrewharp): build native libs into AAR directly once +# https://github.com/bazelbuild/bazel/issues/348 is resolved. +echo "Adding native libs to AAR" +chmod +w ${OUT_DIR}/tensorflow.aar +pushd ${AAR_LIB_TMP} +zip -ur ${OUT_DIR}/tensorflow.aar $(find jni -name "*.so") +popd +rm -rf ${AAR_LIB_TMP} + # Test Makefile build just to make sure it still works. if [ -z "$NDK_ROOT" ]; then export NDK_ROOT=${ANDROID_NDK_HOME} fi + +echo "========== Benchmark Makefile Build Test ==========" tensorflow/contrib/makefile/build_all_android.sh + +echo "========== Demo Makefile Build Test ==========" +tensorflow/contrib/makefile/build_all_android.sh \ +-s $(pwd)/tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in \ +-t "libtensorflow_inference.so libtensorflow_demo.so" + +# Test Makefile build for tensorflow runtime with hexagon. +# -b ... build only, -p ... use prebuilt binaries +# This uses prebuilt binaries for hexagon dependencies because building +# hexagon binaries from source code requires the Qualcomm SDK. 
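+# (Illustrative: the combined "-bp" flags below therefore mean build-only plus
+# prebuilt, i.e. compile the hexagon-enabled runtime against the prebuilt
+# dependencies without actually running the inception demo.)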
+echo "========== Hexagon Build Test ==========" +tensorflow/contrib/makefile/samples/build_and_run_inception_hexagon.sh -bp diff --git a/tensorflow/tools/ci_build/builds/builds_common.sh b/tensorflow/tools/ci_build/builds/builds_common.sh index d9a6ce96a58..fd9a14bd698 100644 --- a/tensorflow/tools/ci_build/builds/builds_common.sh +++ b/tensorflow/tools/ci_build/builds/builds_common.sh @@ -230,7 +230,7 @@ android_sdk_repository( android_ndk_repository( name="androidndk", path="${ANDROID_NDK_HOME}", - api_level=21) + api_level=14) EOF fi fi diff --git a/tensorflow/tools/ci_build/builds/configured b/tensorflow/tools/ci_build/builds/configured index f813d6c13f5..25cb51ea7cc 100755 --- a/tensorflow/tools/ci_build/builds/configured +++ b/tensorflow/tools/ci_build/builds/configured @@ -47,6 +47,10 @@ export CI_BUILD_PYTHON="${CI_BUILD_PYTHON:-python}" export PYTHON_BIN_PATH="${PYTHON_BIN_PATH:-$(which ${CI_BUILD_PYTHON})}" if [ "${CONTAINER_TYPE}" == "gpu" ]; then export TF_NEED_CUDA=1 +elif [ "${CONTAINER_TYPE}" == "gpu_clang" ]; then + export TF_NEED_CUDA=1 + export TF_CUDA_CLANG=1 + export CLANG_CUDA_COMPILER_PATH="/usr/local/bin/clang" else export TF_NEED_CUDA=0 fi diff --git a/tensorflow/tools/ci_build/builds/docker_test.sh b/tensorflow/tools/ci_build/builds/docker_test.sh index ee004eb46c2..e337ea4b059 100755 --- a/tensorflow/tools/ci_build/builds/docker_test.sh +++ b/tensorflow/tools/ci_build/builds/docker_test.sh @@ -114,7 +114,7 @@ fi docker run -v ${BASE_DIR}:/tensorflow-src -w /tensorflow-src \ ${GPU_EXTRA_PARAMS} \ "${DOCKER_IMG_TAG}" \ -/bin/bash -c "tensorflow/tools/ci_build/builds/test_installation.sh && "\ +/bin/bash -c "tensorflow/tools/ci_build/builds/run_pip_tests.sh && "\ "tensorflow/tools/ci_build/builds/test_tutorials.sh && "\ "tensorflow/tools/ci_bukld/builds/integration_tests.sh" diff --git a/tensorflow/tools/ci_build/builds/libtensorflow.sh b/tensorflow/tools/ci_build/builds/libtensorflow.sh index ce0c8549573..5052d3626c9 100755 --- a/tensorflow/tools/ci_build/builds/libtensorflow.sh +++ b/tensorflow/tools/ci_build/builds/libtensorflow.sh @@ -14,19 +14,27 @@ # limitations under the License. # ============================================================================== # -# Script to generate a tarball containing the TensorFlow C-library which -# consists of the C API header file and libtensorflow.so. +# Script to generate tarballs: +# (1) The TensorFlow C-library: Containing C API header files and libtensorflow.so +# (2) Native library for the TensorFlow Java API: Containing libtensorflow_jni.so +# And jars: +# (3) Java API .jar +# (4) Java API sources .jar # -# Work in progress but this is a step towards a "binary" distribution of the -# TensorFlow C-library allowing TensorFlow language bindings to be used -# without having to recompile the TensorFlow framework from sources, which -# takes a while and also introduces many other dependencies. +# These binary distributions will allow use of TensorFlow in various languages +# without having to compile the TensorFlow framework from sources, which takes +# a while and also introduces many other dependencies. 
# # Usage: # - Source this file in another bash script # - Execute build_libtensorflow_tarball SUFFIX # -# Produces: lib_package/libtensorflow${SUFFIX}.tar.gz +# Produces: +# - lib_package/libtensorflow${SUFFIX}.tar.gz +# - lib_package/libtensorflow_jni${SUFFIX}.tar.gz +# - lib_package/libtensorflow.jar +# - lib_package/libtensorflow-src.jar +# - lib_package/libtensorflow_proto.zip # # ASSUMPTIONS: # - build_libtensorflow_tarball is invoked from the root of the git tree. @@ -38,6 +46,10 @@ function build_libtensorflow_tarball() { echo "Must run this from the root of the bazel workspace" exit 1 fi + # Delete any leftovers from previous builds in this workspace. + DIR=lib_package + rm -rf ${DIR} + TARBALL_SUFFIX="${1}" BAZEL="bazel --bazelrc ./tensorflow/tools/ci_build/install/.bazelrc" BAZEL_OPTS="-c opt" @@ -52,11 +64,23 @@ function build_libtensorflow_tarball() { # and https://github.com/bazelbuild/bazel/issues/1580 # have been resolved and the "manual" tags on the BUILD targets # in tensorflow/tools/lib_package/BUILD are removed. - # Till then, must manually run the test. - bazel test ${BAZEL_OPTS} //tensorflow/tools/lib_package:libtensorflow_test + # Until then, these tests must be run manually since they are + # not covered by continuous integration. + bazel test ${BAZEL_OPTS} \ + //tensorflow/tools/lib_package:libtensorflow_test \ + //tensorflow/tools/lib_package:libtensorflow_java_test + + bazel build ${BAZEL_OPTS} \ + //tensorflow/tools/lib_package:libtensorflow.tar.gz \ + //tensorflow/tools/lib_package:libtensorflow_jni.tar.gz \ + //tensorflow/java:libtensorflow.jar \ + //tensorflow/java:libtensorflow-src.jar \ + //tensorflow/tools/lib_package:libtensorflow_proto.zip - bazel build ${BAZEL_OPTS} //tensorflow/tools/lib_package:libtensorflow.tar.gz - DIR=lib_package mkdir -p ${DIR} cp bazel-bin/tensorflow/tools/lib_package/libtensorflow.tar.gz ${DIR}/libtensorflow${TARBALL_SUFFIX}.tar.gz + cp bazel-bin/tensorflow/tools/lib_package/libtensorflow_jni.tar.gz ${DIR}/libtensorflow_jni${TARBALL_SUFFIX}.tar.gz + cp bazel-bin/tensorflow/java/libtensorflow.jar bazel-bin/tensorflow/java/libtensorflow-src.jar ${DIR} + cp bazel-genfiles/tensorflow/tools/lib_package/libtensorflow_proto.zip ${DIR} + chmod -x ${DIR}/* } diff --git a/tensorflow/tools/ci_build/builds/pip.sh b/tensorflow/tools/ci_build/builds/pip.sh index 1e55ad01245..85c712d3c6d 100755 --- a/tensorflow/tools/ci_build/builds/pip.sh +++ b/tensorflow/tools/ci_build/builds/pip.sh @@ -19,8 +19,7 @@ # The PIP installation is done using the --user flag. # # Usage: -# pip.sh CONTAINER_TYPE [--mavx] [--mavx2] -# [--test_tutorials] [--integration_tests] +# pip.sh CONTAINER_TYPE [--test_tutorials] [--integration_tests] [bazel flags] # # When executing the Python unit tests, the script obeys the shell # variables: TF_BUILD_BAZEL_CLEAN, TF_BUILD_INSTALL_EXTRA_PIP_PACKAGES, # @@ -30,7 +29,7 @@ # script to perform bazel clean prior to main build and test steps. # # TF_BUILD_INSTALL_EXTRA_PIP_PACKAGES overrides the default extra pip packages -# to be installed in virtualenv before test_installation.sh is called. Multiple +# to be installed in virtualenv before run_pip_tests.sh is called. Multiple # package names are separated with spaces. # # If NO_TEST_ON_INSTALL has any non-empty and non-0 value, the test-on-install # @@ -39,8 +38,10 @@ # If NO_TEST_USER_OPS has any non-empty and non-0 value, the testing of user- # defined ops against the installation will be skipped. 
# -# Use --mavx or --mavx2 to let bazel use --copt=-mavx or --copt=-mavx2 options -# while building the pip package, respectively. +# If NO_TEST_TFDBG_BINARIES has any non-empty and non-0 value, the testing of +# TensorFlow Debugger (tfdbg) binaries and examples will be skipped. +# +# Any flags not listed in the usage above will be passed directly to Bazel. # # If the --test_tutorials flag is set, it will cause the script to run the # tutorial tests (see test_tutorials.sh) after the PIP @@ -49,6 +50,11 @@ # to run. # + +# Helper function: Strip leading and trailing whitespace +str_strip () { + echo -e "$1" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//' +} + # Fixed naming patterns for wheel (.whl) files given different python versions if [[ $(uname) == "Linux" ]]; then declare -A WHL_TAGS @@ -66,32 +72,38 @@ source "${SCRIPT_DIR}/builds_common.sh" # Get the command line arguments CONTAINER_TYPE=$( echo "$1" | tr '[:upper:]' '[:lower:]' ) +shift -if [[ ! -z "${TF_BUILD_BAZEL_CLEAN}" ]] && \ +if [[ -n "${TF_BUILD_BAZEL_CLEAN}" ]] && \ [[ "${TF_BUILD_BAZEL_CLEAN}" != "0" ]]; then echo "TF_BUILD_BAZEL_CLEAN=${TF_BUILD_BAZEL_CLEAN}: Performing 'bazel clean'" bazel clean fi DO_TEST_USER_OPS=1 -if [[ ! -z "${NO_TEST_USER_OPS}" ]] && \ +if [[ -n "${NO_TEST_USER_OPS}" ]] && \ [[ "${NO_TEST_USER_OPS}" != "0" ]]; then echo "NO_TEST_USER_OPS=${NO_TEST_USER_OPS}: Will skip testing of user ops" DO_TEST_USER_OPS=0 fi +DO_TEST_TFDBG_BINARIES=1 +if [[ -n "${NO_TEST_TFDBG_BINARIES}" ]] && \ + [[ "${NO_TEST_TFDBG_BINARIES}" != "0" ]]; then + echo "NO_TEST_TFDBG_BINARIES=${NO_TEST_TFDBG_BINARIES}: Will skip testing of tfdbg binaries" + DO_TEST_TFDBG_BINARIES=0 +fi + DO_TEST_TUTORIALS=0 DO_INTEGRATION_TESTS=0 -MAVX_FLAG="" +BAZEL_FLAGS="" while true; do if [[ "${1}" == "--test_tutorials" ]]; then DO_TEST_TUTORIALS=1 elif [[ "${1}" == "--integration_tests" ]]; then DO_INTEGRATION_TESTS=1 - elif [[ "${1}" == "--mavx" ]]; then - MAVX_FLAG="--copt=-mavx" - elif [[ "${1}" == "--mavx2" ]]; then - MAVX_FLAG="--copt=-mavx2" + else + BAZEL_FLAGS="${BAZEL_FLAGS} ${1}" fi shift @@ -100,18 +112,18 @@ while true; do fi done -if [[ ! -z "${MAVX_FLAG}" ]]; then - echo "Using MAVX flag: ${MAVX_FLAG}" -fi +BAZEL_FLAGS=$(str_strip "${BAZEL_FLAGS}") + +echo "Using Bazel flags: ${BAZEL_FLAGS}" PIP_BUILD_TARGET="//tensorflow/tools/pip_package:build_pip_package" GPU_FLAG="" if [[ ${CONTAINER_TYPE} == "cpu" ]] || \ [[ ${CONTAINER_TYPE} == "debian.jessie.cpu" ]]; then - bazel build -c opt ${MAVX_FLAG} ${PIP_BUILD_TARGET} || \ + bazel build ${BAZEL_FLAGS} ${PIP_BUILD_TARGET} || \ die "Build failed." elif [[ ${CONTAINER_TYPE} == "gpu" ]]; then - bazel build -c opt --config=cuda ${MAVX_FLAG} ${PIP_BUILD_TARGET} || \ + bazel build ${BAZEL_FLAGS} ${PIP_BUILD_TARGET} || \ die "Build failed." GPU_FLAG="--gpu" else @@ -125,7 +137,7 @@ fi # If still in a virtualenv, deactivate it first -if [[ ! -z "$(which deactivate)" ]]; then +if [[ -n "$(which deactivate)" ]]; then echo "It appears that we are already in a virtualenv. Deactivating..." deactivate || die "FAILED: Unable to deactivate from existing virtualenv" fi @@ -163,6 +175,11 @@ if [[ $(echo ${WHL_PATH} | wc -w) -ne 1 ]]; then "directory: ${PIP_WHL_DIR}" fi +# Print the size of the PIP wheel file. +echo +echo "Size of the PIP wheel file built (bytes): $(ls -l ${WHL_PATH} | awk '{print $5}')" +echo + # Rename the whl file properly so it will have the python # version tags and platform tags that won't cause pip install issues. 
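# (Illustrative, with a hypothetical version number: on Darwin a wheel such as
#   tensorflow-1.1.0-cp35-cp35m-macosx_10_11_x86_64.whl
# is copied below to the generic name tensorflow-1.1.0-py3-none-any.whl, while
# on Linux the linux platform tag is later rewritten to manylinux1 before
# auditing.)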
if [[ $(uname) == "Linux" ]]; then @@ -174,6 +191,8 @@ elif [[ $(uname) == "Darwin" ]]; then PY_TAGS="py2-none" elif [[ ${PY_MAJOR_MINOR_VER} == "3.5" ]]; then PY_TAGS="py3-none" + elif [[ ${PY_MAJOR_MINOR_VER} == "3.6" ]]; then + PY_TAGS="py3-none" fi PLATFORM_TAG="any" fi @@ -181,19 +200,22 @@ fi WHL_DIR=$(dirname "${WHL_PATH}") WHL_BASE_NAME=$(basename "${WHL_PATH}") -if [[ ! -z "${PY_TAGS}" ]]; then +if [[ -n "${PY_TAGS}" ]]; then NEW_WHL_BASE_NAME=$(echo ${WHL_BASE_NAME} | cut -d \- -f 1)-\ $(echo ${WHL_BASE_NAME} | cut -d \- -f 2)-${PY_TAGS}-${PLATFORM_TAG}.whl if [[ ! -f "${WHL_DIR}/${NEW_WHL_BASE_NAME}" ]]; then - cp "${WHL_DIR}/${WHL_BASE_NAME}" "${WHL_DIR}/${NEW_WHL_BASE_NAME}" && \ - echo "Copied wheel file: ${WHL_BASE_NAME} --> ${NEW_WHL_BASE_NAME}" || \ + if cp "${WHL_DIR}/${WHL_BASE_NAME}" "${WHL_DIR}/${NEW_WHL_BASE_NAME}" + then + echo "Copied wheel file: ${WHL_BASE_NAME} --> ${NEW_WHL_BASE_NAME}" + else die "ERROR: Failed to copy wheel file to ${NEW_WHL_BASE_NAME}" + fi fi fi if [[ $(uname) == "Linux" ]]; then - AUDITED_WHL_NAME="${WHL_DIR}/$(echo ${WHL_BASE_NAME} | sed "s/linux/manylinux1/")" + AUDITED_WHL_NAME="${WHL_DIR}/$(echo ${WHL_BASE_NAME//linux/manylinux1})" # Repair the wheels for cpu manylinux1 if [[ ${CONTAINER_TYPE} == "cpu" ]]; then @@ -221,14 +243,20 @@ echo "Installing pip whl file: ${WHL_PATH}" VENV_DIR="${PIP_TEST_ROOT}/venv" if [[ -d "${VENV_DIR}" ]]; then - rm -rf "${VENV_DIR}" && \ - echo "Removed existing virtualenv directory: ${VENV_DIR}" || \ - die "Failed to remove existing virtualenv directory: ${VENV_DIR}" + if rm -rf "${VENV_DIR}" + then + echo "Removed existing virtualenv directory: ${VENV_DIR}" + else + die "Failed to remove existing virtualenv directory: ${VENV_DIR}" + fi fi -mkdir -p ${VENV_DIR} && \ - echo "Created virtualenv directory: ${VENV_DIR}" || \ - die "FAILED to create virtualenv directory: ${VENV_DIR}" +if mkdir -p ${VENV_DIR} +then + echo "Created virtualenv directory: ${VENV_DIR}" +else + die "FAILED to create virtualenv directory: ${VENV_DIR}" +fi # Verify that virtualenv exists if [[ -z $(which virtualenv) ]]; then @@ -250,7 +278,7 @@ pip install --upgrade pip==8.1.2 # Force tensorflow reinstallation. Otherwise it may not get installed from # last build if it had the same version number as previous build. -PIP_FLAGS="--upgrade --force-reinstall --no-deps" +PIP_FLAGS="--upgrade --force-reinstall" pip install -v ${PIP_FLAGS} ${WHL_PATH} || \ die "pip install (forcing to reinstall tensorflow) FAILED" echo "Successfully installed pip package ${WHL_PATH}" @@ -263,13 +291,13 @@ for PACKAGE in ${INSTALL_EXTRA_PIP_PACKAGES}; do die "pip install ${PACKAGE} FAILED" done -if [[ ! -z "${NO_TEST_ON_INSTALL}" ]] && +if [[ -n "${NO_TEST_ON_INSTALL}" ]] && [[ "${NO_TEST_ON_INSTALL}" != "0" ]]; then echo "NO_TEST_ON_INSTALL=${NO_TEST_ON_INSTALL}:" echo " Skipping ALL Python unit tests on install" else - # Call test_installation.sh to perform test-on-install - "${SCRIPT_DIR}/test_installation.sh" --virtualenv ${GPU_FLAG} ${MAC_FLAG} || + # Call run_pip_tests.sh to perform test-on-install + "${SCRIPT_DIR}/run_pip_tests.sh" --virtualenv ${GPU_FLAG} ${MAC_FLAG} || die "PIP tests-on-install FAILED" fi @@ -279,6 +307,24 @@ if [[ "${DO_TEST_USER_OPS}" == "1" ]]; then die "PIP user-op tests-on-install FAILED" fi +# Test TensorFlow Debugger (tfdbg) examples. 
+if [[ "${DO_TEST_TFDBG_BINARIES}" == "1" ]]; then + echo + echo "Testing TensorFlow Debugger (tfdbg) binaries" + echo + + # cd to a temporary directory to avoid picking up Python files in the source + # tree. + TMP_DIR=$(mktemp -d) + pushd "${TMP_DIR}" + + "${SCRIPT_DIR}/../../../python/debug/examples/examples_test.sh" \ + --virtualenv || \ + die "PIP tests-on-install of tfdbg binaries FAILED" + + popd +fi + # Optional: Run the tutorial tests if [[ "${DO_TEST_TUTORIALS}" == "1" ]]; then "${SCRIPT_DIR}/test_tutorials.sh" --virtualenv || \ diff --git a/tensorflow/tools/ci_build/builds/run_pip_tests.sh b/tensorflow/tools/ci_build/builds/run_pip_tests.sh new file mode 100755 index 00000000000..8e364f7ffb7 --- /dev/null +++ b/tensorflow/tools/ci_build/builds/run_pip_tests.sh @@ -0,0 +1,131 @@ +#!/bin/bash +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ============================================================================== +# +# Run the python unit tests from the source code on the pip installation. +# +# Usage: +# run_pip_tests.sh [--virtualenv] [--gpu] [--mac] +# +# If the flag --virtualenv is set, the script will use "python" as the Python +# binary path. Otherwise, it will use tools/python_bin_path.sh to determine +# the Python binary path. +# +# The --gpu flag informs the script that this is a GPU build, so that the +# appropriate test blacklists can be applied accordingly. +# +# The --mac flag informs the script that this is running on mac. Mac does not +# have flock, so we should skip using parallel_gpu_execute on mac. +# +# TF_BUILD_APPEND_ARGUMENTS: +# Additional command line arguments for the bazel, +# pip.sh or android.sh command + +# Script directory +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/builds_common.sh" + +# Process input arguments +IS_VIRTUALENV=0 +IS_GPU=0 +IS_MAC=0 +while true; do + if [[ "$1" == "--virtualenv" ]]; then + IS_VIRTUALENV=1 + elif [[ "$1" == "--gpu" ]]; then + IS_GPU=1 + elif [[ "$1" == "--mac" ]]; then + IS_MAC=1 + fi + shift + + if [[ -z "$1" ]]; then + break + fi +done + +TF_GPU_COUNT=${TF_GPU_COUNT:-8} + +# PIP tests should have a "different" path. Different than the one we place +# virtualenv, because we are deleting and recreating it here. +PIP_TEST_PREFIX=bazel_pip +PIP_TEST_ROOT=$(pwd)/${PIP_TEST_PREFIX} +rm -rf $PIP_TEST_ROOT +mkdir -p $PIP_TEST_ROOT +ln -s $(pwd)/tensorflow ${PIP_TEST_ROOT}/tensorflow + +# Do not run tests with "no_pip" tag. If running GPU tests, also do not run +# tests with no_pip_gpu tag. +PIP_TEST_FILTER_TAG="-no_pip" +if [[ ${IS_GPU} == "1" ]]; then + PIP_TEST_FILTER_TAG="-no_pip_gpu,${PIP_TEST_FILTER_TAG}" +fi + +# Bazel flags we need for all tests: +# define=no_tensorflow_py_deps=true, to skip all test dependencies. +# test_lang_filters=py only py tests for pip package testing +# TF_BUILD_APPEND_ARGUMENTS any user supplied args. 
+BAZEL_FLAGS="--define=no_tensorflow_py_deps=true --test_lang_filters=py \ + --build_tests_only -k --test_tag_filters=${PIP_TEST_FILTER_TAG} \ + --test_timeout 300,450,1200,3600 ${TF_BUILD_APPEND_ARGUMENTS}" + +BAZEL_TEST_TARGETS="//${PIP_TEST_PREFIX}/tensorflow/contrib/... \ + //${PIP_TEST_PREFIX}/tensorflow/python/... \ + -//${PIP_TEST_PREFIX}/tensorflow/contrib/tensorboard/..." + +# Clean the bazel cache +bazel clean + +# Run configure again, we might be using a different python path, due to +# virtualenv. +export TF_NEED_GCP=0 +export TF_NEED_HDFS=0 +export TF_ENABLE_XLA=${TF_BUILD_ENABLE_XLA:-0} + +# Obtain the path to Python binary +if [[ ${IS_VIRTUALENV} == "1" ]]; then + PYTHON_BIN_PATH="$(which python)" +else + source tools/python_bin_path.sh + # Assume: PYTHON_BIN_PATH is exported by the script above +fi + +export TF_NEED_CUDA=$IS_GPU +yes "" | ./configure + +# Figure out how many concurrent tests we can run and do run the tests. +BAZEL_PARALLEL_TEST_FLAGS="" +if [[ $IS_GPU == 1 ]]; then + # Number of test threads is the number of GPU cards available. + if [[ $IS_MAC == 1 ]]; then + BAZEL_PARALLEL_TEST_FLAGS="--local_test_jobs=1" + else + PAR_TEST_JOBS=$TF_GPU_COUNT + BAZEL_PARALLEL_TEST_FLAGS="--local_test_jobs=${TF_GPU_COUNT} \ + --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute" + fi +else + # Number of test threads is the number of physical CPUs. + if [[ $IS_MAC == 1 ]]; then + BAZEL_PARALLEL_TEST_FLAGS="--local_test_jobs=$(sysctl -n hw.ncpu)" + else + BAZEL_PARALLEL_TEST_FLAGS="--local_test_jobs=$(grep -c ^processor /proc/cpuinfo)" + fi +fi + +# Actually run the tests. +bazel test ${BAZEL_FLAGS} ${BAZEL_PARALLEL_TEST_FLAGS} -- \ + ${BAZEL_TEST_TARGETS} diff --git a/tensorflow/tools/ci_build/builds/test_installation.sh b/tensorflow/tools/ci_build/builds/test_installation.sh deleted file mode 100755 index eb64fbcf185..00000000000 --- a/tensorflow/tools/ci_build/builds/test_installation.sh +++ /dev/null @@ -1,603 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2016 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -# -# Build the Python PIP installation package for TensorFlow -# and run the Python unit tests from the source code on the installation -# -# Usage: -# test_installation.sh [--virtualenv] [--gpu] [--mac] -# -# If the flag --virtualenv is set, the script will use "python" as the Python -# binary path. Otherwise, it will use tools/python_bin_path.sh to determine -# the Python binary path. -# -# The --gpu flag informs the script that this is a GPU build, so that the -# appropriate test blacklists can be applied accordingly. -# -# The --mac flag informs the script that this is running on mac. Mac does not -# have flock, so we should skip using parallel_gpu_execute on mac. 
-# -# When executing the Python unit tests, the script obeys the shell -# variables: PY_TEST_WHITELIST, PY_TEST_BLACKLIST, PY_TEST_GPU_BLACKLIST, -# -# To select only a subset of the Python tests to run, set the environment -# variable PY_TEST_WHITELIST, e.g., -# PY_TEST_WHITELIST="tensorflow/python/kernel_tests/shape_ops_test.py" -# Separate the tests with a colon (:). Leave this environment variable empty -# to disable the whitelist. -# -# You can also ignore a set of the tests by using the environment variable -# PY_TEST_BLACKLIST. For example, you can include in PY_TEST_BLACKLIST the -# tests that depend on Python modules in TensorFlow source that are not -# exported publicly. -# -# In addition, you can put blacklist for only GPU build inthe environment -# variable PY_TEST_GPU_BLACKLIST. -# -# TF_BUILD_BAZEL_CLEAN, if set to any non-empty and non-0 value, directs the -# script to perform bazel clean prior to main build and test steps. -# -# TF_GPU_COUNT, Set the number of GPUs in the system. We run only this many -# concurrent tests when running GPU tests. -# -# TF_BUILD_EXTRA_EXCLUSIVE_INSTALL_TESTS, add to the default list of -# Python unit tests to run in exclusive mode (i.e., not concurrently with -# other tests), separated with colons -# -# TF_BUILD_FILTER_INSTALL_TESTS_BY_TAG: If set to a non-empty string -# (e.g., "local"), will filter the Python install-tests by that string as -# bazel tags. Multiple filter tags can be used. Both the inclusive filtering -# mode and the exclusive filtering mode can be used. For example: -# -# TF_BUILD_FILTER_INSTALL_TESTS_BY_TAG="local,-manual" -# -# will let the script run the Python unit tests that have the tag "local" -# and do not have the tag "manual". The "-" marks the exclusive filtering -# mode. The inclusive mode is the default. Use commas to separate the tags. -# -# If the environmental variable NO_TEST_ON_INSTALL is set to any non-empty -# value, the script will exit after the pip install step. - -# ============================================================================= -# Test blacklist: General -# -# tensorflow/python/framework/ops_test.py -# depends on depends on "test_ops", which is defined in a C++ file wrapped as -# a .py file through the Bazel rule “tf_gen_ops_wrapper_py”. -# tensorflow/util/protobuf/compare_test.py: -# depends on compare_test_pb2 defined outside Python -# tensorflow/python/framework/device_test.py: -# depends on CheckValid() and ToString(), both defined externally -# tensorflow/python/framework/file_system_test.py: -# depends on having the .so which is not shipped in the pip package. -# tensorflow/contrib/quantization/*: -# These depend on an .so mechanism that's not shipped in the pip package. -# tensorflow/python/platform/default/*_test.py: -# These are obsolete and replaced by corresponding files in python/platform. -# They will be removed in the future. 
- -PY_TEST_BLACKLIST="${PY_TEST_BLACKLIST}:"\ -"tensorflow/python/framework/ops_test.py:"\ -"tensorflow/python/util/protobuf/compare_test.py:"\ -"tensorflow/python/framework/device_test.py:"\ -"tensorflow/python/framework/file_system_test.py:"\ -"tensorflow/contrib/quantization/python/dequantize_op_test.py:"\ -"tensorflow/contrib/quantization/python/quantized_conv_ops_test.py:"\ -"tensorflow/contrib/quantization/tools/quantize_graph_test.py:"\ -"tensorflow/contrib/session_bundle/bundle_shim_test.py:"\ -"tensorflow/contrib/session_bundle/exporter_test.py:"\ -"tensorflow/contrib/session_bundle/session_bundle_test.py:"\ -"tensorflow/python/platform/default/_resource_loader_test.py:"\ -"tensorflow/python/platform/default/flags_test.py:"\ -"tensorflow/python/platform/default/logging_test.py:"\ -"tensorflow/python/saved_model/saved_model_test.py:"\ -"tensorflow/contrib/learn/nonlinear_test.py:"\ -"tensorflow/contrib/distributions/python/kernel_tests/conditional_distribution_test.py:"\ -"tensorflow/contrib/distributions/python/kernel_tests/conditional_transformed_distribution_test.py:" - -# Test blacklist: GPU-only -PY_TEST_GPU_BLACKLIST="${PY_TEST_GPU_BLACKLIST}:"\ -"tensorflow/python/client/session_test.py:"\ -"tensorflow/python/framework/function_test.py:"\ -"tensorflow/contrib/integrate/python/ops/odes_test.py:"\ -"tensorflow/contrib/tensor_forest/python/kernel_tests/scatter_add_ndim_op_test.py" - -# Tests that should be run in the exclusive mode (i.e., not parallel with -# other tests) -PY_TEST_EXCLUSIVE_LIST="" - -# Append custom list of exclusive tests -if [[ ! -z "${TF_BUILD_EXTRA_EXCLUSIVE_INSTALL_TESTS}" ]]; then - PY_TEST_EXCLUSIVE_LIST="${PY_TEST_EXCLUSIVE_LIST}:"\ -"${TF_BUILD_EXTRA_EXCLUSIVE_INSTALL_TESTS}" -fi - -# ============================================================================= - -echo "PY_TEST_WHITELIST: ${PY_TEST_WHITELIST}" -echo "PY_TEST_BLACKLIST: ${PY_TEST_BLACKLIST}" -echo "PY_TEST_GPU_BLACKLIST: ${PY_TEST_GPU_BLACKLIST}" - - -# Script directory -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -source "${SCRIPT_DIR}/builds_common.sh" - -TF_GPU_COUNT=${TF_GPU_COUNT:-8} - -# Process input arguments -IS_VIRTUALENV=0 -IS_GPU=0 -IS_MAC=0 -while true; do - if [[ "$1" == "--virtualenv" ]]; then - IS_VIRTUALENV=1 - elif [[ "$1" == "--gpu" ]]; then - IS_GPU=1 - elif [[ "$1" == "--mac" ]]; then - IS_MAC=1 - fi - shift - - if [[ -z "$1" ]]; then - break - fi -done - -# Obtain the path to Python binary -if [[ ${IS_VIRTUALENV} == "1" ]]; then - PYTHON_BIN_PATH="$(which python)" -else - source tools/python_bin_path.sh - # Assume: PYTHON_BIN_PATH is exported by the script above -fi - -# Obtain the path to head/ghead binary (for log file printing) -HEAD_BIN="ghead" -if [[ -z $(which "${HEAD_BIN}") ]]; then - # This is not Mac (which uses coreutils/ghead), use head. - HEAD_BIN="head" - if [[ -z $(which "${HEAD_BIN}") ]]; then - die "Unable to obtain path to head or ghead" - fi -fi - -if [[ -z "${PYTHON_BIN_PATH}" ]]; then - die "PYTHON_BIN_PATH was not provided. If this is not virtualenv, "\ -"did you run configure?" -fi - -# Append GPU-only test blacklist -if [[ ${IS_GPU} == "1" ]]; then - PY_TEST_BLACKLIST="${PY_TEST_BLACKLIST}:${PY_TEST_GPU_BLACKLIST}" -fi - -# Determine the major and minor versions of Python being used (e.g., 2.7) -# This info will be useful for determining the directory of the local pip -# installation of Python -PY_MAJOR_MINOR_VER=$(${PYTHON_BIN_PATH} -V 2>&1 | awk '{print $NF}' | cut -d. 
-f-2) - -echo "Python binary path to be used in PIP install-test: ${PYTHON_BIN_PATH} "\ -"(Major.Minor version: ${PY_MAJOR_MINOR_VER})" - -# Avoid permission issues outside container -umask 000 - -# Directory from which the unit-test files will be run -PY_TEST_DIR_REL="pip_test/tests" -PY_TEST_DIR=$(realpath ${PY_TEST_DIR_REL}) # Get absolute path -rm -rf ${PY_TEST_DIR} && mkdir -p ${PY_TEST_DIR} - -# Create test log directory -PY_TEST_LOG_DIR_REL=${PY_TEST_DIR_REL}/logs -PY_TEST_LOG_DIR=$(realpath ${PY_TEST_LOG_DIR_REL}) # Absolute path - -mkdir ${PY_TEST_LOG_DIR} - -# Copy source files that are required by the tests but are not included in the -# PIP package - -# Look for local Python library directory -# pushd/popd avoids importing TensorFlow from the source directory. -pushd /tmp > /dev/null -TF_INSTALL_PATH=$(dirname \ - $("${PYTHON_BIN_PATH}" -c "import tensorflow as tf; print(tf.__file__)")) -popd > /dev/null - -if [[ -z ${TF_INSTALL_PATH} ]]; then - die "Failed to find path where TensorFlow is installed." -else - echo "Found TensorFlow install path: ${TF_INSTALL_PATH}" -fi - -echo "Copying some source directories required by Python unit tests but "\ -"not included in install to TensorFlow install path: ${TF_INSTALL_PATH}" - -# Files for tensorflow.python.tools -rm -rf ${TF_INSTALL_PATH}/python/tools -cp -r tensorflow/python/tools \ - ${TF_INSTALL_PATH}/python/tools -touch ${TF_INSTALL_PATH}/python/tools/__init__.py # Make module visible - -# Files for tensorflow.examples -rm -rf ${TF_INSTALL_PATH}/examples/image_retraining -mkdir -p ${TF_INSTALL_PATH}/examples/image_retraining -cp -r tensorflow/examples/image_retraining/retrain.py \ - ${TF_INSTALL_PATH}/examples/image_retraining/retrain.py -touch ${TF_INSTALL_PATH}/examples/__init__.py -touch ${TF_INSTALL_PATH}/examples/image_retraining/__init__.py - -echo "Copying additional files required by tests to working directory "\ -"for test: ${PY_TEST_DIR}" - -# Image files required by some tests, e.g., images_ops_test.py - -mkdir -p ${PY_TEST_DIR}/tensorflow/core/lib -rm -rf ${PY_TEST_DIR}/tensorflow/core/lib/jpeg -cp -r tensorflow/core/lib/jpeg ${PY_TEST_DIR}/tensorflow/core/lib -rm -rf ${PY_TEST_DIR}/tensorflow/core/lib/png -cp -r tensorflow/core/lib/png ${PY_TEST_DIR}/tensorflow/core/lib -rm -rf ${PY_TEST_DIR}/tensorflow/core/lib/gif -cp -r tensorflow/core/lib/gif ${PY_TEST_DIR}/tensorflow/core/lib - -# Copy test data from tensorflow/contrib/ffmpeg - -mkdir -p ${PY_TEST_DIR}/tensorflow/contrib/ffmpeg -rm -rf ${PY_TEST_DIR}/tensorflow/contrib/ffmpeg/testdata -cp -r tensorflow/contrib/ffmpeg/testdata ${PY_TEST_DIR} - -# Run tests -DIR0=$(pwd) -ALL_PY_TESTS_0=$(find tensorflow/{contrib,examples,python,tensorboard} \ - -type f \( -name "*_test.py" -o -name "test_*.py" \) | sort) - - -# Subroutine for filtering test file names by a bazel tag. -filter_tests_by_bazel_tag() { - # Usage: filter_tests_by_bazel_tag (--inclusive | --exclusive) - # - # - # E.g., filter_tests_by_bazel_tag --inclusive "local" - # "dir1/test1.py dir2/test2.py" - # - # Use the flag --inclusive so that only the tests that have the tag will be - # included in the returned string. - # Use the flag --exclusive so that the returned string will consist of only - # the tests that do not have the tag. - # INPUT_TESTS are the name of the input Python unit test files, seperated by - # spaces. 
- # - # The output string (through stdout) is: OUTPUT_TESTS | DISCARDED_TESTS - # That is: a list of tests that passed the filter, followed by " | ", - # followed by a list of tests that are discarded - - FILTER_MODE=$1 - TAG=$2 - INPUT_TESTS=$3 - - # Input sanity checks - if [[ "${FILTER_MODE}" != "--inclusive" ]] && - [[ "${FILTER_MODE}" != "--exclusive" ]]; then - echo "ERROR: Unrecognized filter mode: ${FILTER_MODE}" - exit 1 - fi - if [[ -z "${TAG}" ]]; then - echo "ERROR: Bazal tag is not supplied" - exit 1 - fi - if [[ -z "${INPUT_TESTS}" ]]; then - echo "ERROR: INPUT_TESTS is not supplied" - exit 1 - fi - - # Check bazel on path - if [[ -z $(which bazel) ]]; then - echo "ERROR: bazel is not on path" - exit 1 - fi - - # Get all bazel targets that have the specified tag - BAZEL_TARGETS=\ -$(bazel query "kind(py_test, attr(tags, "${TAG}", //tensorflow/...))" | sort) - - TARGET_ALIASES=":" - for TARGET in ${BAZEL_TARGETS}; do - # Transform, e.g., //tensorflow/python/kernel_tests:xent_op_test --> - # python-xent_op_test - # to be compared with the transformed strings from the Python unit test - # file names. - TARGET_1=$(echo "${TARGET}" | sed "s/:/ /g") - TARGET_PATH_1=$(echo "${TARGET_1}" | sed "s/\/\// /g" | sed "s/\// /g" \ - | awk '{print $2}') - TARGET_BASE_NAME=$(echo "${TARGET_1}" | awk '{print $NF}') - TARGET_ALIAS="${TARGET_PATH_1}-${TARGET_BASE_NAME}" - - TARGET_ALIASES="${TARGET_ALIASES}${TARGET_ALIAS}:" - done - TARGET_ALIASES="${TARGET_ALIASES}:" - - # Filter the list of tests obtained from listing files with the bazel query - # results. - TESTS_PASSED_FILTER="" - TESTS_BLOCKED_BY_FILTER="" - for PY_TEST in ${INPUT_TESTS}; do - # Transform, e.g., tensorflow/python/kernel_tests/xent_op_test.py --> - # python-xent_op_test - PY_TEST_PATH_1=$(echo "${PY_TEST}" | sed "s/\// /g" | awk '{print $2}') - PY_TEST_BASE_NAME=$(echo "${PY_TEST}" | sed "s/\// /g" \ - | awk '{print $NF}' | sed "s/\.py//g") - PY_TEST_ALIAS="${PY_TEST_PATH_1}-${PY_TEST_BASE_NAME}" - - TO_INCLUDE=0 - if [[ "${TARGET_ALIASES}" == *"${PY_TEST_ALIAS}"* ]] && \ - [[ "${FILTER_MODE}" == "--inclusive" ]]; then - TO_INCLUDE=1 - elif [[ "${TARGET_ALIASES}" != *"${PY_TEST_ALIAS}"* ]] && \ - [[ "${FILTER_MODE}" == "--exclusive" ]]; then - TO_INCLUDE=1 - fi - - if [[ ${TO_INCLUDE} == 1 ]]; then - TESTS_PASSED_FILTER="${TESTS_PASSED_FILTER} ${PY_TEST}" - else - TESTS_BLOCKED_BY_FILTER="${TESTS_BLOCKED_BY_FILTER} ${PY_TEST}" - fi - done - - echo "${TESTS_PASSED_FILTER} | ${TESTS_BLOCKED_BY_FILTER}" -} - - -if [[ ${TF_BUILD_FILTER_INSTALL_TESTS_BY_TAG} != "" ]]; then - # Iteratively apply the filter tags - TAGS=(${TF_BUILD_FILTER_INSTALL_TESTS_BY_TAG//,/ }) - for TAG in ${TAGS[@]}; do - if [[ ${TAG} == "-"* ]]; then - MODE="--exclusive" - TAG_1=$(echo ${TAG} | sed 's/-//') - else - MODE="--inclusive" - TAG_1=${TAG} - fi - - FILTER_OUTPUT=$(filter_tests_by_bazel_tag ${MODE} \ - "${TAG_1}" "${ALL_PY_TESTS_0}") - ALL_PY_TESTS_0=$(echo "${FILTER_OUTPUT}" | cut -d \| -f 1) - DISCARDED_TESTS=$(echo "${FILTER_OUTPUT}" | cut -d \| -f 2) - N_DISCARDED=$(echo "${DISCARDED_TESTS}" | wc -w) - - echo "" - echo "Skipping ${N_DISCARDED} test(s) due to filter tag \"${TAG}\":" - echo "${DISCARDED_TESTS}" - echo "" - done -fi - -# Move the exclusive tests to the back of the list -EXCLUSIVE_LIST="$(echo "${PY_TEST_EXCLUSIVE_LIST}" | sed -e 's/:/ /g')" - -ALL_PY_TESTS="" -for TEST in ${ALL_PY_TESTS_0}; do - if [[ ! 
${PY_TEST_EXCLUSIVE_LIST} == *"${TEST}"* ]]; then - ALL_PY_TESTS="${ALL_PY_TESTS} ${TEST}" - fi -done - -# Number of parallel (non-exclusive) tests -N_PAR_TESTS=$(echo ${ALL_PY_TESTS} | wc -w) -echo "Number of non-exclusive tests: ${N_PAR_TESTS}" - -for TEST in ${EXCLUSIVE_LIST}; do - ALL_PY_TESTS="${ALL_PY_TESTS} ${TEST}" -done - -PY_TEST_COUNT=$(echo ${ALL_PY_TESTS} | wc -w) - -if [[ ${PY_TEST_COUNT} -eq 0 ]]; then - die "ERROR: Cannot find any tensorflow Python unit tests to run on install" -fi - -# Iterate through all the Python unit test files using the installation -TEST_COUNTER=0 -PASS_COUNTER=0 -FAIL_COUNTER=0 -SKIP_COUNTER=0 -FAILED_TESTS="" -FAILED_TEST_LOGS="" - -if [[ "${IS_GPU}" == "1" ]]; then - if [[ "${IS_MAC}" == "1" ]]; then - N_JOBS=1 - else - N_JOBS=$TF_GPU_COUNT - fi -else - N_JOBS=$(grep -c ^processor /proc/cpuinfo) - if [[ -z ${N_JOBS} ]]; then - # Try the Mac way of getting number of CPUs - N_JOBS=$(sysctl -n hw.ncpu) - fi - - # If still cannot determine the number of CPUs, pick 8. - if [[ -z ${N_JOBS} ]]; then - N_JOBS=8 - echo "Cannot determine the number of processors" - echo "Using default concurrent job counter ${N_JOBS}" - fi -fi - -echo "Running Python tests-on-install with ${N_JOBS} concurrent jobs..." - -ALL_PY_TESTS=(${ALL_PY_TESTS}) -while true; do - TEST_LOGS="" - TEST_INDICES="" - TEST_FILE_PATHS="" - TEST_BASENAMES="" - - ITER_COUNTER=0 - while true; do - # Break if the end is reached - if [[ "${TEST_COUNTER}" -ge "${PY_TEST_COUNT}" ]]; then - break; - fi - - # for TEST_FILE_PATH in ${ALL_PY_TESTS}; do - TEST_FILE_PATH=${ALL_PY_TESTS[TEST_COUNTER]} - - ((TEST_COUNTER++)) - ((ITER_COUNTER++)) - - # If PY_TEST_WHITELIST is not empty, only the white-listed tests will be run - if [[ ! -z ${PY_TEST_WHITELIST} ]] && \ - [[ ! ${PY_TEST_WHITELIST} == *"${TEST_FILE_PATH}"* ]]; then - ((SKIP_COUNTER++)) - echo "Non-whitelisted test SKIPPED: ${TEST_FILE_PATH}" - - continue - fi - - # If the test is in the black list, skip it - if [[ ${PY_TEST_BLACKLIST} == *"${TEST_FILE_PATH}"* ]]; then - ((SKIP_COUNTER++)) - echo "Blacklisted test SKIPPED: ${TEST_FILE_PATH}" - continue - fi - - TEST_INDICES="${TEST_INDICES} ${TEST_COUNTER}" - TEST_FILE_PATHS="${TEST_FILE_PATHS} ${TEST_FILE_PATH}" - - # Copy to a separate directory to guard against the possibility of picking - # up modules in the source directory - cp ${TEST_FILE_PATH} ${PY_TEST_DIR}/ - - TEST_BASENAME=$(basename "${TEST_FILE_PATH}") - TEST_BASENAMES="${TEST_BASENAMES} ${TEST_BASENAME}" - - # Relative path of the test log. Use long path in case there are duplicate - # file names in the Python tests - TEST_LOG_REL="${PY_TEST_LOG_DIR_REL}/${TEST_FILE_PATH}.log" - mkdir -p $(dirname ${TEST_LOG_REL}) # Create directory for log - - TEST_LOG=$(realpath ${TEST_LOG_REL}) # Absolute path - TEST_LOGS="${TEST_LOGS} ${TEST_LOG}" - - # Launch test asynchronously - if [[ "${IS_GPU}" == "1" ]] && [[ "${IS_MAC}" == "0" ]]; then - # Only use this script without mac. This uses flock, which is not - # available in MacOSX. 
- "${SCRIPT_DIR}/../gpu_build/parallel_gpu_execute.sh" \ - "${SCRIPT_DIR}/py_test_delegate.sh" \ - "${PYTHON_BIN_PATH}" "${PY_TEST_DIR}/${TEST_BASENAME}" "${TEST_LOG}" & - else - "${SCRIPT_DIR}/py_test_delegate.sh" \ - "${PYTHON_BIN_PATH}" "${PY_TEST_DIR}/${TEST_BASENAME}" "${TEST_LOG}" & - fi - - if [[ "${TEST_COUNTER}" -ge "${N_PAR_TESTS}" ]]; then - # Run in exclusive mode - if [[ "${TEST_COUNTER}" -gt "${N_PAR_TESTS}" ]]; then - echo "Run test exclusively: ${PY_TEST_DIR}/${TEST_BASENAME}" - fi - break - fi - - if [[ "${ITER_COUNTER}" -ge "${N_JOBS}" ]] || - [[ "${TEST_COUNTER}" -ge "${PY_TEST_COUNT}" ]]; then - break - fi - - done - - # Wait for all processes to complete - wait - - TEST_LOGS=(${TEST_LOGS}) - TEST_FILE_PATHS=(${TEST_FILE_PATHS}) - TEST_BASENAMES=(${TEST_BASENAMES}) - - K=0 - for TEST_INDEX in ${TEST_INDICES}; do - TEST_FILE_PATH=${TEST_FILE_PATHS[K]} - TEST_RESULT=$(tail -1 "${TEST_LOGS[K]}" | awk '{print $1}') - ELAPSED_TIME=$(tail -1 "${TEST_LOGS[K]}" | cut -d' ' -f2-) - - PROG_STR="(${TEST_INDEX} / ${PY_TEST_COUNT})" - # Check for pass or failure status of the test outtput and exit - if [[ ${TEST_RESULT} -eq 0 ]]; then - ((PASS_COUNTER++)) - - echo "${PROG_STR} Python test-on-install PASSED (${ELAPSED_TIME}): ${TEST_FILE_PATH}" - else - ((FAIL_COUNTER++)) - - FAILED_TESTS="${FAILED_TESTS} ${TEST_FILE_PATH}" - FAILED_TEST_LOGS="${FAILED_TEST_LOGS} ${TEST_LOGS[K]}" - - echo "${PROG_STR} Python test-on-install FAILED (${ELAPSED_TIME}): ${TEST_FILE_PATH}" - - echo " Log @: ${TEST_LOGS[K]}" - echo "============== BEGINS failure log content ==============" - "${HEAD_BIN}" --lines=-1 "${TEST_LOGS[K]}" - echo "============== ENDS failure log content ==============" - echo "" - fi - cd ${DIR0} - - # Clean up files for this test - rm -f ${TEST_BASENAMES[K]} - - ((K++)) - done - - # Stop if the end is reached - if [[ "${TEST_COUNTER}" -ge "${PY_TEST_COUNT}" ]]; then - break; - fi -done - -# Clean up files copied for Python unit tests: -rm -rf ${TF_INSTALL_PATH}/python/tools -rm -rf ${TF_INSTALL_PATH}/examples/image_retraining -rm -rf ${PY_TEST_DIR}/tensorflow/core/lib/jpeg -rm -rf ${PY_TEST_DIR}/tensorflow/core/lib/png -rm -rf ${PY_TEST_DIR}/testdata - -echo "" -echo "${PY_TEST_COUNT} Python test(s):" \ - "${PASS_COUNTER} passed;" \ - "${FAIL_COUNTER} failed; " \ - "${SKIP_COUNTER} skipped" -echo "Test logs directory: ${PY_TEST_LOG_DIR_REL}" - -if [[ ${FAIL_COUNTER} -eq 0 ]]; then - echo "" - echo "Python test-on-install SUCCEEDED" - - exit 0 -else - echo "FAILED test(s):" - FAILED_TEST_LOGS=($FAILED_TEST_LOGS) - FAIL_COUNTER=0 - for TEST_NAME in ${FAILED_TESTS}; do - echo " ${TEST_NAME} (Log @: ${FAILED_TEST_LOGS[${FAIL_COUNTER}]})" - ((FAIL_COUNTER++)) - done - - echo "" - echo "Python test-on-install FAILED" - exit 1 -fi diff --git a/tensorflow/tools/ci_build/builds/test_user_ops.sh b/tensorflow/tools/ci_build/builds/test_user_ops.sh index 216abbe8e67..3b7e2348ad1 100755 --- a/tensorflow/tools/ci_build/builds/test_user_ops.sh +++ b/tensorflow/tools/ci_build/builds/test_user_ops.sh @@ -123,7 +123,7 @@ if [[ ${IS_GPU} == "0" ]]; then EXPECTED_OUTPUT="[42, 0, 0]" # Locate the op kernel C++ file - OP_KERNEL_CC="${SCRIPT_DIR}/../../../g3doc/how_tos/adding_an_op/zero_out_op_kernel_1.cc" + OP_KERNEL_CC="${SCRIPT_DIR}/user_ops/zero_out_op_kernel_1.cc" OP_KERNEL_CC=$(realpath "${OP_KERNEL_CC}") if [[ ! 
-f "${OP_KERNEL_CC}" ]]; then @@ -162,13 +162,13 @@ else "${NVCC_BIN}" --version echo "" - OP_KERNEL_CU="${SCRIPT_DIR}/../../../g3doc/how_tos/adding_an_op/cuda_op_kernel.cu.cc" + OP_KERNEL_CU="${SCRIPT_DIR}/user_ops/cuda_op_kernel.cu.cc" OP_KERNEL_CU=$(realpath "${OP_KERNEL_CU}") if [[ ! -f "${OP_KERNEL_CU}" ]]; then die "ERROR: Unable to find user-op kernel CUDA file at: ${OP_KERNEL_CU}" fi - OP_KERNEL_CC="${SCRIPT_DIR}/../../../g3doc/how_tos/adding_an_op/cuda_op_kernel.cc" + OP_KERNEL_CC="${SCRIPT_DIR}/user_ops/cuda_op_kernel.cc" OP_KERNEL_CC=$(realpath "${OP_KERNEL_CC}") if [[ ! -f "${OP_KERNEL_CC}" ]]; then die "ERROR: Unable to find user-op kernel C++ file at: ${OP_KERNEL_CC}" diff --git a/tensorflow/g3doc/how_tos/adding_an_op/cuda_op_kernel.cc b/tensorflow/tools/ci_build/builds/user_ops/cuda_op_kernel.cc similarity index 100% rename from tensorflow/g3doc/how_tos/adding_an_op/cuda_op_kernel.cc rename to tensorflow/tools/ci_build/builds/user_ops/cuda_op_kernel.cc diff --git a/tensorflow/tools/ci_build/builds/user_ops/cuda_op_kernel.cu.cc b/tensorflow/tools/ci_build/builds/user_ops/cuda_op_kernel.cu.cc new file mode 100644 index 00000000000..65b50bd3ae9 --- /dev/null +++ b/tensorflow/tools/ci_build/builds/user_ops/cuda_op_kernel.cu.cc @@ -0,0 +1,31 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +__global__ void AddOneKernel(const int* in, const int N, int* out) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; + i += blockDim.x * gridDim.x) { + out[i] = in[i] + 1; + } +} + +void AddOneKernelLauncher(const int* in, const int N, int* out) { + AddOneKernel<<<32, 256>>>(in, N, out); +} + +#endif diff --git a/tensorflow/g3doc/how_tos/adding_an_op/zero_out_op_kernel_1.cc b/tensorflow/tools/ci_build/builds/user_ops/zero_out_op_kernel_1.cc similarity index 100% rename from tensorflow/g3doc/how_tos/adding_an_op/zero_out_op_kernel_1.cc rename to tensorflow/tools/ci_build/builds/user_ops/zero_out_op_kernel_1.cc diff --git a/tensorflow/tools/ci_build/ci_build.sh b/tensorflow/tools/ci_build/ci_build.sh index 3697fd46a0e..9c1b75d0048 100755 --- a/tensorflow/tools/ci_build/ci_build.sh +++ b/tensorflow/tools/ci_build/ci_build.sh @@ -18,7 +18,7 @@ # # # CONTAINER_TYPE: Type of the docker container used the run the build: -# e.g., (cpu | gpu | android | tensorboard) +# e.g., (cpu | gpu | gpu_clang | android | tensorboard) # # DOCKERFILE_PATH: (Optional) Path to the Dockerfile used for docker build. # If this optional value is not supplied (via the @@ -26,7 +26,7 @@ # directory as this script will be used. 
# # COMMAND: Command to be executed in the docker container, e.g., -# tensorflow/tools/ci_build/builds/pip.sh gpu +# tensorflow/tools/ci_build/builds/pip.sh gpu -c opt --config=cuda SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "${SCRIPT_DIR}/builds/builds_common.sh" @@ -80,11 +80,11 @@ fi # cmake (CPU) builds do not require configuration. if [[ "${CONTAINER_TYPE}" == "cmake" ]]; then - CI_COMMAND_PREFIX="" + CI_COMMAND_PREFIX=("") fi # Use nvidia-docker if the container is GPU. -if [[ "${CONTAINER_TYPE}" == "gpu" ]]; then +if [[ "${CONTAINER_TYPE}" == "gpu" ]] || [[ "${CONTAINER_TYPE}" == "gpu_clang" ]]; then DOCKER_BINARY="nvidia-docker" else DOCKER_BINARY="docker" @@ -104,7 +104,7 @@ BUILD_TAG="${BUILD_TAG:-tf_ci}" # Add extra params for cuda devices and libraries for GPU container. # And clear them if we are not building for GPU. -if [ "${CONTAINER_TYPE}" != "gpu" ]; then +if [[ "${CONTAINER_TYPE}" != "gpu" ]] && [[ "${CONTAINER_TYPE}" != "gpu_clang" ]]; then GPU_EXTRA_PARAMS="" fi @@ -120,9 +120,9 @@ DOCKER_IMG_NAME=$(echo "${DOCKER_IMG_NAME}" | tr '[:upper:]' '[:lower:]') # Print arguments. echo "WORKSPACE: ${WORKSPACE}" -echo "CI_DOCKER_EXTRA_PARAMS: ${CI_DOCKER_EXTRA_PARAMS[@]}" -echo "COMMAND: ${COMMAND[@]}" -echo "CI_COMMAND_PREFIX: ${CI_COMMAND_PREFIX[@]}" +echo "CI_DOCKER_EXTRA_PARAMS: ${CI_DOCKER_EXTRA_PARAMS[*]}" +echo "COMMAND: ${COMMAND[*]}" +echo "CI_COMMAND_PREFIX: ${CI_COMMAND_PREFIX[*]}" echo "CONTAINER_TYPE: ${CONTAINER_TYPE}" echo "BUILD_TAG: ${BUILD_TAG}" echo " (docker container name will be ${DOCKER_IMG_NAME})" @@ -140,7 +140,7 @@ if [[ $? != "0" ]]; then fi # Run the command inside the container. -echo "Running '${COMMAND[@]}' inside ${DOCKER_IMG_NAME}..." +echo "Running '${COMMAND[*]}' inside ${DOCKER_IMG_NAME}..." mkdir -p ${WORKSPACE}/bazel-ci_build-cache # By default we cleanup - remove the container once it finishes running (--rm) # and share the PID namespace (--pid=host) so the process inside does not have diff --git a/tensorflow/tools/ci_build/ci_parameterized_build.sh b/tensorflow/tools/ci_build/ci_parameterized_build.sh index 4fd1277d63b..1cf87d7c7c0 100755 --- a/tensorflow/tools/ci_build/ci_parameterized_build.sh +++ b/tensorflow/tools/ci_build/ci_parameterized_build.sh @@ -18,7 +18,7 @@ # ci_parameterized_build.sh # # The script obeys the following required environment variables: -# TF_BUILD_CONTAINER_TYPE: (CPU | GPU | ANDROID | ANDROID_FULL) +# TF_BUILD_CONTAINER_TYPE: (CPU | GPU | GPU_CLANG | ANDROID | ANDROID_FULL) # TF_BUILD_PYTHON_VERSION: (PYTHON2 | PYTHON3 | PYTHON3.5) # TF_BUILD_IS_PIP: (NO_PIP | PIP | BOTH) # @@ -84,10 +84,14 @@ # support for Google Cloud Platform (GCP), which is # enabled by default. # TF_BUILD_OPTIONS: -# (FASTBUILD | OPT | OPTDBG | MAVX | MAVX2) +# (FASTBUILD | OPT | OPTDBG | MAVX | MAVX2_FMA | MAVX_DBG | +# MAVX2_FMA_DBG) # Use the specified configurations when building. # When set, overrides TF_BUILD_IS_OPT and TF_BUILD_MAVX # options, as this will replace the two. +# TF_SKIP_CONTRIB_TESTS: +# If set to any non-empty and non-0 value, contrib tests will be +# skipped. # # This script can be used by Jenkins parameterized / matrix builds. 
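# For example (an illustrative combination of the values listed above, not a
# real Jenkins configuration), a single matrix cell might invoke:
#   TF_BUILD_CONTAINER_TYPE=GPU TF_BUILD_PYTHON_VERSION=PYTHON3 \
#   TF_BUILD_IS_PIP=PIP TF_BUILD_OPTIONS=MAVX2_FMA \
#   tensorflow/tools/ci_build/ci_parameterized_build.sh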
@@ -117,8 +121,7 @@ DOCKER_MAIN_CMD="${CI_BUILD_DIR}/ci_build.sh" NO_DOCKER_MAIN_CMD="${CI_BUILD_DIR}/builds/configured" # Additional option flags to apply when Docker is unavailable (e.g., on Mac) -NO_DOCKER_OPT_FLAG="--linkopt=-headerpad_max_install_names "\ -"--genrule_strategy=standalone" +NO_DOCKER_OPT_FLAG="--genrule_strategy=standalone" DO_DOCKER=1 @@ -147,6 +150,10 @@ else EXTRA_PARAMS="${EXTRA_PARAMS} -e TF_BUILD_ENABLE_XLA=1" fi +if [[ -n "$TF_SKIP_CONTRIB_TESTS" ]]; then + BAZEL_TARGET="$BAZEL_TARGET -//tensorflow/contrib/..." +fi + TUT_TEST_DATA_DIR="/tmp/tf_tutorial_test_data" ########################################################## @@ -193,8 +200,8 @@ echo " TF_BUILD_ENABLE_XLA=${TF_BUILD_ENABLE_XLA}" function get_cuda_capability_version() { if [[ ! -z $(which deviceQuery) ]]; then # The first listed device is used - echo $(deviceQuery | grep "CUDA Capability .* version" | \ - head -1 | awk '{print $NF}') + deviceQuery | grep "CUDA Capability .* version" | \ + head -1 | awk '{print $NF}' fi } @@ -217,8 +224,13 @@ fi # Process container type if [[ ${CTYPE} == "cpu" ]] || [[ ${CTYPE} == "debian.jessie.cpu" ]]; then : -elif [[ ${CTYPE} == "gpu" ]]; then - OPT_FLAG="${OPT_FLAG} --config=cuda" +elif [[ ${CTYPE} == "gpu" ]] || [[ ${CTYPE} == "gpu_clang" ]]; then + if [[ ${CTYPE} == "gpu" ]]; then + OPT_FLAG="${OPT_FLAG} --config=cuda" + else # ${CTYPE} == "gpu_clang" + OPT_FLAG="${OPT_FLAG} --config=cuda_clang" + fi + # Attempt to determine CUDA capability version automatically and use it if # CUDA capability version is not specified by the environment variables. @@ -305,11 +317,14 @@ else MAVX) OPT_FLAG="${OPT_FLAG} -c opt --copt=-mavx" ;; - MAVXDBG) + MAVX_DBG) OPT_FLAG="${OPT_FLAG} -c opt --copt=-g --copt=-mavx" ;; - MAVX2) - OPT_FLAG="${OPT_FLAG} -c opt --copt=-mavx2" + MAVX2_FMA) + OPT_FLAG="${OPT_FLAG} -c opt --copt=-mavx2 --copt=-mfma" + ;; + MAVX2_FMA_DBG) + OPT_FLAG="${OPT_FLAG} -c opt --copt=-g --copt=-mavx2 --copt=-mfma" ;; esac fi @@ -318,21 +333,35 @@ fi OPT_FLAG=$(str_strip "${OPT_FLAG}") -# Filter out benchmark tests if this is not a benchmarks job +# 1) Filter out benchmark tests if this is not a benchmarks job; +# 2) Filter out tests with the "nomac" tag if the build is on Mac OS X. EXTRA_ARGS="" +IS_MAC=0 +if [[ "$(uname)" == "Darwin" ]]; then + IS_MAC=1 +fi if [[ "${TF_BUILD_APPEND_ARGUMENTS}" == *"--test_tag_filters="* ]]; then ITEMS=(${TF_BUILD_APPEND_ARGUMENTS}) for ITEM in "${ITEMS[@]}"; do - if [[ ${ITEM} == *"--test_tag_filters="* ]] && - [[ ${ITEM} != *"benchmark-test"* ]]; then - EXTRA_ARGS="${EXTRA_ARGS} ${ITEM},-benchmark-test" + if [[ ${ITEM} == *"--test_tag_filters="* ]]; then + NEW_ITEM="${ITEM}" + if [[ ${NEW_ITEM} != *"benchmark-test"* ]]; then + NEW_ITEM="${NEW_ITEM},-benchmark-test" + fi + if [[ ${IS_MAC} == "1" ]] && [[ ${NEW_ITEM} != *"nomac"* ]]; then + NEW_ITEM="${NEW_ITEM},-nomac" + fi + EXTRA_ARGS="${EXTRA_ARGS} ${NEW_ITEM}" else EXTRA_ARGS="${EXTRA_ARGS} ${ITEM}" fi done else EXTRA_ARGS="${TF_BUILD_APPEND_ARGUMENTS} --test_tag_filters=-benchmark-test" + if [[ ${IS_MAC} == "1" ]]; then + EXTRA_ARGS="${EXTRA_ARGS},-nomac" + fi fi # For any "tool" dependencies in genrules, Bazel will build them for host @@ -353,7 +382,7 @@ if [[ ${TF_BUILD_IS_PIP} == "no_pip" ]] || # CPU only command, fully parallel. 
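# ("Fully parallel" means bazel's default local test parallelism is left in
# place here; by contrast, the gpu/gpu_clang branch below caps the job count
# with --local_test_jobs=${TF_GPU_COUNT}.)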
NO_PIP_MAIN_CMD="${MAIN_CMD} ${BAZEL_CMD} ${OPT_FLAG} ${EXTRA_ARGS} -- "\ "${BAZEL_TARGET}" - elif [[ ${CTYPE} == "gpu" ]]; then + elif [[ ${CTYPE} == "gpu" ]] || [[ ${CTYPE} == "gpu_clang" ]]; then # GPU only command, run as many jobs as the GPU count only. NO_PIP_MAIN_CMD="${BAZEL_CMD} ${OPT_FLAG} "\ "--local_test_jobs=${TF_GPU_COUNT} "\ @@ -377,12 +406,7 @@ if [[ ${TF_BUILD_IS_PIP} == "pip" ]] || exit 0 fi - PIP_MAIN_CMD="${MAIN_CMD} ${PIP_CMD} ${CTYPE} ${EXTRA_AGRS}" - - # Add flag for mavx/mavx2 - if [[ ! -z "${TF_BUILD_MAVX}" ]]; then - PIP_MAIN_CMD="${PIP_MAIN_CMD} --${TF_BUILD_MAVX}" - fi + PIP_MAIN_CMD="${MAIN_CMD} ${PIP_CMD} ${CTYPE} ${EXTRA_ARGS} ${OPT_FLAG}" # Add flag for integration tests if [[ ! -z "${TF_BUILD_INTEGRATION_TESTS}" ]] && @@ -424,7 +448,8 @@ if [[ ${TF_BUILD_PYTHON_VERSION} == "python2" ]]; then : elif [[ ${TF_BUILD_PYTHON_VERSION} == "python3" || \ ${TF_BUILD_PYTHON_VERSION} == "python3.4" || \ - ${TF_BUILD_PYTHON_VERSION} == "python3.5" ]]; then + ${TF_BUILD_PYTHON_VERSION} == "python3.5" || \ + ${TF_BUILD_PYTHON_VERSION} == "python3.6" ]]; then # Supply proper environment variable to select Python 3 if [[ "${DO_DOCKER}" == "1" ]]; then EXTRA_PARAMS="${EXTRA_PARAMS} -e CI_BUILD_PYTHON=${TF_BUILD_PYTHON_VERSION}" @@ -507,11 +532,14 @@ if [[ "${TF_BUILD_PYTHON_VERSION}" == "python3.5" ]]; then DOCKERFILE="${TMP_DIR}/Dockerfile.${TF_BUILD_CONTAINER_TYPE}" # Replace a line in the Dockerfile - sed -i \ + if sed -i \ 's/RUN \/install\/install_pip_packages.sh/RUN \/install\/install_python3.5_pip_packages.sh/g' \ - "${DOCKERFILE}" && \ - echo "Copied and modified Dockerfile for Python 3.5 build: ${DOCKERFILE}" || \ - die "ERROR: Faild to copy and modify Dockerfile: ${DOCKERFILE}" + "${DOCKERFILE}" + then + echo "Copied and modified Dockerfile for Python 3.5 build: ${DOCKERFILE}" + else + die "ERROR: Faild to copy and modify Dockerfile: ${DOCKERFILE}" + fi DOCKERFILE_FLAG="--dockerfile ${DOCKERFILE}" fi @@ -549,7 +577,7 @@ rm -f ${TMP_SCRIPT} END_TIME=$(date +'%s') echo "" echo "Parameterized build ends with ${RESULT} at: $(date) "\ -"(Elapsed time: $((${END_TIME} - ${START_TIME})) s)" +"(Elapsed time: $((END_TIME - START_TIME)) s)" # Clean up temporary directory if it exists diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh index 975a14e7d51..e428766a400 100755 --- a/tensorflow/tools/ci_build/ci_sanity.sh +++ b/tensorflow/tools/ci_build/ci_sanity.sh @@ -47,7 +47,7 @@ num_cpus() { # Get the hash of the last non-merge git commit on the current branch. 
# Usage: get_last_non_merge_git_commit get_last_non_merge_git_commit() { - echo $(git rev-list --no-merges -n 1 HEAD) + git rev-list --no-merges -n 1 HEAD } # List files changed (i.e., added, removed or revised) in the last non-merge @@ -75,7 +75,7 @@ get_py_files_to_check() { echo "${PY_FILES}" else - echo $(find tensorflow -name '*.py') + find tensorflow -name '*.py' fi } @@ -92,6 +92,8 @@ do_pylint() { ERROR_WHITELIST="^tensorflow/python/framework/function_test\.py.*\[E1123.*noinline "\ "^tensorflow/python/platform/default/_gfile\.py.*\[E0301.*non-iterator "\ "^tensorflow/python/platform/default/_googletest\.py.*\[E0102.*function\salready\sdefined "\ +"^tensorflow/python/feature_column/feature_column_test\.py.*\[E0110.*abstract-class-instantiated "\ +"^tensorflow/contrib/layers/python/layers/feature_column\.py.*\[E0110.*abstract-class-instantiated "\ "^tensorflow/python/platform/gfile\.py.*\[E0301.*non-iterator" echo "ERROR_WHITELIST=\"${ERROR_WHITELIST}\"" @@ -155,25 +157,25 @@ do_pylint() { NONWL_ERRORS_FILE="$(mktemp)_pylint_nonwl_errors.log" rm -rf ${OUTPUT_FILE} - rm -rf ${ERRORS_FLIE} + rm -rf ${ERRORS_FILE} rm -rf ${NONWL_ERRORS_FILE} touch ${NONWL_ERRORS_FILE} ${PYLINT_BIN} --rcfile="${PYLINTRC_FILE}" --output-format=parseable \ - --jobs=${NUM_CPUS} ${PYTHON_SRC_FILES} 2>&1 > ${OUTPUT_FILE} + --jobs=${NUM_CPUS} ${PYTHON_SRC_FILES} > ${OUTPUT_FILE} 2>&1 PYLINT_END_TIME=$(date +'%s') echo "" - echo "pylint took $((${PYLINT_END_TIME} - ${PYLINT_START_TIME})) s" + echo "pylint took $((PYLINT_END_TIME - PYLINT_START_TIME)) s" echo "" grep -E '(\[E|\[W0311|\[W0312)' ${OUTPUT_FILE} > ${ERRORS_FILE} N_ERRORS=0 - while read LINE; do + while read -r LINE; do IS_WHITELISTED=0 for WL_REGEX in ${ERROR_WHITELIST}; do - if [[ ! -z $(echo ${LINE} | grep "${WL_REGEX}") ]]; then + if echo ${LINE} | grep -q "${WL_REGEX}"; then echo "Found a whitelisted error:" echo " ${LINE}" IS_WHITELISTED=1 @@ -246,7 +248,7 @@ do_pep8() { PEP8_END_TIME=$(date +'%s') echo "" - echo "pep8 took $((${PEP8_END_TIME} - ${PEP8_START_TIME})) s" + echo "pep8 took $((PEP8_END_TIME - PEP8_START_TIME)) s" echo "" if [[ -s ${PEP8_OUTPUT_FILE} ]]; then @@ -276,7 +278,7 @@ do_buildifier(){ BUILDIFIER_END_TIME=$(date +'%s') echo "" - echo "buildifier took $((${BUILDIFIER_END_TIME} - ${BUILDIFIER_START_TIME})) s" + echo "buildifier took $((BUILDIFIER_END_TIME - BUILDIFIER_START_TIME)) s" echo "" if [[ -s ${BUILDIFIER_OUTPUT_FILE} ]]; then @@ -304,7 +306,7 @@ do_external_licenses_check(){ echo "Getting external dependencies for ${BUILD_TARGET}" bazel query "attr('licenses', 'notice', deps(${BUILD_TARGET}))" --no_implicit_deps --no_host_deps --keep_going \ - | egrep -v "^//tensorflow" \ + | grep -E -v "^//tensorflow" \ | sed -e 's|:.*||' \ | sort \ | uniq 2>&1 \ @@ -313,7 +315,7 @@ do_external_licenses_check(){ echo echo "Getting list of external licenses mentioned in ${LICENSES_TARGET}." 
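# (Illustrative: the query pipeline below drops TensorFlow-internal targets
# with grep -E -v "^//tensorflow", strips everything from the ':' onward with
# sed, and then sorts and deduplicates, leaving one external dependency per
# line for the later comparison.)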
bazel query "deps(${LICENSES_TARGET})" --no_implicit_deps --no_host_deps --keep_going \ - | egrep -v "^//tensorflow" \ + | grep -E -v "^//tensorflow" \ | sed -e 's|:.*||' \ | sort \ | uniq 2>&1 \ @@ -327,7 +329,7 @@ do_external_licenses_check(){ EXTERNAL_LICENSES_CHECK_END_TIME=$(date +'%s') echo - echo "do_external_licenses_check took $((${EXTERNAL_LICENSES_CHECK_END_TIME} - ${EXTERNAL_LICENSES_CHECK_START_TIME})) s" + echo "do_external_licenses_check took $((EXTERNAL_LICENSES_CHECK_END_TIME - EXTERNAL_LICENSES_CHECK_START_TIME)) s" echo if [[ -s ${MISSING_LICENSES_FILE} ]] || [[ -s ${EXTRA_LICENSES_FILE} ]] ; then @@ -371,17 +373,20 @@ do_lib_package_licenses_check() { "//tensorflow/tools/lib_package:clicenses_generate" } -# Run bazel build --nobuild to test the validity of the BUILD files -do_bazel_nobuild() { - BUILD_TARGET="//tensorflow/..." - BUILD_CMD="bazel build --nobuild ${BUILD_TARGET}" - - ${BUILD_CMD} +do_java_package_licenses_check() { + echo "Running do_java_package_licenses_check" + echo "" + do_external_licenses_check \ + "//tensorflow/java:libtensorflow_jni.so" \ + "//tensorflow/tools/lib_package:jnilicenses_generate" +} +#Check for the bazel cmd status (First arg is error message) +cmd_status(){ if [[ $? != 0 ]]; then echo "" echo "FAIL: ${BUILD_CMD}" - echo " This is due to invalid BUILD files. See lines above for details." + echo " $1 See lines above for details." return 1 else echo "" @@ -390,9 +395,32 @@ do_bazel_nobuild() { fi } +# Run bazel build --nobuild to test the validity of the BUILD files +do_bazel_nobuild() { + BUILD_TARGET="//tensorflow/..." + BUILD_CMD="bazel build --nobuild ${BUILD_TARGET}" + + ${BUILD_CMD} + + cmd_status \ + "This is due to invalid BUILD files." +} + +do_pip_smoke_test() { + BUILD_CMD="bazel build //tensorflow/tools/pip_package:pip_smoke_test" + ${BUILD_CMD} + cmd_status \ + "Pip smoke test has failed. Please make sure any new TensorFlow are added to the tensorflow/tools/pip_package:build_pip_package dependencies." + + RUN_CMD="bazel-bin/tensorflow/tools/pip_package/pip_smoke_test" + ${RUN_CMD} + cmd_status \ + "The pip smoke test failed." +} + # Supply all sanity step commands and descriptions -SANITY_STEPS=("do_pylint PYTHON2" "do_pylint PYTHON3" "do_buildifier" "do_bazel_nobuild" "do_pip_package_licenses_check" "do_lib_package_licenses_check") -SANITY_STEPS_DESC=("Python 2 pylint" "Python 3 pylint" "buildifier check" "bazel nobuild" "pip: license check for external dependencies" "C library: license check for external dependencies") +SANITY_STEPS=("do_pylint PYTHON2" "do_pylint PYTHON3" "do_buildifier" "do_bazel_nobuild" "do_pip_package_licenses_check" "do_lib_package_licenses_check" "do_java_package_licenses_check" "do_pip_smoke_test") +SANITY_STEPS_DESC=("Python 2 pylint" "Python 3 pylint" "buildifier check" "bazel nobuild" "pip: license check for external dependencies" "C library: license check for external dependencies" "Java Native Library: license check for external dependencies" "Pip Smoke Test: Checking py_test dependencies exist in pip package") INCREMENTAL_FLAG="" diff --git a/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh b/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh index c33ea2d5cc6..6e7b752c06f 100755 --- a/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh +++ b/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh @@ -24,6 +24,17 @@ # TF_GPU_COUNT = Number of GPUs available. This HAS TO BE IN SYNC with the # value of --local_test_jobs flag for bazel. 
+BASH_VER_MAJOR=$(echo ${BASH_VERSION} | cut -d '.' -f 1) +BASH_VER_MINOR=$(echo ${BASH_VERSION} | cut -d '.' -f 2) + +if [[ ${BASH_VER_MAJOR} -lt 4 ]]; then + echo "Insufficient bash version: ${BASH_VERSION} < 4.2" >&2 + exit 1 +elif [[ ${BASH_VER_MAJOR} -eq 4 ]] && [[ ${BASH_VER_MINOR} -lt 2 ]]; then + echo "Insufficient bash version: ${BASH_VERSION} < 4.2" >&2 + exit 1 +fi + TF_GPU_COUNT=${TF_GPU_COUNT:-8} for i in `seq 0 $((TF_GPU_COUNT-1))`; do diff --git a/tensorflow/tools/ci_build/install/build_and_install_clang.sh b/tensorflow/tools/ci_build/install/build_and_install_clang.sh new file mode 100755 index 00000000000..3fb99649485 --- /dev/null +++ b/tensorflow/tools/ci_build/install/build_and_install_clang.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +set -ex + +LLVM_SVN_REVISION="299268" +CLANG_TMP_DIR=/tmp/clang-build + +mkdir "$CLANG_TMP_DIR" + +pushd "$CLANG_TMP_DIR" + +# Checkout llvm+clang +svn co -q -r$LLVM_SVN_REVISION http://llvm.org/svn/llvm-project/llvm/trunk "$CLANG_TMP_DIR/llvm" +svn co -q -r$LLVM_SVN_REVISION http://llvm.org/svn/llvm-project/cfe/trunk "$CLANG_TMP_DIR/llvm/tools/clang" + +# Build 1st stage. Compile clang with system compiler +mkdir "$CLANG_TMP_DIR/build-1" +cd "$CLANG_TMP_DIR/build-1" +cmake -G"Unix Makefiles" -DCMAKE_BUILD_TYPE=Release "$CLANG_TMP_DIR/llvm" +make -j `nproc` clang clang-headers + +# Build 2nd stage. Compile clang with clang built in stage 1 +mkdir "$CLANG_TMP_DIR/build-2" +cd "$CLANG_TMP_DIR/build-2" + +CC="$CLANG_TMP_DIR/build-1/bin/clang" \ +CXX="$CLANG_TMP_DIR/build-1/bin/clang++" \ +cmake -G"Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr/local "$CLANG_TMP_DIR/llvm" + +make -j `nproc` install-clang install-clang-headers + +popd + +# Cleanup +rm -rf "$CLANG_TMP_DIR" diff --git a/tensorflow/tools/ci_build/install/install_auditwheel.sh b/tensorflow/tools/ci_build/install/install_auditwheel.sh index 2538a393d3f..e6f6124d567 100755 --- a/tensorflow/tools/ci_build/install/install_auditwheel.sh +++ b/tensorflow/tools/ci_build/install/install_auditwheel.sh @@ -16,7 +16,7 @@ set -e -sudo pip3 install auditwheel +sudo pip3 install auditwheel==1.5.0 set +e patchelf_location=$(which patchelf) diff --git a/tensorflow/tools/ci_build/install/install_bazel.sh b/tensorflow/tools/ci_build/install/install_bazel.sh index 6807202f7e9..daba126f889 100755 --- a/tensorflow/tools/ci_build/install/install_bazel.sh +++ b/tensorflow/tools/ci_build/install/install_bazel.sh @@ -15,7 +15,7 @@ # ============================================================================== # Select bazel version. 
-BAZEL_VERSION="0.4.2" +BAZEL_VERSION="0.5.0" set +e local_bazel_ver=$(bazel version 2>&1 | grep -i label | awk '{print $3}') diff --git a/tensorflow/tools/ci_build/install/install_buildifier.sh b/tensorflow/tools/ci_build/install/install_buildifier.sh index 2f3470881a5..b2dfcf8db76 100755 --- a/tensorflow/tools/ci_build/install/install_buildifier.sh +++ b/tensorflow/tools/ci_build/install/install_buildifier.sh @@ -16,8 +16,9 @@ set -e BUILDIFIER_DIR="buildifier" -rm -rf ${BUILDIFIER_DIR} -git clone https://github.com/bazelbuild/buildifier.git ${BUILDIFIER_DIR} +mkdir ${BUILDIFIER_DIR} +curl -Ls https://github.com/bazelbuild/buildifier/archive/0.4.5.tar.gz | \ + tar -C "${BUILDIFIER_DIR}" --strip-components=1 -xz pushd ${BUILDIFIER_DIR} bazel build buildifier:buildifier --spawn_strategy=standalone --genrule_strategy=standalone diff --git a/tensorflow/tools/ci_build/install/install_cmake_for_clang.sh b/tensorflow/tools/ci_build/install/install_cmake_for_clang.sh new file mode 100755 index 00000000000..3e626a69ab5 --- /dev/null +++ b/tensorflow/tools/ci_build/install/install_cmake_for_clang.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +CMAKE_URL="https://cmake.org/files/v3.7/cmake-3.7.2-Linux-x86_64.tar.gz" + +wget -O - "${CMAKE_URL}" | tar xzf - -C /usr/local --strip-components=1 diff --git a/tensorflow/tools/ci_build/install/install_deb_packages.sh b/tensorflow/tools/ci_build/install/install_deb_packages.sh index 227b83ab9f6..da1f2199d0d 100755 --- a/tensorflow/tools/ci_build/install/install_deb_packages.sh +++ b/tensorflow/tools/ci_build/install/install_deb_packages.sh @@ -13,11 +13,20 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +# +# Usage: +# ./install_deb_packages [--without_cmake] +# Pass --without_cmake to prevent cmake from being installed with apt-get set -e ubuntu_version=$(cat /etc/issue | grep -i ubuntu | awk '{print $2}' | \ awk -F'.' '{print $1}') +if [[ "$1" != "" ]] && [[ "$1" != "--without_cmake" ]]; then + echo "Unknown argument '$1'" + exit 1 +fi + # Install dependencies from ubuntu deb repository. apt-get update @@ -32,28 +41,38 @@ apt-get install -y --no-install-recommends \ autoconf \ automake \ build-essential \ - cmake \ curl \ ffmpeg \ git \ libcurl4-openssl-dev \ libtool \ + mlocate \ openjdk-8-jdk \ openjdk-8-jre-headless \ pkg-config \ python-dev \ - python-pip \ + python-setuptools \ python-virtualenv \ python3-dev \ - python3-pip \ + python3-setuptools \ rsync \ sudo \ + subversion \ swig \ unzip \ wget \ zip \ zlib1g-dev +# populate the database +updatedb + +if [[ "$1" != "--without_cmake" ]]; then + apt-get install -y --no-install-recommends \ + cmake +fi + + # Install ca-certificates, and update the certificate store. 
apt-get install -y ca-certificates-java update-ca-certificates -f diff --git a/tensorflow/tools/ci_build/install/install_golang.sh b/tensorflow/tools/ci_build/install/install_golang.sh new file mode 100755 index 00000000000..fef203b8697 --- /dev/null +++ b/tensorflow/tools/ci_build/install/install_golang.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +set -ex + +GOLANG_URL="https://storage.googleapis.com/golang/go1.7.5.linux-amd64.tar.gz" + +sudo mkdir -p /usr/local +wget -q -O - "${GOLANG_URL}" | sudo tar -C /usr/local -xz diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh index 8e374df6321..c9867796f3a 100755 --- a/tensorflow/tools/ci_build/install/install_pip_packages.sh +++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh @@ -16,153 +16,75 @@ set -e +# We don't apt-get install so that we can install a newer version of pip. Not +# needed after we upgrade to Ubuntu 16.04 +easy_install -U pip +easy_install3 -U pip + # Install pip packages from whl files to avoid the time-consuming process of # building from source. -pip install wheel +pip2 install wheel pip3 install wheel # Install six. -pip install --upgrade six==1.10.0 +pip2 install --upgrade six==1.10.0 pip3 install --upgrade six==1.10.0 # Install werkzeug. -pip install --upgrade werkzeug==0.11.10 +pip2 install --upgrade werkzeug==0.11.10 pip3 install --upgrade werkzeug==0.11.10 +# Install bleach. html5lib will be picked up as a dependency. +pip2 install --upgrade bleach==1.5.0 +pip3 install --upgrade bleach==1.5.0 + +# Install markdown. +pip2 install --upgrade markdown==2.6.8 +pip3 install --upgrade markdown==2.6.8 + # Install protobuf. -pip install --upgrade protobuf==3.0.0 -pip3 install --upgrade protobuf==3.0.0 +pip2 install --upgrade protobuf==3.3.0 +pip3 install --upgrade protobuf==3.3.0 # Remove obsolete version of six, which can sometimes confuse virtualenv. rm -rf /usr/lib/python3/dist-packages/six* -set +e -# Use pip to install numpy to a modern version, instead of 1.8.2 that comes -# with apt-get in ubuntu:14.04. 
-NUMPY_VERSION="1.11.0" -numpy_ver_flat=$(echo $NUMPY_VERSION | sed 's/\.//g' | sed 's/^0*//g') -local_numpy_ver=$(python -c "import numpy; print(numpy.__version__)") -local_numpy_ver_flat=$(echo $local_numpy_ver | sed 's/\.//g' | sed 's/^0*//g') -if [[ -z $local_numpy_ver_flat ]]; then - local_numpy_ver_flat=0 -fi -if (( $local_numpy_ver_flat < $numpy_ver_flat )); then - set -e - wget -q https://pypi.python.org/packages/06/92/3c786303889e6246971ad4c48ac2b4e37a1b1c67c0dc2106dc85cb15c18e/numpy-1.11.0-cp27-cp27mu-manylinux1_x86_64.whl#md5=6ffb66ff78c28c55bfa09a2ceee487df - mv numpy-1.11.0-cp27-cp27mu-manylinux1_x86_64.whl \ - numpy-1.11.0-cp27-none-linux_x86_64.whl - pip install numpy-1.11.0-cp27-none-linux_x86_64.whl - rm numpy-1.11.0-cp27-none-linux_x86_64.whl -fi +# numpy needs to be installed from source to fix segfaults. See: +# https://github.com/tensorflow/tensorflow/issues/6968 +# This workaround isn't needed for Ubuntu 16.04 or later. +pip2 install --no-binary=:all: --upgrade numpy==1.12.0 +pip3 install --no-binary=:all: --upgrade numpy==1.12.0 -set +e -local_numpy_ver=$(python3 -c "import numpy; print(numpy.__version__)") -local_numpy_ver_flat=$(echo $local_numpy_ver | sed 's/\.//g' | sed 's/^0*//g') -if [[ -z $local_numpy_ver_flat ]]; then - local_numpy_ver_flat=0 -fi -if (( $local_numpy_ver_flat < $numpy_ver_flat )); then - set -e - wget -q https://pypi.python.org/packages/ea/ca/5e48a68be496e6f79c3c8d90f7c03ea09bbb154ea4511f5b3d6c825cefe5/numpy-1.11.0-cp34-cp34m-manylinux1_x86_64.whl#md5=08a002aeffa20354aa5045eadb549361 - mv numpy-1.11.0-cp34-cp34m-manylinux1_x86_64.whl \ - numpy-1.11.0-cp34-none-linux_x86_64.whl - pip3 install numpy-1.11.0-cp34-none-linux_x86_64.whl - rm numpy-1.11.0-cp34-none-linux_x86_64.whl -fi +pip2 install scipy==0.18.1 +pip3 install scipy==0.18.1 -# Use pip to install scipy to get the latest version, instead of 0.13 through -# apt-get. 
-# pip install scipy==0.15.1 -set +e -SCIPY_VERSION="0.15.1" -scipy_ver_flat=$(echo $SCIPY_VERSION | sed 's/\.//g' | sed 's/^0*//g') -local_scipy_ver=$(python -c "import scipy; print(scipy.__version__)") -local_scipy_ver_flat=$(echo $local_scipy_ver | sed 's/\.//g' | sed 's/^0*//g') -if [[ -z $local_scipy_ver_flat ]]; then - local_scipy_ver_flat=0 -fi -if (( $local_scipy_ver_flat < $scipy_ver_flat )); then - set -e - wget -q https://pypi.python.org/packages/00/0f/060ec52cb74dc8df1a7ef1a524173eb0bcd329110404869b392685cfc5c8/scipy-0.15.1-cp27-cp27mu-manylinux1_x86_64.whl#md5=aaac02e6535742ab02f2075129890714 - mv scipy-0.15.1-cp27-cp27mu-manylinux1_x86_64.whl \ - scipy-0.15.1-cp27-none-linux_x86_64.whl - pip install scipy-0.15.1-cp27-none-linux_x86_64.whl - rm scipy-0.15.1-cp27-none-linux_x86_64.whl -fi - -# pip3 install scipy==0.15.1 -set +e -local_scipy_ver=$(python3 -c "import scipy; print(scipy.__version__)") -local_scipy_ver_flat=$(echo $local_scipy_ver | sed 's/\.//g' | sed 's/^0*//g') -if [[ -z $local_scipy_ver_flat ]]; then - local_scipy_ver_flat=0 -fi -if (( $local_scipy_ver_flat < $scipy_ver_flat )); then - set -e - wget -q https://pypi.python.org/packages/56/c5/e0d36aaf719aa02ee3da19151045912e240d145586612e53b5eaa706e1db/scipy-0.15.1-cp34-cp34m-manylinux1_x86_64.whl#md5=d5243b0f9d85f4f4cb62514c82af93d4 - mv scipy-0.15.1-cp34-cp34m-manylinux1_x86_64.whl \ - scipy-0.15.1-cp34-cp34m-linux_x86_64.whl - pip3 install scipy-0.15.1-cp34-cp34m-linux_x86_64.whl - rm scipy-0.15.1-cp34-cp34m-linux_x86_64.whl -fi - -# pip install sklearn -set +e -SKLEARN_VERSION="0.17.1" -sklearn_ver_flat=$(echo $SKLEARN_VERSION | sed 's/\.//g' | sed 's/^0*//g') -local_sklearn_ver=$(python -c "import sklearn; print(sklearn.__version__)") -local_sklearn_ver_flat=$(echo $local_sklearn_ver | sed 's/\.//g' | sed 's/^0*//g') -if [[ -z $local_sklearn_ver_flat ]]; then - local_sklearn_ver_flat=0 -fi -if (( $local_sklearn_ver_flat < $sklearn_ver_flat )); then - set -e - wget -q https://pypi.python.org/packages/bf/80/06e77e5a682c46a3880ec487a5f9d910f5c8d919df9aca58052089687c7e/scikit_learn-0.17.1-cp27-cp27mu-manylinux1_x86_64.whl#md5=337b91f502138ba7fd722803138f6dfd - mv scikit_learn-0.17.1-cp27-cp27mu-manylinux1_x86_64.whl \ - scikit_learn-0.17.1-cp27-none-linux_x86_64.whl - pip install scikit_learn-0.17.1-cp27-none-linux_x86_64.whl - rm scikit_learn-0.17.1-cp27-none-linux_x86_64.whl -fi - -# pip3 install scikit-learn -set +e -local_sklearn_ver=$(python3 -c "import sklearn; print(sklearn.__version__)") -local_sklearn_ver_flat=$(echo $local_sklearn_ver | sed 's/\.//g' | sed 's/^0*//g') -if [[ -z $local_sklearn_ver_flat ]]; then - local_sklearn_ver_flat=0 -fi -if (( $local_sklearn_ver_flat < $sklearn_ver_flat )); then - set -e - wget -q https://pypi.python.org/packages/7e/f1/1cc8a1ae2b4de89bff0981aee904ff05779c49a4c660fa38178f9772d3a7/scikit_learn-0.17.1-cp34-cp34m-manylinux1_x86_64.whl#md5=a722a7372b64ec9f7b49a2532d21372b - mv scikit_learn-0.17.1-cp34-cp34m-manylinux1_x86_64.whl \ - scikit_learn-0.17.1-cp34-cp34m-linux_x86_64.whl - pip3 install scikit_learn-0.17.1-cp34-cp34m-linux_x86_64.whl - rm scikit_learn-0.17.1-cp34-cp34m-linux_x86_64.whl -fi - -set -e +pip2 install scikit-learn==0.18.1 +pip3 install scikit-learn==0.18.1 # pandas required by tf.learn/inflow -pip install pandas==0.18.1 -pip3 install pandas==0.18.1 +pip2 install pandas==0.19.2 +pip3 install pandas==0.19.2 # Benchmark tests require the following: -pip install psutil +pip2 install psutil pip3 install psutil -pip install py-cpuinfo +pip2 install 
py-cpuinfo pip3 install py-cpuinfo # pylint tests require the following: -pip install pylint -pip3 install pylint +pip2 install pylint==1.6.4 +pip3 install pylint==1.6.4 # pep8 tests require the following: -pip install pep8 +pip2 install pep8 pip3 install pep8 # tf.mock require the following for python2: -pip install mock +pip2 install mock -pip install portpicker +pip2 install portpicker pip3 install portpicker + +pip2 install backports.weakref==1.0rc1 +pip3 install backports.weakref==1.0rc1 diff --git a/tensorflow/tools/ci_build/install/install_proto3.sh b/tensorflow/tools/ci_build/install/install_proto3.sh index 773c89b70bb..7934002b2c9 100755 --- a/tensorflow/tools/ci_build/install/install_proto3.sh +++ b/tensorflow/tools/ci_build/install/install_proto3.sh @@ -17,9 +17,9 @@ # Install protobuf3. # Select protobuf version. -PROTOBUF_VERSION="3.2.0" +PROTOBUF_VERSION="3.3.0" protobuf_ver_flat=$(echo $PROTOBUF_VERSION | sed 's/\.//g' | sed 's/^0*//g') -local_protobuf_ver=$(protoc --version | awk '{print $2}') +local_protobuf_ver=$(protoc --version) local_protobuf_ver_flat=$(echo $local_protobuf_ver | sed 's/\.//g' | sed 's/^0*//g') if [[ -z $local_protobuf_ver_flat ]]; then local_protobuf_ver_flat=0 @@ -30,7 +30,7 @@ if (( $local_protobuf_ver_flat < $protobuf_ver_flat )); then PROTOBUF_ZIP=$(basename "${PROTOBUF_URL}") UNZIP_DEST="google-protobuf" - wget -q "${PROTOBUF_URL}" + wget "${PROTOBUF_URL}" unzip "${PROTOBUF_ZIP}" -d "${UNZIP_DEST}" cp "${UNZIP_DEST}/bin/protoc" /usr/local/bin/ diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh index 0c86db71192..33b3bc104bd 100755 --- a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh +++ b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh @@ -64,37 +64,31 @@ set -e pip3.5 install --upgrade six==1.10.0 # Install protobuf. -pip3.5 install --upgrade protobuf==3.0.0 +pip3.5 install --upgrade protobuf==3.3.0 # Remove obsolete version of six, which can sometimes confuse virtualenv. rm -rf /usr/lib/python3/dist-packages/six* # Install numpy, scipy and scikit-learn required by the builds -pip3.5 install --upgrade numpy -set +e -SCIPY_VERSION="0.17.1" -scipy_ver_flat=$(echo $SCIPY_VERSION | sed 's/\.//g' | sed 's/^0*//g') -local_scipy_ver=$(python3.5 -c "import scipy; print(scipy.__version__)") -local_scipy_ver_flat=$(echo $local_scipy_ver | sed 's/\.//g' | sed 's/^0*//g') -if [[ -z $local_scipy_ver_flat ]]; then - local_scipy_ver_flat=0 -fi -if (( $local_scipy_ver_flat < $scipy_ver_flat )); then - set -e - wget -q https://pypi.python.org/packages/91/f3/0052c245d53eb5f0e13b7215811e52af3791a8a7d31771605697c28466a0/scipy-0.17.1-cp35-cp35m-manylinux1_x86_64.whl#md5=8e77756904c81a6f79ed10e3abf0c544 - pip3.5 install --upgrade scipy-0.17.1-cp35-cp35m-manylinux1_x86_64.whl - rm -f scipy-0.17.1-cp35-cp35m-manylinux1_x86_64.whl -fi +# numpy needs to be installed from source to fix segfaults. See: +# https://github.com/tensorflow/tensorflow/issues/6968 +# This workaround isn't needed for Ubuntu 16.04 or later. 
+pip3.5 install --no-binary=:all: --upgrade numpy==1.12.0 -set -e -pip3.5 install --upgrade scikit-learn +pip3.5 install scipy==0.18.1 + +pip3.5 install scikit-learn==0.18.1 + +# pandas required by tf.learn/inflow +pip3.5 install pandas==0.19.2 # Install recent-enough version of wheel for Python 3.5 wheel builds pip3.5 install wheel==0.29.0 -pip3.5 install --upgrade pandas==0.18.1 - pip3.5 install portpicker pip3.5 install werkzeug + +pip3.5 install backports.weakref==1.0rc1 + diff --git a/tensorflow/tools/ci_build/linux/cmake/run.sh b/tensorflow/tools/ci_build/linux/cmake/run.sh old mode 100644 new mode 100755 diff --git a/tensorflow/tools/ci_build/linux/cpu/run.sh b/tensorflow/tools/ci_build/linux/cpu/run.sh deleted file mode 100644 index 4ab545ecb9a..00000000000 --- a/tensorflow/tools/ci_build/linux/cpu/run.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2016 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================== - -set -e -set -x - -N_JOBS=$(grep -c ^processor /proc/cpuinfo) - -echo "" -echo "Bazel will use ${N_JOBS} concurrent job(s)." -echo "" - -# Run configure. -export TF_NEED_GCP=0 -export TF_NEED_HDFS=0 -export TF_NEED_CUDA=0 -export PYTHON_BIN_PATH=`which python2` -yes "" | ./configure - -# Run bazel test command. Double test timeouts to avoid flakes. -bazel test --test_tag_filters=-gpu --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 //tensorflow/... diff --git a/tensorflow/tools/ci_build/linux/cpu/run_cc_core.sh b/tensorflow/tools/ci_build/linux/cpu/run_cc_core.sh new file mode 100755 index 00000000000..467e4ab7e53 --- /dev/null +++ b/tensorflow/tools/ci_build/linux/cpu/run_cc_core.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ============================================================================== + +set -e +set -x + +N_JOBS=$(grep -c ^processor /proc/cpuinfo) + +echo "" +echo "Bazel will use ${N_JOBS} concurrent job(s)." +echo "" + +# Run configure. +export TF_NEED_GCP=0 +export TF_NEED_HDFS=0 +export TF_NEED_CUDA=0 +# Only running cc tests, python version does not matter. +export PYTHON_BIN_PATH=`which python` +yes "" | ./configure + +# Run bazel test command. Double test timeouts to avoid flakes.
+bazel test --test_tag_filters=-gpu,-benchmark-test --test_lang_filters=cc -k \ + --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 --build_tests_only \ + --test_output=errors -- \ + //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/... diff --git a/tensorflow/tools/ci_build/linux/cpu/run_py2_core.sh b/tensorflow/tools/ci_build/linux/cpu/run_py2_core.sh new file mode 100755 index 00000000000..e2bbc0e8c0b --- /dev/null +++ b/tensorflow/tools/ci_build/linux/cpu/run_py2_core.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ============================================================================== + +set -e +set -x + +N_JOBS=$(grep -c ^processor /proc/cpuinfo) + +echo "" +echo "Bazel will use ${N_JOBS} concurrent job(s)." +echo "" + +# Run configure. +export TF_NEED_GCP=0 +export TF_NEED_HDFS=0 +export TF_NEED_CUDA=0 +export PYTHON_BIN_PATH=`which python2` +yes "" | ./configure + +# Run bazel test command. Double test timeouts to avoid flakes. +bazel test --test_tag_filters=-gpu,-benchmark-test --test_lang_filters=py -k \ + --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 --build_tests_only \ + --test_output=errors -- \ + //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/... diff --git a/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh b/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh new file mode 100755 index 00000000000..a03cab0cca5 --- /dev/null +++ b/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ============================================================================== + +set -e +set -x + +N_JOBS=$(grep -c ^processor /proc/cpuinfo) + +echo "" +echo "Bazel will use ${N_JOBS} concurrent job(s)." +echo "" + +# Run configure. +export TF_NEED_GCP=0 +export TF_NEED_HDFS=0 +export TF_NEED_CUDA=0 +export PYTHON_BIN_PATH=`which python3` +yes "" | ./configure + +# Run bazel test command. Double test timeouts to avoid flakes. +bazel test --test_tag_filters=-gpu,-benchmark-test -k \ + --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 \ + --test_output=errors -- \ + //tensorflow/contrib/... 
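The new per-configuration runner scripts above all share one shape: pin the `./configure` answers through `TF_NEED_*` environment variables, feed any remaining prompts with `yes ""`, then let tag and language filters pick the tests. To sanity-check a filter combination before committing to a full `//tensorflow/...` run, the same flags can be pointed at a single package first; this is a hedged sketch, and the target package below is an arbitrary example rather than anything these scripts reference:

```bash
# Dry-run the CPU python-test filters against one package as a cheap smoke check.
bazel test --test_tag_filters=-gpu,-benchmark-test --test_lang_filters=py -k \
  --test_timeout 300,450,1200,3600 --build_tests_only \
  --test_output=errors -- \
  //tensorflow/python/kernel_tests/...
```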
diff --git a/tensorflow/tools/ci_build/linux/cpu/run_py3_core.sh b/tensorflow/tools/ci_build/linux/cpu/run_py3_core.sh new file mode 100755 index 00000000000..32de5cea200 --- /dev/null +++ b/tensorflow/tools/ci_build/linux/cpu/run_py3_core.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ============================================================================== + +set -e +set -x + +N_JOBS=$(grep -c ^processor /proc/cpuinfo) + +echo "" +echo "Bazel will use ${N_JOBS} concurrent job(s)." +echo "" + +# Run configure. +export TF_NEED_GCP=0 +export TF_NEED_HDFS=0 +export TF_NEED_CUDA=0 +export PYTHON_BIN_PATH=`which python3` +yes "" | ./configure + +# Run bazel test command. Double test timeouts to avoid flakes. +bazel test --test_tag_filters=-gpu,-benchmark-test --test_lang_filters=py -k \ + --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 --build_tests_only \ + --test_output=errors -- \ + //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/... diff --git a/tensorflow/tools/ci_build/linux/gpu/run_cc_core.sh b/tensorflow/tools/ci_build/linux/gpu/run_cc_core.sh new file mode 100755 index 00000000000..6acc2621383 --- /dev/null +++ b/tensorflow/tools/ci_build/linux/gpu/run_cc_core.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ============================================================================== + +set -e +set -x + +N_JOBS=$(grep -c ^processor /proc/cpuinfo) + +echo "" +echo "Bazel will use ${N_JOBS} concurrent job(s)." +echo "" + +# Run configure. +export TF_NEED_GCP=0 +export TF_NEED_HDFS=0 +export PYTHON_BIN_PATH=`which python3` + +export TF_NEED_CUDA=1 +export TF_CUDA_COMPUTE_CAPABILITIES=3.7 + +yes "" | ./configure + +# Run bazel test command. Double test timeouts to avoid flakes. +bazel test --config=cuda --test_tag_filters=-no_gpu,-benchmark-test -k \ + --test_lang_filters=cc --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 \ + --build_tests_only --test_output=errors --local_test_jobs=8 \ + --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute -- \ + //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/... 
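The `--run_under` wrapper in the GPU runners above is what makes `--local_test_jobs=8` safe: each test gets pinned to one free GPU instead of every test landing on device 0. The sketch below illustrates the locking idea only; it is not the actual contents of `parallel_gpu_execute.sh`, the lock-file path is a made-up example, and newer-bash features of this sort are presumably what the bash 4.2 version check added earlier guards against:

```bash
# Illustration: claim the first free GPU slot via flock, expose only that
# device to the test, then run the command bazel hands the wrapper ("$@").
TF_GPU_COUNT=${TF_GPU_COUNT:-8}
for i in $(seq 0 $((TF_GPU_COUNT - 1))); do
  exec {lock_fd}>"/tmp/tf_gpu_lock_${i}"   # hypothetical lock-file location
  if flock -n "${lock_fd}"; then
    CUDA_VISIBLE_DEVICES="${i}" "$@"       # the test sees exactly one GPU
    exit $?
  fi
  exec {lock_fd}>&-                        # slot busy: close the fd, try the next
done
echo "Could not acquire any of the ${TF_GPU_COUNT} GPU locks" >&2
exit 1
```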
diff --git a/tensorflow/tools/ci_build/linux/gpu/run_py3_core.sh b/tensorflow/tools/ci_build/linux/gpu/run_py3_core.sh new file mode 100755 index 00000000000..e73fe046c96 --- /dev/null +++ b/tensorflow/tools/ci_build/linux/gpu/run_py3_core.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ============================================================================== + +set -e +set -x + +N_JOBS=$(grep -c ^processor /proc/cpuinfo) + +echo "" +echo "Bazel will use ${N_JOBS} concurrent job(s)." +echo "" + +# Run configure. +export TF_NEED_GCP=0 +export TF_NEED_HDFS=0 +export PYTHON_BIN_PATH=`which python3` + +export TF_NEED_CUDA=1 +export TF_CUDA_COMPUTE_CAPABILITIES=3.7 + +yes "" | ./configure + +# Run bazel test command. Double test timeouts to avoid flakes. +bazel test --config=cuda --test_tag_filters=-no_gpu,-benchmark-test -k \ + --test_lang_filters=py --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 \ + --build_tests_only --test_output=errors --local_test_jobs=8 \ + --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute -- \ + //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/... diff --git a/tensorflow/tools/ci_build/linux/libtensorflow.sh b/tensorflow/tools/ci_build/linux/libtensorflow.sh index bc64fabde5b..beef8e063b3 100755 --- a/tensorflow/tools/ci_build/linux/libtensorflow.sh +++ b/tensorflow/tools/ci_build/linux/libtensorflow.sh @@ -14,9 +14,8 @@ # limitations under the License. # ============================================================================== # -# Script to produce a tarball release of the C-library and associated C API -# header file. Intended to be run inside a docker container. See -# libtensorflow_docker.sh +# Script to produce binary releases for libtensorflow (C API, Java jars etc.). +# Intended to be run inside a docker container. See libtensorflow_docker.sh set -ex SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" diff --git a/tensorflow/tools/ci_build/linux/libtensorflow_cpu.sh b/tensorflow/tools/ci_build/linux/libtensorflow_cpu.sh index c300c4670fd..4bf34dd2993 100755 --- a/tensorflow/tools/ci_build/linux/libtensorflow_cpu.sh +++ b/tensorflow/tools/ci_build/linux/libtensorflow_cpu.sh @@ -14,8 +14,7 @@ # limitations under the License. # ============================================================================== # -# Script to build a binary release tarball for the TensorFlow C-library without -# GPU support. +# Script to build a binary release of libtensorflow without GPU support. set -ex SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" diff --git a/tensorflow/tools/ci_build/linux/libtensorflow_docker.sh b/tensorflow/tools/ci_build/linux/libtensorflow_docker.sh index 5423831caad..dcda8228bc2 100755 --- a/tensorflow/tools/ci_build/linux/libtensorflow_docker.sh +++ b/tensorflow/tools/ci_build/linux/libtensorflow_docker.sh @@ -14,9 +14,9 @@ # limitations under the License.
# ============================================================================== # -# Script to produce a tarball release of the C-library and associated C API -# header file. Builds a docker container and then builds the C-library in -# said container. +# Script to produce a tarball release of the C-library, Java native library +# and Java .jars. +# Builds a docker container and then builds in said container. # # See libtensorflow_cpu.sh and libtensorflow_gpu.sh @@ -29,9 +29,9 @@ DOCKER_IMAGE="tf-libtensorflow-cpu" DOCKER_FILE="Dockerfile.cpu" DOCKER_BINARY="docker" if [ "${TF_NEED_CUDA}" == "1" ]; then - DOCKER_IMAGE="tf-tensorflow-gpu" - DOCKER_BINARY="nvidia-docker" - DOCKER_FILE="Dockerfile.gpu" + DOCKER_IMAGE="tf-tensorflow-gpu" + DOCKER_BINARY="nvidia-docker" + DOCKER_FILE="Dockerfile.gpu" fi docker build \ diff --git a/tensorflow/tools/ci_build/linux/libtensorflow_gpu.sh b/tensorflow/tools/ci_build/linux/libtensorflow_gpu.sh index e13098c220d..6dca0c37c87 100755 --- a/tensorflow/tools/ci_build/linux/libtensorflow_gpu.sh +++ b/tensorflow/tools/ci_build/linux/libtensorflow_gpu.sh @@ -14,8 +14,7 @@ # limitations under the License. # ============================================================================== # -# Script to build a binary release tarball for the TensorFlow C-library without -# GPU support. +# Script to build a binary release of libtensorflow with GPU support. set -ex SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" diff --git a/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh b/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh new file mode 100755 index 00000000000..e5f4a22f7ad --- /dev/null +++ b/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ============================================================================== + +set -e +set -x + +N_JOBS=$(sysctl -n hw.ncpu) +N_JOBS=$((N_JOBS+1)) + +echo "" +echo "Bazel will use ${N_JOBS} concurrent job(s)." +echo "" + +# Run configure. +export TF_NEED_GCP=0 +export TF_NEED_HDFS=0 +export TF_NEED_CUDA=0 +export PYTHON_BIN_PATH=$(which python2) +yes "" | ./configure +which bazel +bazel test --test_tag_filters=-gpu,-benchmark-test,-nomac \ + --test_timeout 300,450,1200,3600 \ + --test_size_filters=small,medium \ + --jobs=${N_JOBS} --build_tests_only --test_output=errors -k -- \ + //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/... \ + -//tensorflow/tensorboard/... diff --git a/tensorflow/tools/ci_build/osx/libtensorflow_cpu.sh b/tensorflow/tools/ci_build/osx/libtensorflow_cpu.sh index 432201156d7..d90a1b905d9 100755 --- a/tensorflow/tools/ci_build/osx/libtensorflow_cpu.sh +++ b/tensorflow/tools/ci_build/osx/libtensorflow_cpu.sh @@ -14,9 +14,7 @@ # limitations under the License.
# ============================================================================== # -# Script to produce a tarball release of the C-library and associated C API -# header file. -# Produces: lib_package/libtensorflow-gpu-darwin-x86_64.tar.gz +# Script to produce binary release of libtensorflow (C API, Java jars etc.). set -ex SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" @@ -30,6 +28,7 @@ export TF_NEED_GCP=0 export TF_NEED_HDFS=0 export TF_NEED_CUDA=0 export TF_NEED_OPENCL=0 +export TF_NEED_MKL=0 export COMPUTECPP_PATH="/usr/local" export PATH="/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin" diff --git a/tensorflow/tools/ci_build/osx/libtensorflow_gpu.sh b/tensorflow/tools/ci_build/osx/libtensorflow_gpu.sh index 5e6f4b9fc2d..79973647c11 100755 --- a/tensorflow/tools/ci_build/osx/libtensorflow_gpu.sh +++ b/tensorflow/tools/ci_build/osx/libtensorflow_gpu.sh @@ -14,9 +14,7 @@ # limitations under the License. # ============================================================================== # -# Script to produce a tarball release of the C-library and associated C API -# header file. -# Produces: lib_package/libtensorflow-gpu-darwin-x86_64.tar.gz +# Script to produce binary release of libtensorflow (C API, Java jars etc.). set -ex SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" @@ -31,6 +29,7 @@ export PYTHON_BIN_PATH="/usr/bin/python" export TF_NEED_GCP=0 export TF_NEED_HDFS=0 export TF_NEED_OPENCL=0 +export TF_NEED_MKL=0 export COMPUTECPP_PATH="/usr/local" export PATH="/usr/local/cuda/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin" diff --git a/tensorflow/tools/ci_build/protobuf/protobuf_optimized_pip.sh b/tensorflow/tools/ci_build/protobuf/protobuf_optimized_pip.sh index 59ba71f5df7..3e31aa1ce10 100755 --- a/tensorflow/tools/ci_build/protobuf/protobuf_optimized_pip.sh +++ b/tensorflow/tools/ci_build/protobuf/protobuf_optimized_pip.sh @@ -14,7 +14,7 @@ # limitations under the License. # ============================================================================== -PROTOBUF_VERSION="3.2.0" +PROTOBUF_VERSION="3.3.1" PYTHON_BIN=${PYTHON_BIN:-python} DIR=${PWD}/protobuf diff --git a/tensorflow/tools/ci_build/pylintrc b/tensorflow/tools/ci_build/pylintrc index 0779ed91bc3..e71017e621c 100644 --- a/tensorflow/tools/ci_build/pylintrc +++ b/tensorflow/tools/ci_build/pylintrc @@ -38,7 +38,7 @@ enable=indexing-exception,old-raise-syntax # --enable=similarities". If you want to run only the classes checker, but have # no Warning level messages displayed, use"--disable=all --enable=classes # --disable=W" -disable=design,similarities,no-self-use,attribute-defined-outside-init,locally-disabled,star-args,pointless-except,bad-option-value,global-statement,fixme,suppressed-message,useless-suppression,locally-enabled,no-member,no-name-in-module,import-error,unsubscriptable-object,unbalanced-tuple-unpacking,undefined-variable +disable=design,similarities,no-self-use,attribute-defined-outside-init,locally-disabled,star-args,pointless-except,bad-option-value,global-statement,fixme,suppressed-message,useless-suppression,locally-enabled,no-member,no-name-in-module,import-error,unsubscriptable-object,unbalanced-tuple-unpacking,undefined-variable,not-context-manager # Set the cache size for astng objects. @@ -322,4 +322,4 @@ indent-after-paren=4 [GOOGLE LINES] # Regexp for a proper copyright notice. -copyright=Copyright \d{4} The TensorFlow Authors\. +All [Rr]ights [Rr]eserved\. \ No newline at end of file +copyright=Copyright \d{4} The TensorFlow Authors\. +All [Rr]ights [Rr]eserved\. 
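The pylintrc hunk above adds `not-context-manager` to the global `disable` list. That message fires when a `with` statement uses an object pylint cannot statically prove to implement `__enter__`/`__exit__`, a frequent false positive on TensorFlow's dynamically generated ops and decorated context managers, so disabling it globally is cheaper than whitelisting each occurrence. To see what the check would have flagged, it can be run in isolation; the file below is an arbitrary example, not one named by this change:

```bash
# Run only the not-context-manager check (message id E1129 in recent pylint).
pylint --disable=all --enable=not-context-manager \
  tensorflow/python/framework/ops.py
```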
diff --git a/tensorflow/tools/ci_build/update_version.sh b/tensorflow/tools/ci_build/update_version.sh index 759c7e5f7e4..682f5329f58 100755 --- a/tensorflow/tools/ci_build/update_version.sh +++ b/tensorflow/tools/ci_build/update_version.sh @@ -61,7 +61,9 @@ fi MAJOR=$(echo "${NEW_VER}" | cut -d \. -f 1) MINOR=$(echo "${NEW_VER}" | cut -d \. -f 2) PATCH=$(echo "${NEW_VER}" | cut -d \. -f 3) +PATCH_NUM=$(echo "$PATCH" | cut -d \- -f 1) PIP_PATCH="${PATCH//-}" +SUFFIX=$(echo $NEW_VER | sed "s/${MAJOR}.${MINOR}.${PATCH%-*}//g") # Update tensorflow/core/public/version.h VERSION_H="${TF_SRC_DIR}/core/public/version.h" @@ -71,13 +73,17 @@ OLD_MAJOR=$(cat ${VERSION_H} | grep -E "^#define TF_MAJOR_VERSION [0-9]+" | \ cut -d ' ' -f 3) OLD_MINOR=$(cat ${VERSION_H} | grep -E "^#define TF_MINOR_VERSION [0-9]+" | \ cut -d ' ' -f 3) -OLD_PATCH=$(cat ${VERSION_H} | grep -E "^#define TF_PATCH_VERSION [[:alnum:]-]+" | \ +OLD_PATCH_NUM=$(cat ${VERSION_H} | grep -E "^#define TF_PATCH_VERSION [[:alnum:]-]+" | \ cut -d ' ' -f 3) +OLD_EXTENSION=$(cat ${VERSION_H} | grep -E "^#define TF_VERSION_SUFFIX \"[[:alnum:]-]+\"" | \ +cut -d ' ' -f 3) +OLD_PATCH="$OLD_PATCH_NUM${OLD_EXTENSION//\"}" +OLD_PIP_PATCH="${OLD_PATCH//-}" sed -i -e "s/^#define TF_MAJOR_VERSION ${OLD_MAJOR}/#define TF_MAJOR_VERSION ${MAJOR}/g" ${VERSION_H} sed -i -e "s/^#define TF_MINOR_VERSION ${OLD_MINOR}/#define TF_MINOR_VERSION ${MINOR}/g" ${VERSION_H} -sed -i -e "s/^#define TF_PATCH_VERSION ${OLD_PATCH}/#define TF_PATCH_VERSION ${PATCH}/g" "${VERSION_H}" - +sed -i -e "s/^#define TF_PATCH_VERSION ${OLD_PATCH}/#define TF_PATCH_VERSION ${PATCH_NUM}/g" "${VERSION_H}" +sed -i -e "s/^#define TF_VERSION_SUFFIX \".*\"/#define TF_VERSION_SUFFIX \"${SUFFIX}\"/g" "${VERSION_H}" # Update setup.py SETUP_PY="${TF_SRC_DIR}/tools/pip_package/setup.py" @@ -85,23 +91,6 @@ check_existence file "${SETUP_PY}" sed -i -e "s/^\_VERSION = [\'\"].*[\'\"]/\_VERSION = \'${MAJOR}.${MINOR}.${PATCH}\'/g" "${SETUP_PY}" -# Update cmake setup.py -CMAKE_SETUP_PY="${TF_SRC_DIR}/contrib/cmake/setup.py" -check_existence file "${CMAKE_SETUP_PY}" - -sed -i -e "s/^\_VERSION = [\'\"].*-cmake-experimental[\'\"]/\_VERSION = \'${MAJOR}.${MINOR}.${PATCH}-cmake-experimental\'/g" "${CMAKE_SETUP_PY}" - - -# Update os_setup.md -OS_SETUP="${TF_SRC_DIR}/g3doc/get_started/os_setup.md" -check_existence file "${OS_SETUP}" - -sed -i -r -e "s/(.*pip[0-9]* install .*tensorflow-)([0-9]+\.[0-9]+\.[[:alnum:]]+)(-.*\.whl)/\1${MAJOR}.${MINOR}.${PIP_PATCH}\3/g" "${OS_SETUP}" -sed -i -r -e "s/(.*pip[0-9]* install .*tensorflow_gpu-)([0-9]+\.[0-9]+\.[[:alnum:]]+)(-.*\.whl)/\1${MAJOR}.${MINOR}.${PIP_PATCH}\3/g" "${OS_SETUP}" -sed -i -r -e "s/(.*export TF_BINARY_URL.*tensorflow-)([0-9]+\.[0-9]+\.[[:alnum:]]+)(-.*\.whl)/\1${MAJOR}.${MINOR}.${PIP_PATCH}\3/g" "${OS_SETUP}" -sed -i -r -e "s/(.*export TF_BINARY_URL.*tensorflow_gpu-)([0-9]+\.[0-9]+\.[[:alnum:]]+)(-.*\.whl)/\1${MAJOR}.${MINOR}.${PIP_PATCH}\3/g" "${OS_SETUP}" -sed -i -r -e "s/(.*\`)([0-9]+\.[0-9]+\.[[:alnum:]-]+)(-gpu.*)/\1${MAJOR}.${MINOR}.${PATCH}\3/g" "${OS_SETUP}" - # Update README.md README_MD="./README.md" @@ -109,6 +98,26 @@ check_existence file "${README_MD}" sed -i -r -e "s/${OLD_MAJOR}\.${OLD_MINOR}\.([[:alnum:]]+)-/${MAJOR}.${MINOR}.${PIP_PATCH}-/g" "${README_MD}" +# Update the install md files +NEW_PIP_TAG=$MAJOR.$MINOR.$PIP_PATCH +OLD_PIP_TAG=$OLD_MAJOR.$OLD_MINOR.$OLD_PIP_PATCH + +for file in ${TF_SRC_DIR}/docs_src/install/install_{linux,mac,windows,sources}.md +do + sed -i "s/tensorflow-${OLD_PIP_TAG}/tensorflow-${NEW_PIP_TAG}/g" $file 
+ sed -i "s/tensorflow_gpu-${OLD_PIP_TAG}/tensorflow_gpu-${NEW_PIP_TAG}/g" $file + sed -i "s/TensorFlow ${OLD_PIP_TAG}/TensorFlow ${NEW_PIP_TAG}/g" $file +done + +NEW_TAG=$MAJOR.$MINOR.$PATCH +OLD_TAG=$OLD_MAJOR.$OLD_MINOR.$OLD_PATCH + +for file in ${TF_SRC_DIR}/docs_src/install/install_{java,go,c}.md +do + sed -i "s/x86_64-${OLD_TAG}/x86_64-${NEW_TAG}/g" $file + sed -i "s/libtensorflow-${OLD_TAG}.jar/libtensorflow-${NEW_TAG}.jar/g" $file + sed -i "s/${OLD_TAG}<\/version>/${NEW_TAG}<\/version>/g" $file +done # Updates to be made if there are major / minor version changes MAJOR_MINOR_CHANGE=0 diff --git a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh index 4aae0378a8d..dff4707cbef 100644 --- a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh +++ b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh @@ -35,7 +35,6 @@ failing_cpu_cc_tests="\ " broken_cpu_cc_tests="\ - //tensorflow/core/kernels/hexagon:graph_transferer_test + \ //tensorflow/cc:framework_cc_ops_test + \ //tensorflow/core/platform/cloud:time_util_test + \ //tensorflow/core/platform/cloud:oauth_client_test + \ @@ -43,7 +42,9 @@ broken_cpu_cc_tests="\ //tensorflow/core/platform/cloud:google_auth_provider_test + \ //tensorflow/core/platform/cloud:gcs_file_system_test + \ //tensorflow/core/kernels/cloud:bigquery_table_accessor_test + \ + //tensorflow/core/kernels/hexagon:graph_transferer_test + \ //tensorflow/core/kernels/hexagon:quantized_matmul_op_for_hexagon_test + \ + //tensorflow/core/kernels:remote_fused_graph_execute_utils_test + \ //tensorflow/core/kernels:requantize_op_test + \ //tensorflow/core/kernels:requantization_range_op_test + \ //tensorflow/core/kernels:quantized_reshape_op_test + \ @@ -95,65 +96,6 @@ exclude_cpu_cc_tests="${failing_cpu_cc_tests} + ${broken_cpu_cc_tests}" exclude_gpu_cc_tests="${extra_failing_gpu_cc_tests} + ${exclude_cpu_cc_tests}" -# Python tests -# The first argument is the name of the python test direcotry -function get_failing_cpu_py_tests() { - echo " - //$1/tensorflow/python:basic_session_run_hooks_test + \ - //$1/tensorflow/python:bigquery_reader_ops_test + \ - //$1/tensorflow/python:contrib_test + \ - //$1/tensorflow/python:dequantize_op_test + \ - //$1/tensorflow/python:directory_watcher_test + \ - //$1/tensorflow/python:event_multiplexer_test + \ - //$1/tensorflow/python:file_io_test + \ - //$1/tensorflow/python:file_system_test + \ - //$1/tensorflow/python:framework_meta_graph_test + \ - //$1/tensorflow/python:framework_ops_test + \ - //$1/tensorflow/python:framework_tensor_util_test + \ - //$1/tensorflow/python:framework_test_util_test + \ - //$1/tensorflow/python:gradients_test + \ - //$1/tensorflow/python:image_ops_test + \ - //$1/tensorflow/python:localhost_cluster_performance_test + \ - //$1/tensorflow/python:monitored_session_test + \ - //$1/tensorflow/python:nn_batchnorm_test + \ - //$1/tensorflow/python:protobuf_compare_test + \ - //$1/tensorflow/python:quantized_conv_ops_test + \ - //$1/tensorflow/python:saver_large_variable_test + \ - //$1/tensorflow/python:saver_test + \ - //$1/tensorflow/python:session_test + \ - //$1/tensorflow/python:supervisor_test + \ - //$1/tensorflow/python:sync_replicas_optimizer_test + \ - //$1/tensorflow/python/debug:curses_ui_test + \ - //$1/tensorflow/python/kernel_tests:as_string_op_test + \ - //$1/tensorflow/python/kernel_tests:benchmark_test + \ - //$1/tensorflow/python/kernel_tests:cast_op_test + \ - //$1/tensorflow/python/kernel_tests:clip_ops_test + \ - 
//$1/tensorflow/python/kernel_tests:conv_ops_test + \ - //$1/tensorflow/python/kernel_tests:decode_image_op_test + \ - //$1/tensorflow/python/kernel_tests:depthwise_conv_op_test + \ - //$1/tensorflow/python/kernel_tests:functional_ops_test + \ - //$1/tensorflow/python/kernel_tests:py_func_test + \ - //$1/tensorflow/python/kernel_tests:rnn_test + \ - //$1/tensorflow/python/kernel_tests:sets_test + \ - //$1/tensorflow/python/kernel_tests:sparse_matmul_op_test + \ - //$1/tensorflow/python/kernel_tests:string_to_number_op_test + \ - //$1/tensorflow/python/kernel_tests:summary_ops_test + \ - //$1/tensorflow/python/kernel_tests:variable_scope_test + \ - //$1/tensorflow/python/saved_model:saved_model_test \ - " -} - -function get_failing_gpu_py_tests() { - echo " - //$1/tensorflow/python/kernel_tests:diag_op_test + \ - //$1/tensorflow/python/kernel_tests:one_hot_op_test + \ - //$1/tensorflow/python/kernel_tests:rnn_test + \ - //$1/tensorflow/python/kernel_tests:sets_test + \ - //$1/tensorflow/python/kernel_tests:trace_op_test + \ - $(get_failing_cpu_py_tests $1) - " -} - function clean_output_base() { # TODO(pcloudy): bazel clean --expunge doesn't work on Windows yet. # Clean the output base manually to ensure build correctness @@ -177,6 +119,13 @@ function run_configure_for_cpu_build { if [ -z "$CC_OPT_FLAGS" ]; then export CC_OPT_FLAGS="-march=native" fi + if [ -z "$TF_NEED_MKL" ]; then + export TF_NEED_MKL=0 + fi + export TF_NEED_VERBS=0 + export TF_NEED_GCP=0 + export TF_NEED_HDFS=0 + export TF_NEED_OPENCL=0 echo "" | ./configure } @@ -196,6 +145,11 @@ function run_configure_for_gpu_build { if [ -z "$CC_OPT_FLAGS" ]; then export CC_OPT_FLAGS="-march=native" fi + export TF_NEED_VERBS=0 + export TF_NEED_MKL=0 + export TF_NEED_GCP=0 + export TF_NEED_HDFS=0 + export TF_NEED_OPENCL=0 echo "" | ./configure } @@ -207,5 +161,5 @@ function create_python_test_dir() { function reinstall_tensorflow_pip() { echo "y" | pip uninstall tensorflow -q || true - pip install ${1} + pip install ${1} --no-deps } diff --git a/tensorflow/tools/ci_build/windows/bazel/common_env.sh b/tensorflow/tools/ci_build/windows/bazel/common_env.sh index 662de93c16b..8853dc53b17 100644 --- a/tensorflow/tools/ci_build/windows/bazel/common_env.sh +++ b/tensorflow/tools/ci_build/windows/bazel/common_env.sh @@ -30,10 +30,11 @@ export TMPDIR="C:/tmp" mkdir -p "$TMPDIR" # Set bash path -export BAZEL_SH="C:/tools/msys64/usr/bin/bash" +export BAZEL_SH=${BAZEL_SH:-"C:/tools/msys64/usr/bin/bash"} # Set Python path for ./configure export PYTHON_BIN_PATH="C:/Program Files/Anaconda3/python" +export PYTHON_LIB_PATH="C:/Program Files/Anaconda3/lib/site-packages" # Set Python path for cc_configure.bzl export BAZEL_PYTHON="C:/Program Files/Anaconda3/python" @@ -54,4 +55,4 @@ export PATH="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0/extras/CUPT export PATH="/c/tools/cuda/bin:$PATH" # Set the common build options on Windows -export BUILD_OPTS='--cpu=x64_windows_msvc --host_cpu=x64_windows_msvc --copt=/w --verbose_failures --experimental_ui' +export BUILD_OPTS='--copt=-w --host_copt=-w --verbose_failures --experimental_ui' diff --git a/tensorflow/tools/ci_build/windows/cpu/bazel/common_env.sh b/tensorflow/tools/ci_build/windows/cpu/bazel/common_env.sh deleted file mode 100644 index 6e7e555065a..00000000000 --- a/tensorflow/tools/ci_build/windows/cpu/bazel/common_env.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/bash -# Copyright 2016 The TensorFlow Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -# -# This script assumes the standard setup on tensorflow Jenkins windows machines. -# It is NOT guaranteed to work on any other machine. Use at your own risk! -# -# REQUIREMENTS: -# * All installed in standard locations: -# - JDK8, and JAVA_HOME set. -# - Microsoft Visual Studio 2015 Community Edition -# - Msys2 -# - Anaconda3 -# * Bazel windows executable copied as "bazel.exe" and included in PATH. - -# All commands shall pass, and all should be visible. -set -x -set -e - -# Use a temporary directory with a short name. -export TMPDIR="C:/tmp" -mkdir -p "$TMPDIR" - -# Set bash path -export BAZEL_SH="C:/tools/msys64/usr/bin/bash" - -# Set Python path for ./configure -export PYTHON_BIN_PATH="C:/Program Files/Anaconda3/python" - -# Set Python path for cc_configure.bzl -export BAZEL_PYTHON="C:/Program Files/Anaconda3/python" - -# Set Visual Studio path -export BAZEL_VS="C:/Program Files (x86)/Microsoft Visual Studio 14.0" - -# Add python into PATH, it's needed because gen_git_source.py uses -# '/usr/bin/env python' as a shebang -export PATH="/c/Program Files/Anaconda3:$PATH" diff --git a/tensorflow/tools/ci_build/windows/cpu/bazel/run_libtensorflow.bat b/tensorflow/tools/ci_build/windows/cpu/bazel/run_libtensorflow.bat new file mode 100644 index 00000000000..6a88b1865a4 --- /dev/null +++ b/tensorflow/tools/ci_build/windows/cpu/bazel/run_libtensorflow.bat @@ -0,0 +1 @@ +c:\tools\msys64\usr\bin\bash -l %cd%/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh %* diff --git a/tensorflow/tools/ci_build/windows/cpu/cmake/run_build.bat b/tensorflow/tools/ci_build/windows/cpu/cmake/run_build.bat index 62e97f3f071..07ad70dd344 100644 --- a/tensorflow/tools/ci_build/windows/cpu/cmake/run_build.bat +++ b/tensorflow/tools/ci_build/windows/cpu/cmake/run_build.bat @@ -22,11 +22,13 @@ CALL "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" :: Turn echo back on, above script turns it off. ECHO ON -:: Some common variables to be shared between runs. -SET CMAKE_EXE="C:\Program Files\cmake\bin\cmake.exe" -SET SWIG_EXE="C:\swigwin-3.0.10\swig.exe" -SET PY_EXE="C:\Program Files\Anaconda3\python.exe" -SET PY_LIB="C:\Program Files\Anaconda3\libs\python35.lib" +:: Set environment variables to be shared between runs. Do not override if they +:: are set already. 
+ +IF DEFINED CMAKE_EXE (ECHO CMAKE_EXE is set to %CMAKE_EXE%) ELSE (SET CMAKE_EXE="C:\Program Files\cmake\bin\cmake.exe") +IF DEFINED SWIG_EXE (ECHO SWIG_EXE is set to %SWIG_EXE%) ELSE (SET SWIG_EXE="C:\swigwin-3.0.10\swig.exe") +IF DEFINED PY_EXE (ECHO PY_EXE is set to %PY_EXE%) ELSE (SET PY_EXE="C:\Program Files\Anaconda3\python.exe") +IF DEFINED PY_LIB (ECHO PY_LIB is set to %PY_LIB%) ELSE (SET PY_LIB="C:\Program Files\Anaconda3\libs\python35.lib") SET CMAKE_DIR=%REPO_ROOT%\tensorflow\contrib\cmake SET MSBUILD_EXE="C:\Program Files (x86)\MSBuild\14.0\Bin\msbuild.exe" diff --git a/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat b/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat index 9908762bca8..96fbadd1767 100644 --- a/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat +++ b/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat @@ -22,15 +22,13 @@ CD %BUILD_DIR% SET BUILD_CC_TESTS=OFF SET BUILD_PYTHON_TESTS=ON -SET PIP_EXE="C:\Program Files\Anaconda3\Scripts\pip.exe" +:: Set pip binary location. Do not override if it is set already. +IF DEFINED PIP_EXE (ECHO PIP_EXE is set to %PIP_EXE%) ELSE (SET PIP_EXE="C:\Program Files\Anaconda3\Scripts\pip.exe") :: Run the CMAKE build to build the pip package. CALL %REPO_ROOT%\tensorflow\tools\ci_build\windows\cpu\cmake\run_build.bat if %errorlevel% neq 0 exit /b %errorlevel% -:: Attempt to upgrade PIP to work around Anaconda issue #542. -%PIP_EXE% install --ignore-installed --upgrade pip setuptools -v -v - :: Since there are no wildcards in windows command prompt, use dark magic to get the wheel file name. DIR %REPO_ROOT%\%BUILD_DIR%\tf_python\dist\ /S /B > wheel_filename_file set /p WHEEL_FILENAME=&2; exit 1; } -clean_output_base - run_configure_for_cpu_build +clean_output_base + bazel build -c opt $BUILD_OPTS tensorflow/tools/pip_package:build_pip_package || exit $? # Create a python test directory to avoid package name conflict @@ -58,12 +58,10 @@ create_python_test_dir "${PY_TEST_DIR}" PIP_NAME=$(ls ${PY_TEST_DIR}/tensorflow-*.whl) reinstall_tensorflow_pip ${PIP_NAME} -failing_cpu_py_tests=$(get_failing_cpu_py_tests ${PY_TEST_DIR}) - -passing_tests=$(bazel query "kind(py_test, //${PY_TEST_DIR}/tensorflow/python/...) - (${failing_cpu_py_tests})" | - # We need to strip \r so that the result could be store into a variable under MSYS - tr '\r' ' ') - # Define no_tensorflow_py_deps=true so that every py_test has no deps anymore, # which will result testing system installed tensorflow -bazel test -c opt $BUILD_OPTS -k $passing_tests --define=no_tensorflow_py_deps=true --test_output=errors +bazel test -c opt $BUILD_OPTS -k --test_output=errors \ + --define=no_tensorflow_py_deps=true --test_lang_filters=py \ + --test_tag_filters=-no_pip,-no_windows \ + --build_tag_filters=-no_pip,-no_windows --build_tests_only \ + //${PY_TEST_DIR}/tensorflow/python/... diff --git a/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat b/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat index f124012edcb..b4f9cc84762 100644 --- a/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat +++ b/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat @@ -22,12 +22,14 @@ CALL "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" :: Turn echo back on, above script turns it off. ECHO ON -:: Some common variables to be shared between runs. 
-SET CMAKE_EXE="C:\Program Files\cmake\bin\cmake.exe" -SET SWIG_EXE="C:\swigwin-3.0.10\swig.exe" -SET PY_EXE="C:\Program Files\Anaconda3\python.exe" -SET PY_LIB="C:\Program Files\Anaconda3\libs\python35.lib" -SET CUDNN_HOME="c:\tools\cuda" +:: Set environment variables to be shared between runs. Do not override if they +:: are set already. + +IF DEFINED CMAKE_EXE (ECHO CMAKE_EXE is set to %CMAKE_EXE%) ELSE (SET CMAKE_EXE="C:\Program Files\cmake\bin\cmake.exe") +IF DEFINED SWIG_EXE (ECHO SWIG_EXE is set to %SWIG_EXE%) ELSE (SET SWIG_EXE="C:\swigwin-3.0.10\swig.exe") +IF DEFINED PY_EXE (ECHO PY_EXE is set to %PY_EXE%) ELSE (SET PY_EXE="C:\Program Files\Anaconda3\python.exe") +IF DEFINED PY_LIB (ECHO PY_LIB is set to %PY_LIB%) ELSE (SET PY_LIB="C:\Program Files\Anaconda3\libs\python35.lib") +IF DEFINED CUDNN_HOME (ECHO CUDNN_HOME is set to %CUDNN_HOME%) ELSE (SET CUDNN_HOME="c:\tools\cuda") SET CMAKE_DIR=%REPO_ROOT%\tensorflow\contrib\cmake SET MSBUILD_EXE="C:\Program Files (x86)\MSBuild\14.0\Bin\msbuild.exe" diff --git a/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat b/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat index 9307ebb66ba..e774a6e9168 100644 --- a/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat +++ b/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat @@ -22,15 +22,12 @@ CD %BUILD_DIR% SET BUILD_CC_TESTS=OFF SET BUILD_PYTHON_TESTS=ON -SET PIP_EXE="C:\Program Files\Anaconda3\Scripts\pip.exe" +IF DEFINED PIP_EXE (ECHO PIP_EXE is set to %PIP_EXE%) ELSE (SET PIP_EXE="C:\Program Files\Anaconda3\Scripts\pip.exe") :: Run the CMAKE build to build the pip package. CALL %REPO_ROOT%\tensorflow\tools\ci_build\windows\gpu\cmake\run_build.bat if %errorlevel% neq 0 exit /b %errorlevel% -:: Attempt to upgrade PIP to work around Anaconda issue #542. -%PIP_EXE% install --ignore-installed --upgrade pip setuptools -v -v - :: Since there are no wildcards in windows command prompt, use dark magic to get the wheel file name. DIR %REPO_ROOT%\%BUILD_DIR%\tf_python\dist\ /S /B > wheel_filename_file set /p WHEEL_FILENAME=&2; exit 1; } -clean_output_base - run_configure_for_gpu_build +clean_output_base + bazel build -c opt --config=win-cuda $BUILD_OPTS tensorflow/tools/pip_package:build_pip_package || exit $? # Create a python test directory to avoid package name conflict @@ -58,13 +58,11 @@ create_python_test_dir "${PY_TEST_DIR}" PIP_NAME=$(ls ${PY_TEST_DIR}/tensorflow-*.whl) reinstall_tensorflow_pip ${PIP_NAME} -failing_gpu_py_tests=$(get_failing_gpu_py_tests ${PY_TEST_DIR}) - -passing_tests=$(bazel query "kind(py_test, //${PY_TEST_DIR}/tensorflow/python/...) - (${failing_gpu_py_tests})" | - # We need to strip \r so that the result could be store into a variable under MSYS - tr '\r' ' ') - # Define no_tensorflow_py_deps=true so that every py_test has no deps anymore, # which will result testing system installed tensorflow -# GPU tests are very flaky when running concurently, so set local_test_jobs=5 -bazel test -c opt --config=win-cuda $BUILD_OPTS -k $passing_tests --define=no_tensorflow_py_deps=true --test_output=errors --local_test_jobs=5 +# GPU tests are very flaky when running concurrently, so set local_test_jobs=1 +bazel test -c opt --config=win-cuda $BUILD_OPTS -k --test_output=errors \ + --define=no_tensorflow_py_deps=true --test_lang_filters=py \ + --test_tag_filters=-no_pip,-no_windows,-no_windows_gpu \ + --build_tag_filters=-no_pip,-no_windows,-no_windows_gpu \ + --local_test_jobs=1 --build_tests_only //${PY_TEST_DIR}/tensorflow/python/...
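The rewritten test invocation above drops the old "query for known-failing tests" step: tests that cannot work in this configuration are now excluded by their `no_pip`, `no_windows`, and `no_windows_gpu` tags. As a rough sketch of how negative tag filters select targets (illustrative only, not Bazel's actual implementation; the target names and tags are made up):

```python
def select_targets(targets, tag_filters):
    """Keep targets matching the positive tags and carrying no negative ('-') tag."""
    positive = {t for t in tag_filters if not t.startswith("-")}
    negative = {t[1:] for t in tag_filters if t.startswith("-")}
    selected = []
    for name, tags in targets.items():
        if positive and not positive & set(tags):
            continue  # must carry at least one positive tag, if any were given
        if negative & set(tags):
            continue  # excluded by a negative tag
        selected.append(name)
    return selected

tests = {
    "//tensorflow/python:session_test": [],
    "//tensorflow/python:pip_only_test": ["no_windows"],
    "//tensorflow/python:gpu_flaky_test": ["no_windows_gpu"],
}
print(select_targets(tests, ["-no_windows", "-no_windows_gpu"]))
# -> ['//tensorflow/python:session_test']
```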
diff --git a/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh b/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh new file mode 100755 index 00000000000..9ac3613f27e --- /dev/null +++ b/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh @@ -0,0 +1,86 @@ +#!/usr/bin/env bash +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# +# Script to produce binary release of libtensorflow (C API, Java jars etc.). + +set -ex +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Setup environment for bazel builds +source "${SCRIPT_DIR}/bazel/common_env.sh" +source "${SCRIPT_DIR}/bazel/bazel_test_lib.sh" + +# Sanity check that this is being run from the root of the git repository. +cd ${SCRIPT_DIR}/../../../.. +if [ ! -e "WORKSPACE" ]; then + echo "Must run this from the root of the bazel workspace" + echo "Currently at ${PWD}, script is at ${SCRIPT_DIR}" + exit 1 +fi + +# Enable JNI support for Windows in Bazel. +# This can be removed once +# https://github.com/bazelbuild/bazel/pull/2599 +# has been merged and we switch to a bazel release containing it. +cp "${JAVA_HOME}/include/win32/jni_md.h" "./tensorflow/java/src/main/native/windows_jni_md.h" +sed -i -e "s|@bazel_tools//tools/jdk:jni_md_header-linux|windows_jni_md.h|" ./tensorflow/java/src/main/native/BUILD +#### END HACKS TO BE RESOLVED WITH NEW BAZEL VERSIONS #### + +export TF_BAZEL_TARGETS="//tensorflow:libtensorflow.so" +export TF_BAZEL_TARGETS="${TF_BAZEL_TARGETS} //tensorflow/tools/lib_package:clicenses_generate" +export TF_BAZEL_TARGETS="${TF_BAZEL_TARGETS} //tensorflow/java:libtensorflow_jni.so" +export TF_BAZEL_TARGETS="${TF_BAZEL_TARGETS} //tensorflow/tools/lib_package:jnilicenses_generate" + +clean_output_base +run_configure_for_cpu_build + +# build_libtensorflow_tarball in ../builds/libtensorflow.sh +# cannot be used on Windows since it relies on pkg_tar rules. +# So we do something special here +bazel build -c opt ${BUILD_OPTS} \ + tensorflow:libtensorflow.so \ + tensorflow/tools/lib_package:clicenses_generate \ + tensorflow/java:libtensorflow_jni.so \ + tensorflow/tools/lib_package:jnilicenses_generate + +# Revert the hacks above +git checkout ./tensorflow/tools/pip_package/BUILD +git checkout ./tensorflow/java/src/main/native/BUILD +rm -f ./tensorflow/java/src/main/native/windows_jni_md.h + +DIR=lib_package +rm -rf ${DIR} +mkdir -p ${DIR} + +# Zip up the .dll and the LICENSE for the JNI library. +cp bazel-bin/tensorflow/java/libtensorflow_jni.so ${DIR}/tensorflow_jni.dll +zip -j ${DIR}/libtensorflow_jni-cpu-windows-$(uname -m).zip \ + ${DIR}/tensorflow_jni.dll \ + bazel-genfiles/tensorflow/tools/lib_package/include/tensorflow/jni/LICENSE +rm -f ${DIR}/tensorflow_jni.dll + +# Zip up the .dll, LICENSE and include files for the C library. 
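The packaging steps above and in the next hunk stage the bazel outputs and bundle them with `zip -j`, which junks directory paths inside the archive. A minimal Python sketch of the JNI bundling step, assuming the bazel output files already exist at the paths named in the script:

```python
import os
import platform
import zipfile

DIR = "lib_package"
arch = platform.machine()  # stand-in for $(uname -m)

# Bundle the JNI DLL with its LICENSE, mirroring the `zip -j` call in the script.
jni_zip = os.path.join(DIR, "libtensorflow_jni-cpu-windows-%s.zip" % arch)
with zipfile.ZipFile(jni_zip, "w", zipfile.ZIP_DEFLATED) as zf:
    # arcname flattens the path, like zip's -j (junk paths) flag.
    zf.write(os.path.join(DIR, "tensorflow_jni.dll"),
             arcname="tensorflow_jni.dll")
    zf.write("bazel-genfiles/tensorflow/tools/lib_package/include/"
             "tensorflow/jni/LICENSE", arcname="LICENSE")
```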
+mkdir -p ${DIR}/include/tensorflow/c +mkdir -p ${DIR}/lib +cp bazel-bin/tensorflow/libtensorflow.so ${DIR}/lib/tensorflow.dll +cp tensorflow/c/c_api.h ${DIR}/include/tensorflow/c +cp bazel-genfiles/tensorflow/tools/lib_package/include/tensorflow/c/LICENSE ${DIR}/include/tensorflow/c +cd ${DIR} +zip -j libtensorflow-cpu-windows-$(uname -m).zip \ + lib/tensorflow.dll \ + include/tensorflow/c/c_api.h \ + include/tensorflow/c/LICENSE +rm -rf lib include diff --git a/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh b/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh new file mode 100755 index 00000000000..11064130713 --- /dev/null +++ b/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh @@ -0,0 +1,43 @@ +#!/usr/bin/env bash +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ============================================================================== + +set -e +set -x + +N_JOBS=$(grep -c ^processor /proc/cpuinfo) + +echo "" +echo "Bazel will use ${N_JOBS} concurrent job(s)." +echo "" + +# Run configure. +export TF_NEED_GCP=0 +export TF_NEED_HDFS=0 +export PYTHON_BIN_PATH=`which python3` + +export TF_NEED_CUDA=1 +export TF_ENABLE_XLA=1 +export TF_CUDA_COMPUTE_CAPABILITIES=3.7 + +yes "" | ./configure + +# Run bazel test command. Double test timeouts to avoid flakes. +bazel test --config=cuda --test_tag_filters=-no_gpu,-benchmark-test -k \ + --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 \ + --build_tests_only --test_output=errors --local_test_jobs=8 \ + --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute -- \ + //tensorflow/compiler/... diff --git a/tensorflow/tools/common/BUILD b/tensorflow/tools/common/BUILD index 96ae9583d73..f92edd0dd88 100644 --- a/tensorflow/tools/common/BUILD +++ b/tensorflow/tools/common/BUILD @@ -9,6 +9,8 @@ package( default_visibility = ["//tensorflow:__subpackages__"], ) +load("//tensorflow:tensorflow.bzl", "py_test") + py_library( name = "public_api", srcs = ["public_api.py"], @@ -17,6 +19,7 @@ py_library( py_test( name = "public_api_test", + size = "small", srcs = ["public_api_test.py"], srcs_version = "PY2AND3", deps = [ @@ -33,6 +36,7 @@ py_library( py_test( name = "traverse_test", + size = "small", srcs = ["traverse_test.py"], srcs_version = "PY2AND3", deps = [ diff --git a/tensorflow/tools/common/public_api.py b/tensorflow/tools/common/public_api.py index 4c1ccebd616..e0acead9195 100644 --- a/tensorflow/tools/common/public_api.py +++ b/tensorflow/tools/common/public_api.py @@ -18,7 +18,9 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import inspect +import re + +from tensorflow.python.util import tf_inspect class PublicAPIVisitor(object): @@ -34,24 +36,52 @@ class PublicAPIVisitor(object): visitor: A visitor to call for the public API. """ self._visitor = visitor + self._root_name = 'tf' - # Modules/classes we do not want to descend into if we hit them. 
Usually, - # sytem modules exposed through platforms for compatibility reasons. - # Each entry maps a module path to a name to ignore in traversal. - _do_not_descend_map = { - # TODO(drpng): This can be removed once sealed off. - '': ['platform', 'pywrap_tensorflow', 'user_ops', 'python'], + # Modules/classes we want to suppress entirely. + self._private_map = { + # Some implementations have this internal module that we shouldn't + # expose. + 'tf.flags': ['cpp_flags'], + } - # Exclude protos, they leak a lot. - 'core': ['protobuf'], + # Modules/classes we do not want to descend into if we hit them. Usually, + # system modules exposed through platforms for compatibility reasons. + # Each entry maps a module path to a name to ignore in traversal. + self._do_not_descend_map = { + 'tf': [ + 'core', + 'examples', + 'flags', # Don't add flags + # TODO(drpng): This can be removed once sealed off. + 'platform', + # TODO(drpng): This can be removed once sealed. + 'pywrap_tensorflow', + # TODO(drpng): This can be removed once sealed. + 'user_ops', + 'python', + 'tools', + 'tensorboard', + ], - # Some implementations have this internal module that we shouldn't expose. - 'flags': ['cpp_flags'], + ## Everything below here is legitimate. + # It'll stay, but it's not officially part of the API. + 'tf.app': ['flags'], + # Imported for compatibility between py2/3. + 'tf.test': ['mock'], + } - # Everything below here is legitimate. - 'app': ['flags'], # It'll stay, but it's not officially part of the API. - 'test': ['mock'], # Imported for compatibility between py2/3. - } + @property + def private_map(self): + """A map from parents to symbols that should not be included at all. + + This map can be edited, but it should not be edited once traversal has + begun. + + Returns: + The map marking symbols to not include. + """ + return self._private_map @property def do_not_descend_map(self): @@ -65,10 +95,17 @@ class PublicAPIVisitor(object): """ return self._do_not_descend_map - def _isprivate(self, name): + def set_root_name(self, root_name): + """Override the default root name of 'tf'.""" + self._root_name = root_name + + def _is_private(self, path, name): """Return whether a name is private.""" - # TODO(wicke): We have to almost certainly add more exceptions than init. - return name.startswith('_') and name not in ['__init__'] + # TODO(wicke): Find out what names to exclude. + return ((path in self._private_map and + name in self._private_map[path]) or + (name.startswith('_') and not re.match('__.*__$', name) or + name in ['__base__', '__class__'])) def _do_not_descend(self, path, name): """Safely queries if a specific fully qualified name should be excluded.""" @@ -79,18 +116,22 @@ class PublicAPIVisitor(object): """Visitor interface, see `traverse` for details.""" # Avoid long waits in cases of pretty unambiguous failure. - if inspect.ismodule(parent) and len(path.split('.')) > 10: - raise RuntimeError('Modules nested too deep:\n%s\n\nThis is likely a ' - 'problem with an accidental public import.' % path) + if tf_inspect.ismodule(parent) and len(path.split('.')) > 10: + raise RuntimeError('Modules nested too deep:\n%s.%s\n\nThis is likely a ' + 'problem with an accidental public import.' % + (self._root_name, path)) + + # Includes self._root_name + full_path = '.'.join([self._root_name, path]) if path else self._root_name # Remove things that are not visible. 
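The visitor prunes `children` in place (the pruning loop follows in the next hunk), so `traverse` never descends into suppressed symbols. A small usage sketch under stated assumptions: `my_pkg` and its `util` submodule are hypothetical, and the visitor is passed to `traverse.traverse` as its visit callback:

```python
from tensorflow.tools.common import public_api
from tensorflow.tools.common import traverse

import my_pkg  # hypothetical package to walk; any importable module works

names = []

def record(path, parent, children):
    # `children` has already had private symbols pruned by the visitor.
    names.extend(name if not path else "%s.%s" % (path, name)
                 for name, _ in children)

visitor = public_api.PublicAPIVisitor(record)
visitor.set_root_name("my_pkg")           # report paths under 'my_pkg', not 'tf'
visitor.private_map["my_pkg"] = ["util"]  # drop my_pkg.util from the walk entirely

traverse.traverse(my_pkg, visitor)
print(sorted(set(names)))
```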
for name, child in list(children): - if self._isprivate(name): + if self._is_private(full_path, name): children.remove((name, child)) self._visitor(path, parent, children) # Remove things that are visible, but which should not be descended into. for name, child in list(children): - if self._do_not_descend(path, name): + if self._do_not_descend(full_path, name): children.remove((name, child)) diff --git a/tensorflow/tools/common/traverse.py b/tensorflow/tools/common/traverse.py index 443838d9682..9607f80686d 100644 --- a/tensorflow/tools/common/traverse.py +++ b/tensorflow/tools/common/traverse.py @@ -18,9 +18,9 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import inspect import sys +from tensorflow.python.util import tf_inspect __all__ = ['traverse'] @@ -29,11 +29,11 @@ def _traverse_internal(root, visit, stack, path): """Internal helper for traverse.""" # Only traverse modules and classes - if not inspect.isclass(root) and not inspect.ismodule(root): + if not tf_inspect.isclass(root) and not tf_inspect.ismodule(root): return try: - children = inspect.getmembers(root) + children = tf_inspect.getmembers(root) except ImportError: # On some Python installations, some modules do not support enumerating # members (six in particular), leading to import errors. @@ -43,7 +43,8 @@ visit(path, root, children) for name, child in children: # Do not descend into built-in modules - if inspect.ismodule(child) and child.__name__ in sys.builtin_module_names: + if tf_inspect.ismodule( + child) and child.__name__ in sys.builtin_module_names: continue # Break cycles @@ -72,8 +73,8 @@ def traverse(root, visit): never descends into built-in modules. `children`, a list of `(name, object)` pairs are determined by - `inspect.getmembers`. To avoid visiting parts of the tree, `children` can be - modified in place, using `del` or slice assignment. + `tf_inspect.getmembers`. To avoid visiting parts of the tree, `children` can + be modified in place, using `del` or slice assignment. Cycles (determined by reference equality, `is`) stop the traversal. A stack of objects is kept to find cycles. Objects forming cycles may appear in diff --git a/tensorflow/tools/compatibility/BUILD b/tensorflow/tools/compatibility/BUILD index 0f3de10a0ad..fb40cf0833f 100644 --- a/tensorflow/tools/compatibility/BUILD +++ b/tensorflow/tools/compatibility/BUILD @@ -10,12 +10,16 @@ load( py_binary( name = "tf_upgrade", - srcs = ["tf_upgrade.py"], + srcs = [ + "ast_edits.py", + "tf_upgrade.py", + ], srcs_version = "PY2AND3", ) py_test( name = "tf_upgrade_test", + size = "small", srcs = ["tf_upgrade_test.py"], srcs_version = "PY2AND3", deps = [ diff --git a/tensorflow/tools/compatibility/README.md b/tensorflow/tools/compatibility/README.md index d3bf2aa0324..aabc7b253d6 100644 --- a/tensorflow/tools/compatibility/README.md +++ b/tensorflow/tools/compatibility/README.md @@ -11,7 +11,10 @@ It will print a list of errors it finds that it can't fix. You can also run it on a directory tree: ``` -tf_upgrade.py --intree coolcode -outtree coolcode-upgraded +# just upgrade the .py files +tf_upgrade.py --intree coolcode --outtree coolcode-upgraded +# after upgrading the .py files, copy all the other files to the outtree +tf_upgrade.py --intree coolcode --outtree coolcode-upgraded --copyotherfiles True ``` In either case, it will also dump out a report e.g.
which will detail changes @@ -32,8 +35,8 @@ Renamed keyword argument from `squeeze_dims` to `axis` ## Caveats - Don't update parts of your code manually before running this script. In -particular, functions that have had reordered arguments like `tf.concat`, -`tf.split` will cause the script to incorrectly add keyword arguments that +particular, functions that have had reordered arguments like `tf.concat` +or `tf.split` will cause the script to incorrectly add keyword arguments that mismap arguments. - This script wouldn't actually reorder arguments. Instead, the script will add @@ -46,6 +49,12 @@ a tensor of bools. If the script detects this, it will report this to stdout `tf.reverse(a, [False, True, True])` you will need to manually change it to `tf.reverse(a, [1, 2])`. - - - +- There are some syntaxes that this script cannot handle, as it was designed +to use only standard Python packages. If the script fails +with "A necessary keyword argument failed to be inserted." or +"Failed to find keyword lexicographically. Fix manually.", you can try +[@machrisaa's fork of this script](https://github.com/machrisaa/tf0to1). +[@machrisaa](https://github.com/machrisaa) has used the +[RedBaron Python refactoring engine](https://redbaron.readthedocs.io/en/latest/) +which is able to localize syntactic elements more reliably than the built-in +`ast` module this script is based upon. diff --git a/tensorflow/tools/compatibility/ast_edits.py b/tensorflow/tools/compatibility/ast_edits.py new file mode 100644 index 00000000000..e7e4c916921 --- /dev/null +++ b/tensorflow/tools/compatibility/ast_edits.py @@ -0,0 +1,497 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Upgrader for Python scripts according to an API change specification.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import ast +import collections +import os +import shutil +import sys +import tempfile +import traceback + + +class APIChangeSpec(object): + """This class defines the transformations that need to happen. + + This class must provide the following fields: + + * `function_keyword_renames`: maps function names to a map of old -> new + argument names + * `function_renames`: maps function names to new function names + * `change_to_function`: a set of function names that have changed (for + notifications) + * `function_reorders`: maps functions whose argument order has changed to the + list of arguments in the new order + * `function_handle`: maps function names to custom handlers for the function + + For an example, see `TFAPIChangeSpec`. + """ + + +class _FileEditTuple(collections.namedtuple( + "_FileEditTuple", ["comment", "line", "start", "old", "new"])): + """Each edit that is recorded by a _FileEditRecorder. + + Fields: + comment: A description of the edit and why it was made.
+ line: The line number in the file where the edit occurs (1-indexed). + start: The column offset in the line where the edit begins (0-indexed). + old: text string to remove (this must match what was in file). + new: text string to add in place of `old`. + """ + + __slots__ = () + + +class _FileEditRecorder(object): + """Record changes that need to be done to the file.""" + + def __init__(self, filename): + # all edits are lists of chars + self._filename = filename + + self._line_to_edit = collections.defaultdict(list) + self._errors = [] + + def process(self, text): + """Process a list of strings, each corresponding to the recorded changes. + + Args: + text: A list of lines of text (assumed to contain newlines) + Returns: + A tuple of the modified text, a textual description of the edits, + and the list of errors. + Raises: + ValueError: if substitution source location does not contain the + expected text. + """ + + change_report = "" + + # Iterate over each line + for line, edits in self._line_to_edit.items(): + offset = 0 + # sort by column so that edits are processed in order, making the + # indexing adjustments cumulative for changes that change the string + # length + edits.sort(key=lambda x: x.start) + + # Extract each line to a list of characters, because mutable lists + # are editable, unlike immutable strings. + char_array = list(text[line - 1]) + + # Record a description of the change + change_report += "%r Line %d\n" % (self._filename, line) + change_report += "-" * 80 + "\n\n" + for e in edits: + change_report += "%s\n" % e.comment + change_report += "\n Old: %s" % (text[line - 1]) + + # Make underscore buffers for underlining where in the line the edit was + change_list = [" "] * len(text[line - 1]) + change_list_new = [" "] * len(text[line - 1]) + + # Iterate over each edit + for e in edits: + # Create effective start, end by accounting for change in length due + # to previous edits + start_eff = e.start + offset + end_eff = start_eff + len(e.old) + + # Make sure the edit is changing what it should be changing + old_actual = "".join(char_array[start_eff:end_eff]) + if old_actual != e.old: + raise ValueError("Expected text %r but got %r" % + ("".join(e.old), "".join(old_actual))) + # Make the edit + char_array[start_eff:end_eff] = list(e.new) + + # Create the underline highlighting of the before and after + change_list[e.start:e.start + len(e.old)] = "~" * len(e.old) + change_list_new[start_eff:end_eff] = "~" * len(e.new) + + # Keep track of how to generate effective ranges + offset += len(e.new) - len(e.old) + + # Finish the report comment + change_report += " %s\n" % "".join(change_list) + text[line - 1] = "".join(char_array) + change_report += " New: %s" % (text[line - 1]) + change_report += " %s\n\n" % "".join(change_list_new) + return "".join(text), change_report, self._errors + + def add(self, comment, line, start, old, new, error=None): + """Add a new change that is needed. + + Args: + comment: A description of what was changed + line: Line number (1 indexed) + start: Column offset (0 indexed) + old: old text + new: new text + error: if set, this "edit" cannot be made automatically and the + message is recorded as an error + Returns: + None + """ + + self._line_to_edit[line].append( + _FileEditTuple(comment, line, start, old, new)) + if error: + self._errors.append("%s:%d: %s" % (self._filename, line, error)) + + +class _ASTCallVisitor(ast.NodeVisitor): + """AST Visitor that processes function calls. + + Updates function calls from old API version to new API version using a given + change spec.
+ """ + + def __init__(self, filename, lines, api_change_spec): + self._filename = filename + self._file_edit = _FileEditRecorder(filename) + self._lines = lines + self._api_change_spec = api_change_spec + + def process(self, lines): + return self._file_edit.process(lines) + + def generic_visit(self, node): + ast.NodeVisitor.generic_visit(self, node) + + def _rename_functions(self, node, full_name): + function_renames = self._api_change_spec.function_renames + try: + new_name = function_renames[full_name] + self._file_edit.add("Renamed function %r to %r" % (full_name, + new_name), + node.lineno, node.col_offset, full_name, new_name) + except KeyError: + pass + + def _get_attribute_full_path(self, node): + """Traverse an attribute to generate a full name e.g. tf.foo.bar. + + Args: + node: A Node of type Attribute. + + Returns: + a '.'-delimited full-name or None if the tree was not a simple form. + i.e. `foo()+b).bar` returns None, while `a.b.c` would return "a.b.c". + """ + curr = node + items = [] + while not isinstance(curr, ast.Name): + if not isinstance(curr, ast.Attribute): + return None + items.append(curr.attr) + curr = curr.value + items.append(curr.id) + return ".".join(reversed(items)) + + def _find_true_position(self, node): + """Return correct line number and column offset for a given node. + + This is necessary mainly because ListComp's location reporting reports + the next token after the list comprehension list opening. + + Args: + node: Node for which we wish to know the lineno and col_offset + """ + import re + find_open = re.compile("^\s*(\\[).*$") + find_string_chars = re.compile("['\"]") + + if isinstance(node, ast.ListComp): + # Strangely, ast.ListComp returns the col_offset of the first token + # after the '[' token which appears to be a bug. Workaround by + # explicitly finding the real start of the list comprehension. + line = node.lineno + col = node.col_offset + # loop over lines + while 1: + # Reverse the text to and regular expression search for whitespace + text = self._lines[line-1] + reversed_preceding_text = text[:col][::-1] + # First find if a [ can be found with only whitespace between it and + # col. + m = find_open.match(reversed_preceding_text) + if m: + new_col_offset = col - m.start(1) - 1 + return line, new_col_offset + else: + if (reversed_preceding_text=="" or + reversed_preceding_text.isspace()): + line = line - 1 + prev_line = self._lines[line - 1] + # TODO(aselle): + # this is poor comment detection, but it is good enough for + # cases where the comment does not contain string literal starting/ + # ending characters. If ast gave us start and end locations of the + # ast nodes rather than just start, we could use string literal + # node ranges to filter out spurious #'s that appear in string + # literals. + comment_start = prev_line.find("#") + if comment_start == -1: + col = len(prev_line) -1 + elif find_string_chars.search(prev_line[comment_start:]) is None: + col = comment_start + else: + return None, None + else: + return None, None + # Most other nodes return proper locations (with notably does not), but + # it is not possible to use that in an argument. + return node.lineno, node.col_offset + + + def visit_Call(self, node): # pylint: disable=invalid-name + """Handle visiting a call node in the AST. + + Args: + node: Current Node + """ + + + # Find a simple attribute name path e.g. 
"tf.foo.bar" + full_name = self._get_attribute_full_path(node.func) + + # Make sure the func is marked as being part of a call + node.func.is_function_for_call = True + + if full_name: + # Call special handlers + function_handles = self._api_change_spec.function_handle + if full_name in function_handles: + function_handles[full_name](self._file_edit, node) + + # Examine any non-keyword argument and make it into a keyword argument + # if reordering required. + function_reorders = self._api_change_spec.function_reorders + function_keyword_renames = ( + self._api_change_spec.function_keyword_renames) + + if full_name in function_reorders: + reordered = function_reorders[full_name] + for idx, arg in enumerate(node.args): + lineno, col_offset = self._find_true_position(arg) + if lineno is None or col_offset is None: + self._file_edit.add( + "Failed to add keyword %r to reordered function %r" + % (reordered[idx], full_name), arg.lineno, arg.col_offset, + "", "", + error="A necessary keyword argument failed to be inserted.") + else: + keyword_arg = reordered[idx] + if (full_name in function_keyword_renames and + keyword_arg in function_keyword_renames[full_name]): + keyword_arg = function_keyword_renames[full_name][keyword_arg] + self._file_edit.add("Added keyword %r to reordered function %r" + % (reordered[idx], full_name), lineno, + col_offset, "", keyword_arg + "=") + + # Examine each keyword argument and convert it to the final renamed form + renamed_keywords = ({} if full_name not in function_keyword_renames else + function_keyword_renames[full_name]) + for keyword in node.keywords: + argkey = keyword.arg + argval = keyword.value + + if argkey in renamed_keywords: + argval_lineno, argval_col_offset = self._find_true_position(argval) + if argval_lineno is not None and argval_col_offset is not None: + # TODO(aselle): We should scan backward to find the start of the + # keyword key. Unfortunately ast does not give you the location of + # keyword keys, so we are forced to infer it from the keyword arg + # value. + key_start = argval_col_offset - len(argkey) - 1 + key_end = key_start + len(argkey) + 1 + if (self._lines[argval_lineno - 1][key_start:key_end] == + argkey + "="): + self._file_edit.add("Renamed keyword argument from %r to %r" % + (argkey, renamed_keywords[argkey]), + argval_lineno, + argval_col_offset - len(argkey) - 1, + argkey + "=", renamed_keywords[argkey] + "=") + continue + self._file_edit.add( + "Failed to rename keyword argument from %r to %r" % + (argkey, renamed_keywords[argkey]), + argval.lineno, + argval.col_offset - len(argkey) - 1, + "", "", + error="Failed to find keyword lexographically. Fix manually.") + + ast.NodeVisitor.generic_visit(self, node) + + def visit_Attribute(self, node): # pylint: disable=invalid-name + """Handle bare Attributes i.e. [tf.foo, tf.bar]. 
+ + Args: + node: Node that is of type ast.Attribute + """ + full_name = self._get_attribute_full_path(node) + if full_name: + self._rename_functions(node, full_name) + if full_name in self._api_change_spec.change_to_function: + if not hasattr(node, "is_function_for_call"): + new_text = full_name + "()" + self._file_edit.add("Changed %r to %r"%(full_name, new_text), + node.lineno, node.col_offset, full_name, new_text) + + ast.NodeVisitor.generic_visit(self, node) + + +class ASTCodeUpgrader(object): + """Handles upgrading a set of Python files using a given API change spec.""" + + def __init__(self, api_change_spec): + if not isinstance(api_change_spec, APIChangeSpec): + raise TypeError("Must pass APIChangeSpec to ASTCodeUpgrader, got %s" % + type(api_change_spec)) + self._api_change_spec = api_change_spec + + def process_file(self, in_filename, out_filename): + """Process the given python file for incompatible changes. + + Args: + in_filename: filename to parse + out_filename: output file to write to + Returns: + A tuple representing number of files processed, log of actions, errors + """ + + # Write to a temporary file, just in case we are doing an in-place modify. + with open(in_filename, "r") as in_file, \ + tempfile.NamedTemporaryFile("w", delete=False) as temp_file: + ret = self.process_opened_file( + in_filename, in_file, out_filename, temp_file) + + shutil.move(temp_file.name, out_filename) + return ret + + # Broad exceptions are required here because ast throws whatever it wants. + # pylint: disable=broad-except + def process_opened_file(self, in_filename, in_file, out_filename, out_file): + """Process the given python file for incompatible changes. + + This function is split out to facilitate StringIO testing from + tf_upgrade_test.py. + + Args: + in_filename: filename to parse + in_file: opened file (or StringIO) + out_filename: output file to write to + out_file: opened file (or StringIO) + Returns: + A tuple representing number of files processed, log of actions, errors + """ + process_errors = [] + text = "-" * 80 + "\n" + text += "Processing file %r\n outputting to %r\n" % (in_filename, + out_filename) + text += "-" * 80 + "\n\n" + + parsed_ast = None + lines = in_file.readlines() + try: + parsed_ast = ast.parse("".join(lines)) + except Exception: + text += "Failed to parse %r\n\n" % in_filename + text += traceback.format_exc() + if parsed_ast: + visitor = _ASTCallVisitor(in_filename, lines, self._api_change_spec) + visitor.visit(parsed_ast) + out_text, new_text, process_errors = visitor.process(lines) + text += new_text + if out_file: + out_file.write(out_text) + text += "\n" + return 1, text, process_errors + # pylint: enable=broad-except + + def process_tree(self, root_directory, output_root_directory, + copy_other_files): + """Processes upgrades on an entire tree of python files in place. + + Note that only Python files are processed. If you have custom code in + other languages, you will need to upgrade it manually. + + Args: + root_directory: Directory to walk and process. + output_root_directory: Directory to use as base. + copy_other_files: Copy files that are not touched by this converter. + + Returns: + A tuple of files processed, the report string for all files, and errors + + # make sure output directory doesn't exist + if output_root_directory and os.path.exists(output_root_directory): + print("Output directory %r must not already exist."
% ( + output_root_directory)) + sys.exit(1) + + # make sure output directory does not overlap with root_directory + norm_root = os.path.split(os.path.normpath(root_directory)) + norm_output = os.path.split(os.path.normpath(output_root_directory)) + if norm_root == norm_output: + print("Output directory %r same as input directory %r" % ( + root_directory, output_root_directory)) + sys.exit(1) + + # Collect list of files to process (we do this to correctly handle if the + # user puts the output directory in some sub directory of the input dir) + files_to_process = [] + files_to_copy = [] + for dir_name, _, file_list in os.walk(root_directory): + py_files = [f for f in file_list if f.endswith(".py")] + copy_files = [f for f in file_list if not f.endswith(".py")] + for filename in py_files: + fullpath = os.path.join(dir_name, filename) + fullpath_output = os.path.join( + output_root_directory, os.path.relpath(fullpath, root_directory)) + files_to_process.append((fullpath, fullpath_output)) + if copy_other_files: + for filename in copy_files: + fullpath = os.path.join(dir_name, filename) + fullpath_output = os.path.join( + output_root_directory, os.path.relpath(fullpath, root_directory)) + files_to_copy.append((fullpath, fullpath_output)) + + file_count = 0 + tree_errors = [] + report = "" + report += ("=" * 80) + "\n" + report += "Input tree: %r\n" % root_directory + report += ("=" * 80) + "\n" + + for input_path, output_path in files_to_process: + output_directory = os.path.dirname(output_path) + if not os.path.isdir(output_directory): + os.makedirs(output_directory) + file_count += 1 + _, l_report, l_errors = self.process_file(input_path, output_path) + tree_errors += l_errors + report += l_report + for input_path, output_path in files_to_copy: + output_directory = os.path.dirname(output_path) + if not os.path.isdir(output_directory): + os.makedirs(output_directory) + shutil.copy(input_path, output_path) + return file_count, report, tree_errors diff --git a/tensorflow/tools/compatibility/tf_upgrade.py b/tensorflow/tools/compatibility/tf_upgrade.py index bcff10f21d5..72fe4a48cdd 100644 --- a/tensorflow/tools/compatibility/tf_upgrade.py +++ b/tensorflow/tools/compatibility/tf_upgrade.py @@ -17,23 +17,23 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function + import argparse -import ast -import collections -import os -import shutil -import sys -import tempfile -import traceback + +from tensorflow.tools.compatibility import ast_edits -class APIChangeSpec(object): +class TFAPIChangeSpec(ast_edits.APIChangeSpec): """List of maps that describe what changed in the API.""" def __init__(self): # Maps from a function name to a dictionary that describes how to # map from an old argument keyword to the new argument keyword. 
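`TFAPIChangeSpec` (its maps continue in the hunk below) just fills in the fields that `ast_edits` consumes. A minimal hypothetical spec, with invented `mylib` names, showing the expected shape of each field:

```python
from tensorflow.tools.compatibility import ast_edits

class ToyAPIChangeSpec(ast_edits.APIChangeSpec):
    """Hypothetical spec: renames mylib.mul and reorders mylib.concat."""

    def __init__(self):
        # old keyword -> new keyword, per function
        self.function_keyword_renames = {"mylib.concat": {"concat_dim": "axis"}}
        # old function name -> new function name
        self.function_renames = {"mylib.mul": "mylib.multiply"}
        # names that changed from bare attributes to function calls
        self.change_to_function = set()
        # functions whose positional args must be turned into keywords
        self.function_reorders = {"mylib.concat": ["concat_dim", "values"]}
        # custom per-function handlers
        self.function_handle = {}

upgrader = ast_edits.ASTCodeUpgrader(ToyAPIChangeSpec())
```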
self.function_keyword_renames = { + "tf.batch_matmul": { + "adj_x": "adjoint_a", + "adj_y": "adjoint_b", + }, "tf.count_nonzero": { "reduction_indices": "axis" }, @@ -140,6 +140,7 @@ class APIChangeSpec(object): "tf.batch_svd": "tf.svd", "tf.batch_fft": "tf.fft", "tf.batch_ifft": "tf.ifft", + "tf.batch_fft2d": "tf.fft2d", "tf.batch_ifft2d": "tf.ifft2d", "tf.batch_fft3d": "tf.fft3d", "tf.batch_ifft3d": "tf.ifft3d", @@ -148,6 +149,7 @@ class APIChangeSpec(object): "tf.batch_matmul": "tf.matmul", "tf.pack": "tf.stack", "tf.unpack": "tf.unstack", + "tf.op_scope": "tf.name_scope", } self.change_to_function = { @@ -168,11 +170,14 @@ class APIChangeSpec(object): "tf.nn.sparse_softmax_cross_entropy_with_logits": [ "logits", "labels", "name"], "tf.nn.sigmoid_cross_entropy_with_logits": [ - "logits", "labels", "name"] + "logits", "labels", "name"], + "tf.op_scope": ["values", "name", "default_name"], } # Specially handled functions. - self.function_handle = {"tf.reverse": self._reverse_handler} + self.function_handle = { + "tf.reverse": self._reverse_handler + } @staticmethod def _reverse_handler(file_edit_recorder, node): @@ -189,437 +194,6 @@ class APIChangeSpec(object): error="tf.reverse requires manual check.") -class FileEditTuple(collections.namedtuple( - "FileEditTuple", ["comment", "line", "start", "old", "new"])): - """Each edit that is recorded by a FileEditRecorder. - - Fields: - comment: A description of the edit and why it was made. - line: The line number in the file where the edit occurs (1-indexed). - start: The line number in the file where the edit occurs (0-indexed). - old: text string to remove (this must match what was in file). - new: text string to add in place of `old`. - """ - - __slots__ = () - - -class FileEditRecorder(object): - """Record changes that need to be done to the file.""" - - def __init__(self, filename): - # all edits are lists of chars - self._filename = filename - - self._line_to_edit = collections.defaultdict(list) - self._errors = [] - - def process(self, text): - """Process a list of strings, each corresponding to the recorded changes. - - Args: - text: A list of lines of text (assumed to contain newlines) - Returns: - A tuple of the modified text and a textual description of what is done. - Raises: - ValueError: if substitution source location does not have expected text. - """ - - change_report = "" - - # Iterate of each line - for line, edits in self._line_to_edit.items(): - offset = 0 - # sort by column so that edits are processed in order in order to make - # indexing adjustments cumulative for changes that change the string - # length - edits.sort(key=lambda x: x.start) - - # Extract each line to a list of characters, because mutable lists - # are editable, unlike immutable strings. 
- char_array = list(text[line - 1]) - - # Record a description of the change - change_report += "%r Line %d\n" % (self._filename, line) - change_report += "-" * 80 + "\n\n" - for e in edits: - change_report += "%s\n" % e.comment - change_report += "\n Old: %s" % (text[line - 1]) - - # Make underscore buffers for underlining where in the line the edit was - change_list = [" "] * len(text[line - 1]) - change_list_new = [" "] * len(text[line - 1]) - - # Iterate for each edit - for e in edits: - # Create effective start, end by accounting for change in length due - # to previous edits - start_eff = e.start + offset - end_eff = start_eff + len(e.old) - - # Make sure the edit is changing what it should be changing - old_actual = "".join(char_array[start_eff:end_eff]) - if old_actual != e.old: - raise ValueError("Expected text %r but got %r" % - ("".join(e.old), "".join(old_actual))) - # Make the edit - char_array[start_eff:end_eff] = list(e.new) - - # Create the underline highlighting of the before and after - change_list[e.start:e.start + len(e.old)] = "~" * len(e.old) - change_list_new[start_eff:end_eff] = "~" * len(e.new) - - # Keep track of how to generate effective ranges - offset += len(e.new) - len(e.old) - - # Finish the report comment - change_report += " %s\n" % "".join(change_list) - text[line - 1] = "".join(char_array) - change_report += " New: %s" % (text[line - 1]) - change_report += " %s\n\n" % "".join(change_list_new) - return "".join(text), change_report, self._errors - - def add(self, comment, line, start, old, new, error=None): - """Add a new change that is needed. - - Args: - comment: A description of what was changed - line: Line number (1 indexed) - start: Column offset (0 indexed) - old: old text - new: new text - error: this "edit" is something that cannot be fixed automatically - Returns: - None - """ - - self._line_to_edit[line].append( - FileEditTuple(comment, line, start, old, new)) - if error: - self._errors.append("%s:%d: %s" % (self._filename, line, error)) - - -class TensorFlowCallVisitor(ast.NodeVisitor): - """AST Visitor that finds TensorFlow Function calls. - - Updates function calls from old API version to new API version. - """ - - def __init__(self, filename, lines): - self._filename = filename - self._file_edit = FileEditRecorder(filename) - self._lines = lines - self._api_change_spec = APIChangeSpec() - - def process(self, lines): - return self._file_edit.process(lines) - - def generic_visit(self, node): - ast.NodeVisitor.generic_visit(self, node) - - def _rename_functions(self, node, full_name): - function_renames = self._api_change_spec.function_renames - try: - new_name = function_renames[full_name] - self._file_edit.add("Renamed function %r to %r" % (full_name, - new_name), - node.lineno, node.col_offset, full_name, new_name) - except KeyError: - pass - - def _get_attribute_full_path(self, node): - """Traverse an attribute to generate a full name e.g. tf.foo.bar. - - Args: - node: A Node of type Attribute. - - Returns: - a '.'-delimited full-name or None if the tree was not a simple form. - i.e. `foo()+b).bar` returns None, while `a.b.c` would return "a.b.c". - """ - curr = node - items = [] - while not isinstance(curr, ast.Name): - if not isinstance(curr, ast.Attribute): - return None - items.append(curr.attr) - curr = curr.value - items.append(curr.id) - return ".".join(reversed(items)) - - def _find_true_position(self, node): - """Return correct line number and column offset for a given node. 
- - This is necessary mainly because ListComp's location reporting reports - the next token after the list comprehension list opening. - - Args: - node: Node for which we wish to know the lineno and col_offset - """ - import re - find_open = re.compile("^\s*(\\[).*$") - find_string_chars = re.compile("['\"]") - - if isinstance(node, ast.ListComp): - # Strangely, ast.ListComp returns the col_offset of the first token - # after the '[' token which appears to be a bug. Workaround by - # explicitly finding the real start of the list comprehension. - line = node.lineno - col = node.col_offset - # loop over lines - while 1: - # Reverse the text to and regular expression search for whitespace - text = self._lines[line-1] - reversed_preceding_text = text[:col][::-1] - # First find if a [ can be found with only whitespace between it and - # col. - m = find_open.match(reversed_preceding_text) - if m: - new_col_offset = col - m.start(1) - 1 - return line, new_col_offset - else: - if (reversed_preceding_text=="" or - reversed_preceding_text.isspace()): - line = line - 1 - prev_line = self._lines[line - 1] - # TODO(aselle): - # this is poor comment detection, but it is good enough for - # cases where the comment does not contain string literal starting/ - # ending characters. If ast gave us start and end locations of the - # ast nodes rather than just start, we could use string literal - # node ranges to filter out spurious #'s that appear in string - # literals. - comment_start = prev_line.find("#") - if comment_start == -1: - col = len(prev_line) -1 - elif find_string_chars.search(prev_line[comment_start:]) is None: - col = comment_start - else: - return None, None - else: - return None, None - # Most other nodes return proper locations (with notably does not), but - # it is not possible to use that in an argument. - return node.lineno, node.col_offset - - - def visit_Call(self, node): # pylint: disable=invalid-name - """Handle visiting a call node in the AST. - - Args: - node: Current Node - """ - - - # Find a simple attribute name path e.g. "tf.foo.bar" - full_name = self._get_attribute_full_path(node.func) - - # Make sure the func is marked as being part of a call - node.func.is_function_for_call = True - - if full_name and full_name.startswith("tf."): - # Call special handlers - function_handles = self._api_change_spec.function_handle - if full_name in function_handles: - function_handles[full_name](self._file_edit, node) - - # Examine any non-keyword argument and make it into a keyword argument - # if reordering required. 
- function_reorders = self._api_change_spec.function_reorders - function_keyword_renames = ( - self._api_change_spec.function_keyword_renames) - - if full_name in function_reorders: - reordered = function_reorders[full_name] - for idx, arg in enumerate(node.args): - lineno, col_offset = self._find_true_position(arg) - if lineno is None or col_offset is None: - self._file_edit.add( - "Failed to add keyword %r to reordered function %r" - % (reordered[idx], full_name), arg.lineno, arg.col_offset, - "", "", - error="A necessary keyword argument failed to be inserted.") - else: - keyword_arg = reordered[idx] - if (full_name in function_keyword_renames and - keyword_arg in function_keyword_renames[full_name]): - keyword_arg = function_keyword_renames[full_name][keyword_arg] - self._file_edit.add("Added keyword %r to reordered function %r" - % (reordered[idx], full_name), lineno, - col_offset, "", keyword_arg + "=") - - # Examine each keyword argument and convert it to the final renamed form - renamed_keywords = ({} if full_name not in function_keyword_renames else - function_keyword_renames[full_name]) - for keyword in node.keywords: - argkey = keyword.arg - argval = keyword.value - - if argkey in renamed_keywords: - argval_lineno, argval_col_offset = self._find_true_position(argval) - if (argval_lineno is not None and argval_col_offset is not None): - # TODO(aselle): We should scan backward to find the start of the - # keyword key. Unfortunately ast does not give you the location of - # keyword keys, so we are forced to infer it from the keyword arg - # value. - key_start = argval_col_offset - len(argkey) - 1 - key_end = key_start + len(argkey) + 1 - if self._lines[argval_lineno - 1][key_start:key_end] == argkey + "=": - self._file_edit.add("Renamed keyword argument from %r to %r" % - (argkey, renamed_keywords[argkey]), - argval_lineno, - argval_col_offset - len(argkey) - 1, - argkey + "=", renamed_keywords[argkey] + "=") - continue - self._file_edit.add( - "Failed to rename keyword argument from %r to %r" % - (argkey, renamed_keywords[argkey]), - argval.lineno, - argval.col_offset - len(argkey) - 1, - "", "", - error="Failed to find keyword lexographically. Fix manually.") - - ast.NodeVisitor.generic_visit(self, node) - - def visit_Attribute(self, node): # pylint: disable=invalid-name - """Handle bare Attributes i.e. [tf.foo, tf.bar]. - - Args: - node: Node that is of type ast.Attribute - """ - full_name = self._get_attribute_full_path(node) - if full_name and full_name.startswith("tf."): - self._rename_functions(node, full_name) - if full_name in self._api_change_spec.change_to_function: - if not hasattr(node, "is_function_for_call"): - new_text = full_name + "()" - self._file_edit.add("Changed %r to %r"%(full_name, new_text), - node.lineno, node.col_offset, full_name, new_text) - - ast.NodeVisitor.generic_visit(self, node) - - -class TensorFlowCodeUpgrader(object): - """Class that handles upgrading a set of Python files to TensorFlow 1.0.""" - - def __init__(self): - pass - - def process_file(self, in_filename, out_filename): - """Process the given python file for incompatible changes. - - Args: - in_filename: filename to parse - out_filename: output file to write to - Returns: - A tuple representing number of files processed, log of actions, errors - """ - - # Write to a temporary file, just in case we are doing an implace modify. 
- with open(in_filename, "r") as in_file, \ - tempfile.NamedTemporaryFile("w", delete=False) as temp_file: - ret = self.process_opened_file( - in_filename, in_file, out_filename, temp_file) - - shutil.move(temp_file.name, out_filename) - return ret - - # Broad exceptions are required here because ast throws whatever it wants. - # pylint: disable=broad-except - def process_opened_file(self, in_filename, in_file, out_filename, out_file): - """Process the given python file for incompatible changes. - - This function is split out to facilitate StringIO testing from - tf_upgrade_test.py. - - Args: - in_filename: filename to parse - in_file: opened file (or StringIO) - out_filename: output file to write to - out_file: opened file (or StringIO) - Returns: - A tuple representing number of files processed, log of actions, errors - """ - process_errors = [] - text = "-" * 80 + "\n" - text += "Processing file %r\n outputting to %r\n" % (in_filename, - out_filename) - text += "-" * 80 + "\n\n" - - parsed_ast = None - lines = in_file.readlines() - try: - parsed_ast = ast.parse("".join(lines)) - except Exception: - text += "Failed to parse %r\n\n" % in_filename - text += traceback.format_exc() - if parsed_ast: - visitor = TensorFlowCallVisitor(in_filename, lines) - visitor.visit(parsed_ast) - out_text, new_text, process_errors = visitor.process(lines) - text += new_text - if out_file: - out_file.write(out_text) - text += "\n" - return 1, text, process_errors - # pylint: enable=broad-except - - def process_tree(self, root_directory, output_root_directory): - """Processes upgrades on an entire tree of python files in place. - - Note that only Python files. If you have custom code in other languages, - you will need to manually upgrade those. - - Args: - root_directory: Directory to walk and process. - output_root_directory: Directory to use as base - Returns: - A tuple of files processed, the report string ofr all files, and errors - """ - - # make sure output directory doesn't exist - if output_root_directory and os.path.exists(output_root_directory): - print("Output directory %r must not already exist." 
% ( - output_root_directory)) - sys.exit(1) - - # make sure output directory does not overlap with root_directory - norm_root = os.path.split(os.path.normpath(root_directory)) - norm_output = os.path.split(os.path.normpath(output_root_directory)) - if norm_root == norm_output: - print("Output directory %r same as input directory %r" % ( - root_directory, output_root_directory)) - sys.exit(1) - - # Collect list of files to process (we do this to correctly handle if the - # user puts the output directory in some sub directory of the input dir) - files_to_process = [] - for dir_name, _, file_list in os.walk(root_directory): - py_files = [f for f in file_list if f.endswith(".py")] - for filename in py_files: - fullpath = os.path.join(dir_name, filename) - fullpath_output = os.path.join( - output_root_directory, os.path.relpath(fullpath, root_directory)) - files_to_process.append((fullpath, fullpath_output)) - - file_count = 0 - tree_errors = [] - report = "" - report += ("=" * 80) + "\n" - report += "Input tree: %r\n" % root_directory - report += ("=" * 80) + "\n" - - for input_path, output_path in files_to_process: - output_directory = os.path.dirname(output_path) - if not os.path.isdir(output_directory): - os.makedirs(output_directory) - file_count += 1 - _, l_report, l_errors = self.process_file(input_path, output_path) - tree_errors += l_errors - report += l_report - return file_count, report, tree_errors - - if __name__ == "__main__": parser = argparse.ArgumentParser( formatter_class=argparse.RawDescriptionHelpFormatter, @@ -648,6 +222,13 @@ Simple usage: dest="output_tree", help="If converting a whole tree of files, the output " "directory (relative or absolute).") + parser.add_argument( + "--copyotherfiles", + dest="copy_other_files", + help=("If converting a whole tree of files, whether to " + "copy the other files."), + type=bool, + default=False) parser.add_argument( "--reportfile", dest="report_filename", @@ -657,7 +238,7 @@ Simple usage: default="report.txt") args = parser.parse_args() - upgrade = TensorFlowCodeUpgrader() + upgrade = ast_edits.ASTCodeUpgrader(TFAPIChangeSpec()) report_text = None report_filename = args.report_filename files_processed = 0 @@ -667,7 +248,7 @@ Simple usage: files_processed = 1 elif args.input_tree: files_processed, report_text, errors = upgrade.process_tree( - args.input_tree, args.output_tree) + args.input_tree, args.output_tree, args.copy_other_files) else: parser.print_help() if report_text: diff --git a/tensorflow/tools/compatibility/tf_upgrade_test.py b/tensorflow/tools/compatibility/tf_upgrade_test.py index de4e3de73cd..ac838a2791f 100644 --- a/tensorflow/tools/compatibility/tf_upgrade_test.py +++ b/tensorflow/tools/compatibility/tf_upgrade_test.py @@ -22,6 +22,7 @@ import tempfile import six from tensorflow.python.framework import test_util from tensorflow.python.platform import test as test_lib +from tensorflow.tools.compatibility import ast_edits from tensorflow.tools.compatibility import tf_upgrade @@ -36,7 +37,7 @@ class TestUpgrade(test_util.TensorFlowTestCase): def _upgrade(self, old_file_text): in_file = six.StringIO(old_file_text) out_file = six.StringIO() - upgrader = tf_upgrade.TensorFlowCodeUpgrader() + upgrader = ast_edits.ASTCodeUpgrader(tf_upgrade.TFAPIChangeSpec()) count, report, errors = ( upgrader.process_opened_file("test.py", in_file, "test_out.py", out_file)) @@ -139,7 +140,7 @@ class TestUpgradeFiles(test_util.TensorFlowTestCase): upgraded = "tf.multiply(a, b)\n" temp_file.write(original) temp_file.close() - upgrader = 
tf_upgrade.TensorFlowCodeUpgrader() + upgrader = ast_edits.ASTCodeUpgrader(tf_upgrade.TFAPIChangeSpec()) upgrader.process_file(temp_file.name, temp_file.name) self.assertAllEqual(open(temp_file.name).read(), upgraded) os.unlink(temp_file.name) diff --git a/tensorflow/tools/dist_test/Dockerfile b/tensorflow/tools/dist_test/Dockerfile index 65d7e1717e7..83bbeeca8a9 100644 --- a/tensorflow/tools/dist_test/Dockerfile +++ b/tensorflow/tools/dist_test/Dockerfile @@ -23,7 +23,7 @@ FROM ubuntu:16.04 MAINTAINER Shanqing Cai RUN apt-get update -RUN apt-get install -y --no-install-recommends \ +RUN apt-get install -y \ curl \ python \ python-numpy \ diff --git a/tensorflow/tools/dist_test/local_test.sh b/tensorflow/tools/dist_test/local_test.sh index f9f37ff0e11..7d7f92d246e 100755 --- a/tensorflow/tools/dist_test/local_test.sh +++ b/tensorflow/tools/dist_test/local_test.sh @@ -70,7 +70,7 @@ get_container_id_by_image_name() { # Get the id of a container by image name # Usage: get_docker_container_id_by_image_name - echo $(docker ps | grep $1 | awk '{print $1}') + docker ps | grep $1 | awk '{print $1}' } # Parse input arguments @@ -151,6 +151,8 @@ rm -rf "${BUILD_DIR}" # Run docker image for test. docker run ${DOCKER_IMG_NAME} \ /var/tf_dist_test/scripts/dist_mnist_test.sh \ - --ps_hosts "localhost:2000,localhost:2001" \ - --worker_hosts "localhost:3000,localhost:3001" \ + --ps_hosts $(seq -f "localhost:%g" -s "," \ + 2000 $((2000 + NUM_PARAMETER_SERVERS - 1))) \ + --worker_hosts $(seq -f "localhost:%g" -s "," \ + 3000 $((3000 + NUM_WORKERS - 1))) \ --num_gpus 0 ${SYNC_REPLICAS_FLAG} diff --git a/tensorflow/tools/dist_test/python/census_widendeep.py b/tensorflow/tools/dist_test/python/census_widendeep.py index db56a687f6b..3a557814960 100644 --- a/tensorflow/tools/dist_test/python/census_widendeep.py +++ b/tensorflow/tools/dist_test/python/census_widendeep.py @@ -133,7 +133,7 @@ class CensusDataSource(object): columns: Columns to retrieve from the data files (A list of strings) label_column: Name of the label column categorical_columns: Names of the categorical columns (A list of strings) - continuous_columns: Names of the continuous columsn (A list of strings) + continuous_columns: Names of the continuous columns (A list of strings) """ # Retrieve data from disk (if available) or download from the web. diff --git a/tensorflow/tools/dist_test/python/mnist_replica.py b/tensorflow/tools/dist_test/python/mnist_replica.py index 7e68258b0a0..f7dbfea7fb0 100644 --- a/tensorflow/tools/dist_test/python/mnist_replica.py +++ b/tensorflow/tools/dist_test/python/mnist_replica.py @@ -16,9 +16,9 @@ """Distributed MNIST training and validation, with model replicas. A simple softmax model with one hidden layer is defined. The parameters -(weights and biases) are located on two parameter servers (ps), while the -ops are defined on a worker node. The TF sessions also run on the worker -node. +(weights and biases) are located on one parameter server (ps), while the ops +are executed on two worker nodes by default. The TF sessions also run on the +worker node. Multiple invocations of this script can be done in parallel, with different values for --task_index. 
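The updated test above shows the new entry point: build an `ASTCodeUpgrader` from a spec and feed it file-like objects. As a standalone sketch of that flow, using the `tf.mul` to `tf.multiply` rename the test exercises:

```python
import six
from tensorflow.tools.compatibility import ast_edits
from tensorflow.tools.compatibility import tf_upgrade

# Upgrade a snippet in memory, exactly as the updated test does.
in_file = six.StringIO("tf.mul(a, b)\n")
out_file = six.StringIO()
upgrader = ast_edits.ASTCodeUpgrader(tf_upgrade.TFAPIChangeSpec())
count, report, errors = upgrader.process_opened_file(
    "test.py", in_file, "test_out.py", out_file)
print(out_file.getvalue())  # expected: tf.multiply(a, b)
```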
There should be exactly one invocation with --task_index, which will create a master session that carries out variable diff --git a/tensorflow/tools/dist_test/scripts/BUILD b/tensorflow/tools/dist_test/scripts/BUILD new file mode 100644 index 00000000000..c329f0bbe87 --- /dev/null +++ b/tensorflow/tools/dist_test/scripts/BUILD @@ -0,0 +1,22 @@ +# Tools for running distributed benchmarks. + +licenses(["notice"]) # Apache 2.0 + +exports_files(["k8s_tensorflow.py"]) + +py_library( + name = "k8s_tensorflow_lib", + srcs = ["k8s_tensorflow_lib.py"], + srcs_version = "PY2AND3", +) + +py_test( + name = "k8s_tensorflow_test", + size = "small", + srcs = ["k8s_tensorflow_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":k8s_tensorflow_lib", + "//tensorflow/python:client_testlib", + ], +) diff --git a/tensorflow/tools/dist_test/scripts/k8s_tensorflow.py b/tensorflow/tools/dist_test/scripts/k8s_tensorflow.py index 854c6b832a7..b325f030e36 100755 --- a/tensorflow/tools/dist_test/scripts/k8s_tensorflow.py +++ b/tensorflow/tools/dist_test/scripts/k8s_tensorflow.py @@ -25,6 +25,8 @@ from __future__ import print_function import argparse import sys +import k8s_tensorflow_lib + # Note: It is intentional that we do not import tensorflow in this script. The # machine that launches a TensorFlow k8s cluster does not have to have the # Python package of TensorFlow installed on it. @@ -33,127 +35,12 @@ import sys DEFAULT_DOCKER_IMAGE = 'tensorflow/tf_grpc_test_server' DEFAULT_PORT = 2222 -# TODO(cais): Consider adding resource requests/limits to the pods. - -# Worker pods will mount host volume /shared, as a convenient way to create -# shared storage among workers during local tests. -WORKER_RC = ( - """apiVersion: v1 -kind: ReplicationController -metadata: - name: tf-worker{worker_id} -spec: - replicas: 1 - template: - metadata: - labels: - tf-worker: "{worker_id}" - spec: - containers: - - name: tf-worker{worker_id} - image: {docker_image} - args: - - --cluster_spec={cluster_spec} - - --job_name=worker - - --task_id={worker_id} - ports: - - containerPort: {port} - volumeMounts: - - name: shared - mountPath: /shared - volumes: - - name: shared - hostPath: - path: /shared -""") -WORKER_SVC = ( - """apiVersion: v1 -kind: Service -metadata: - name: tf-worker{worker_id} - labels: - tf-worker: "{worker_id}" -spec: - ports: - - port: {port} - targetPort: {port} - selector: - tf-worker: "{worker_id}" -""") -WORKER_LB_SVC = ( - """apiVersion: v1 -kind: Service -metadata: - name: tf-worker{worker_id} - labels: - tf-worker: "{worker_id}" -spec: - type: LoadBalancer - ports: - - port: {port} - selector: - tf-worker: "{worker_id}" -""") -PARAM_SERVER_RC = ( - """apiVersion: v1 -kind: ReplicationController -metadata: - name: tf-ps{param_server_id} -spec: - replicas: 1 - template: - metadata: - labels: - tf-ps: "{param_server_id}" - spec: - containers: - - name: tf-ps{param_server_id} - image: {docker_image} - args: - - --cluster_spec={cluster_spec} - - --job_name=ps - - --task_id={param_server_id} - ports: - - containerPort: {port} - volumeMounts: - - name: shared - mountPath: /shared - volumes: - - name: shared - hostPath: - path: /shared -""") -PARAM_SERVER_SVC = ( - """apiVersion: v1 -kind: Service -metadata: - name: tf-ps{param_server_id} - labels: - tf-ps: "{param_server_id}" -spec: - ports: - - port: {port} - selector: - tf-ps: "{param_server_id}" -""") -PARAM_LB_SVC = ("""apiVersion: v1 -kind: Service -metadata: - name: tf-ps{param_server_id} - labels: - tf-ps: "{param_server_id}" -spec: - type: LoadBalancer - 
ports: - - port: {port} - selector: - tf-ps: "{param_server_id}" -""") - def main(): """Do arg parsing.""" parser = argparse.ArgumentParser() + parser.register( + 'type', 'bool', lambda v: v.lower() in ('true', 't', 'y', 'yes')) parser.add_argument('--num_workers', type=int, default=2, @@ -167,7 +54,7 @@ def main(): default=DEFAULT_PORT, help='GRPC server port (Default: %d)' % DEFAULT_PORT) parser.add_argument('--request_load_balancer', - type=bool, + type='bool', default=False, help='To request worker0 to be exposed on a public IP ' 'address via an external load balancer, enabling you to ' @@ -177,6 +64,16 @@ def main(): default=DEFAULT_DOCKER_IMAGE, help='Override default docker image for the TensorFlow ' 'GRPC server') + parser.add_argument('--name_prefix', + type=str, + default='tf', + help='Prefix for job names. Jobs will be named as ' + '<name_prefix>-worker|ps') + parser.add_argument('--use_shared_volume', + type='bool', + default=True, + help='Whether to mount /shared directory from host to ' + 'the pod') args = parser.parse_args() if args.num_workers <= 0: @@ -190,88 +87,17 @@ def main(): sys.exit(1) # Generate contents of yaml config - yaml_config = GenerateConfig(args.num_workers, - args.num_parameter_servers, - args.grpc_port, - args.request_load_balancer, - args.docker_image) + yaml_config = k8s_tensorflow_lib.GenerateConfig( + args.num_workers, + args.num_parameter_servers, + args.grpc_port, + args.request_load_balancer, + args.docker_image, + args.name_prefix, + env_vars=None, + use_shared_volume=args.use_shared_volume) print(yaml_config) # pylint: disable=superfluous-parens -def GenerateConfig(num_workers, - num_param_servers, - port, - request_load_balancer, - docker_image): - """Generate configuration strings.""" - config = '' - for worker in range(num_workers): - config += WORKER_RC.format( - port=port, - worker_id=worker, - docker_image=docker_image, - cluster_spec=WorkerClusterSpecString(num_workers, - num_param_servers, - port)) - config += '---\n' - if request_load_balancer: - config += WORKER_LB_SVC.format(port=port, - worker_id=worker) - else: - config += WORKER_SVC.format(port=port, - worker_id=worker) - config += '---\n' - - for param_server in range(num_param_servers): - config += PARAM_SERVER_RC.format( - port=port, - param_server_id=param_server, - docker_image=docker_image, - cluster_spec=ParamServerClusterSpecString(num_workers, - num_param_servers, - port)) - config += '---\n' - if request_load_balancer: - config += PARAM_LB_SVC.format(port=port, param_server_id=param_server) - else: - config += PARAM_SERVER_SVC.format(port=port, param_server_id=param_server) - config += '---\n' - - return config - - -def WorkerClusterSpecString(num_workers, - num_param_servers, - port): - """Generates worker cluster spec.""" - return ClusterSpecString(num_workers, num_param_servers, port) - - -def ParamServerClusterSpecString(num_workers, - num_param_servers, - port): - """Generates parameter server spec.""" - return ClusterSpecString(num_workers, num_param_servers, port) - - -def ClusterSpecString(num_workers, - num_param_servers, - port): - """Generates general cluster spec.""" - spec = 'worker|' - for worker in range(num_workers): - spec += 'tf-worker%d:%d' % (worker, port) - if worker != num_workers-1: - spec += ';' - - spec += ',ps|' - for param_server in range(num_param_servers): - spec += 'tf-ps%d:%d' % (param_server, port) - if param_server != num_param_servers-1: - spec += ';' - - return spec - - if __name__ == '__main__': main() diff --git 
a/tensorflow/tools/dist_test/scripts/k8s_tensorflow_lib.py b/tensorflow/tools/dist_test/scripts/k8s_tensorflow_lib.py new file mode 100644 index 00000000000..8adbe387ba3 --- /dev/null +++ b/tensorflow/tools/dist_test/scripts/k8s_tensorflow_lib.py @@ -0,0 +1,309 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Generates YAML configuration files for distributed TensorFlow workers. + +The workers will be run in a Kubernetes (k8s) container cluster. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Note: It is intentional that we do not import tensorflow in this script. The +# machine that launches a TensorFlow k8s cluster does not have to have the +# Python package of TensorFlow installed on it. + +# TODO(cais): Consider adding resource requests/limits to the pods. + +# Worker pods will mount host volume /shared, as a convenient way to create +# shared storage among workers during local tests. +WORKER_RC = ( + """apiVersion: v1 +kind: ReplicationController +metadata: + name: {name_prefix}-worker{worker_id} +spec: + replicas: 1 + template: + metadata: + labels: + tf-worker: "{worker_id}" + name-prefix: "{name_prefix}" + job: "worker" + spec: + containers: + - name: tf-worker{worker_id} + image: {docker_image} + args: [{args}] + ports: + - containerPort: {port} + env: [{env_vars}] + volumeMounts: [{volume_mounts}] + volumes: [{volumes}] +""") +WORKER_SVC = ( + """apiVersion: v1 +kind: Service +metadata: + name: {name_prefix}-worker{worker_id} + labels: + tf-worker: "{worker_id}" +spec: + ports: + - port: {port} + targetPort: {port} + selector: + tf-worker: "{worker_id}" +""") +WORKER_LB_SVC = ( + """apiVersion: v1 +kind: Service +metadata: + name: {name_prefix}-worker{worker_id} + labels: + tf-worker: "{worker_id}" +spec: + type: LoadBalancer + ports: + - port: {port} + selector: + tf-worker: "{worker_id}" +""") +PARAM_SERVER_RC = ( + """apiVersion: v1 +kind: ReplicationController +metadata: + name: {name_prefix}-ps{param_server_id} +spec: + replicas: 1 + template: + metadata: + labels: + tf-ps: "{param_server_id}" + name-prefix: "{name_prefix}" + job: "ps" + spec: + containers: + - name: tf-ps{param_server_id} + image: {docker_image} + args: [{args}] + ports: + - containerPort: {port} + env: [{env_vars}] + volumeMounts: [{volume_mounts}] + volumes: [{volumes}] +""") +PARAM_SERVER_SVC = ( + """apiVersion: v1 +kind: Service +metadata: + name: {name_prefix}-ps{param_server_id} + labels: + tf-ps: "{param_server_id}" +spec: + ports: + - port: {port} + selector: + tf-ps: "{param_server_id}" +""") +PARAM_LB_SVC = ("""apiVersion: v1 +kind: Service +metadata: + name: {name_prefix}-ps{param_server_id} + labels: + tf-ps: "{param_server_id}" +spec: + type: LoadBalancer + ports: + - port: {port} + selector: + tf-ps: "{param_server_id}" +""") +VOLUME_MOUNTS = '{name: shared, mountPath: 
/shared}' +VOLUMES = '{name: shared, hostPath: {path: /shared}}' +_ENV_VAR_TEMPLATE = '{name: "%s", value: "%s"}' +_ARG_TEMPLATE = '"--%s=%s"' + + +def GenerateConfig(num_workers, + num_param_servers, + port, + request_load_balancer, + docker_image, + name_prefix, + env_vars=None, + use_shared_volume=True, + use_cluster_spec=True): + """Generate configuration strings. + + Args: + num_workers: number of worker jobs. + num_param_servers: number of ps server jobs. + port: GRPC server port. + request_load_balancer: request worker0 to be exposed on a public IP + address via an external load balancer. + docker_image: docker image to use. + name_prefix: name to prepend to pod job names. + env_vars: dictionary of environment variables to set. + use_shared_volume: whether to add hostPath to /shared directory + to the kubernetes config. + use_cluster_spec: if true, pass --cluster_spec to worker and ps jobs. + If false, pass --worker_hosts and --ps_hosts to worker and ps jobs. + + Returns: + Kubernetes yaml config. + """ + if env_vars is None: + env_vars = {} + env_str = ', '.join([_ENV_VAR_TEMPLATE % (name, value) + for name, value in env_vars.items()]) + config = '' + common_args = GetCommonArgs( + num_workers, num_param_servers, port, name_prefix, use_cluster_spec) + for worker in range(num_workers): + worker_args = { + 'job_name': 'worker', + 'task_id': worker + } + worker_args.update(common_args) + arg_str = ', '.join([_ARG_TEMPLATE % (name, value) + for name, value in worker_args.items()]) + config += WORKER_RC.format( + port=port, + worker_id=worker, + docker_image=docker_image, + name_prefix=name_prefix, + volume_mounts=VOLUME_MOUNTS if use_shared_volume else '', + volumes=VOLUMES if use_shared_volume else '', + args=arg_str, + env_vars=env_str) + config += '---\n' + if request_load_balancer: + config += WORKER_LB_SVC.format(port=port, + worker_id=worker, + name_prefix=name_prefix) + else: + config += WORKER_SVC.format(port=port, + worker_id=worker, + name_prefix=name_prefix) + config += '---\n' + + for param_server in range(num_param_servers): + ps_args = { + 'job_name': 'ps', + 'task_id': param_server + } + ps_args.update(common_args) + arg_str = ', '.join([_ARG_TEMPLATE % (name, value) + for name, value in ps_args.items()]) + config += PARAM_SERVER_RC.format( + port=port, + param_server_id=param_server, + docker_image=docker_image, + name_prefix=name_prefix, + volume_mounts=VOLUME_MOUNTS if use_shared_volume else '', + volumes=VOLUMES if use_shared_volume else '', + args=arg_str, + env_vars=env_str) + config += '---\n' + if request_load_balancer: + config += PARAM_LB_SVC.format( + port=port, param_server_id=param_server, name_prefix=name_prefix) + else: + config += PARAM_SERVER_SVC.format( + port=port, param_server_id=param_server, name_prefix=name_prefix) + config += '---\n' + + return config + + +def WorkerClusterSpecString(num_workers, + num_param_servers, + port, + name_prefix): + """Generates worker cluster spec.""" + return ClusterSpecString(num_workers, num_param_servers, port, name_prefix) + + +def ParamServerClusterSpecString(num_workers, + num_param_servers, + port, + name_prefix): + """Generates parameter server spec.""" + return ClusterSpecString(num_workers, num_param_servers, port, + name_prefix) + + +def ClusterSpecString(num_workers, + num_param_servers, + port, + name_prefix): + """Generates general cluster spec.""" + spec = 'worker|' + for worker in range(num_workers): + spec += '%s-worker%d:%d' % (name_prefix, worker, port) + if worker != num_workers-1: + spec += ';' + + 
spec += ',ps|' + for param_server in range(num_param_servers): + spec += '%s-ps%d:%d' % (name_prefix, param_server, port) + if param_server != num_param_servers-1: + spec += ';' + + return spec + + +def GetCommonArgs(num_workers, + num_param_servers, + port, + name_prefix, + use_cluster_spec): + """Get arguments common to both worker and ps jobs. + + Args: + num_workers: number of workers. + num_param_servers: number of ps servers. + port: worker and ps port number. + name_prefix: prefix to prepend to job names. + use_cluster_spec: if true, pass --cluster_spec argument. + If false, pass --worker_hosts and --ps_hosts arguments. + + Returns: + A dictionary of argument names mapping to argument values. + """ + common_args = {} + if use_cluster_spec: + common_args['cluster_spec'] = WorkerClusterSpecString( + num_workers, + num_param_servers, + port, + name_prefix) + else: + common_args['worker_hosts'] = WorkerHosts(num_workers, port, name_prefix) + common_args['ps_hosts'] = PsHosts(num_param_servers, port, name_prefix) + return common_args + + +def WorkerHosts(num_workers, port, name_prefix): + """Returns a comma-separated list of worker host:port strings.""" + worker_hosts = ['%s-worker%d:%d' % (name_prefix, i, port) + for i in range(num_workers)] + return ','.join(worker_hosts) + + +def PsHosts(num_ps, port, name_prefix): + """Returns a comma-separated list of ps host:port strings.""" + ps_hosts = ['%s-ps%d:%d' % (name_prefix, i, port) + for i in range(num_ps)] + return ','.join(ps_hosts) diff --git a/tensorflow/tools/dist_test/scripts/k8s_tensorflow_test.py b/tensorflow/tools/dist_test/scripts/k8s_tensorflow_test.py new file mode 100644 index 00000000000..7d9b3f83f51 --- /dev/null +++ b/tensorflow/tools/dist_test/scripts/k8s_tensorflow_test.py @@ -0,0 +1,132 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for tensorflow.tools.dist_test.scripts.k8s_tensorflow_lib.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.platform import googletest +from tensorflow.tools.dist_test.scripts import k8s_tensorflow_lib + + +class K8sTensorflowTest(googletest.TestCase): + + def testGenerateConfig_LoadBalancer(self): + # Use loadbalancer + config = k8s_tensorflow_lib.GenerateConfig( + num_workers=1, + num_param_servers=1, + port=5000, + request_load_balancer=True, + docker_image='test_image', + name_prefix='abc', + use_shared_volume=False) + self.assertTrue('LoadBalancer' in config) + + # Don't use loadbalancer + config = k8s_tensorflow_lib.GenerateConfig( + num_workers=1, + num_param_servers=1, + port=5000, + request_load_balancer=False, + docker_image='test_image', + name_prefix='abc', + use_shared_volume=False) + self.assertFalse('LoadBalancer' in config) + + def testGenerateConfig_SharedVolume(self): + # Use shared directory + config = k8s_tensorflow_lib.GenerateConfig( + num_workers=1, + num_param_servers=1, + port=5000, + request_load_balancer=False, + docker_image='test_image', + name_prefix='abc', + use_shared_volume=True) + self.assertTrue('/shared' in config) + + # Don't use shared directory + config = k8s_tensorflow_lib.GenerateConfig( + num_workers=1, + num_param_servers=1, + port=5000, + request_load_balancer=False, + docker_image='test_image', + name_prefix='abc', + use_shared_volume=False) + self.assertFalse('/shared' in config) + + def testEnvVar(self): + # Use loadbalancer + config = k8s_tensorflow_lib.GenerateConfig( + num_workers=1, + num_param_servers=1, + port=5000, + request_load_balancer=True, + docker_image='test_image', + name_prefix='abc', + use_shared_volume=False, + env_vars={'test1': 'test1_value', 'test2': 'test2_value'}) + self.assertTrue('{name: "test1", value: "test1_value"}' in config) + self.assertTrue('{name: "test2", value: "test2_value"}' in config) + + def testClusterSpec(self): + # Use cluster_spec + config = k8s_tensorflow_lib.GenerateConfig( + num_workers=1, + num_param_servers=1, + port=5000, + request_load_balancer=True, + docker_image='test_image', + name_prefix='abc', + use_shared_volume=False, + use_cluster_spec=True) + self.assertFalse('worker_hosts' in config) + self.assertFalse('ps_hosts' in config) + self.assertTrue( + '"--cluster_spec=worker|abc-worker0:5000,ps|abc-ps0:5000"' in config) + + # Don't use cluster_spec + config = k8s_tensorflow_lib.GenerateConfig( + num_workers=1, + num_param_servers=1, + port=5000, + request_load_balancer=True, + docker_image='test_image', + name_prefix='abc', + use_shared_volume=False, + use_cluster_spec=False) + self.assertFalse('cluster_spec' in config) + self.assertTrue('"--worker_hosts=abc-worker0:5000"' in config) + self.assertTrue('"--ps_hosts=abc-ps0:5000"' in config) + + def testWorkerHosts(self): + self.assertEquals( + 'test_prefix-worker0:1234', + k8s_tensorflow_lib.WorkerHosts(1, 1234, 'test_prefix')) + self.assertEquals( + 'test_prefix-worker0:1234,test_prefix-worker1:1234', + k8s_tensorflow_lib.WorkerHosts(2, 1234, 'test_prefix')) + + def testPsHosts(self): + self.assertEquals( + 'test_prefix-ps0:1234,test_prefix-ps1:1234', + k8s_tensorflow_lib.PsHosts(2, 1234, 'test_prefix')) + + +if __name__ == '__main__': + googletest.main() diff --git a/tensorflow/tools/dist_test/server/BUILD b/tensorflow/tools/dist_test/server/BUILD index 
25efc83716e..865af8dd7b2 100644 --- a/tensorflow/tools/dist_test/server/BUILD +++ b/tensorflow/tools/dist_test/server/BUILD @@ -7,7 +7,9 @@ licenses(["notice"]) # Apache 2.0 exports_files(["LICENSE"]) -py_library( +load("//tensorflow:tensorflow.bzl", "py_test") + +py_binary( name = "grpc_tensorflow_server", srcs = [ "grpc_tensorflow_server.py", diff --git a/tensorflow/tools/dist_test/server/Dockerfile b/tensorflow/tools/dist_test/server/Dockerfile index 4b13b814e39..fabc8a7105e 100644 --- a/tensorflow/tools/dist_test/server/Dockerfile +++ b/tensorflow/tools/dist_test/server/Dockerfile @@ -17,7 +17,7 @@ # # To build the image, use ../build_server.sh -FROM ubuntu:14.04 +FROM ubuntu:16.04 MAINTAINER Shanqing Cai diff --git a/tensorflow/tools/dist_test/server/Dockerfile.test b/tensorflow/tools/dist_test/server/Dockerfile.test index e2feb2227bb..908af8af9bb 100644 --- a/tensorflow/tools/dist_test/server/Dockerfile.test +++ b/tensorflow/tools/dist_test/server/Dockerfile.test @@ -17,7 +17,7 @@ # # To build the image, use ../build_server.sh --test -FROM ubuntu:14.04 +FROM ubuntu:16.04 MAINTAINER Shanqing Cai @@ -52,13 +52,13 @@ ADD . /var/tf-k8s # Download MNIST data for tests RUN mkdir -p /tmp/mnist-data RUN curl -o /tmp/mnist-data/train-labels-idx1-ubyte.gz \ - http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz + https://storage.googleapis.com/cvdf-datasets/mnist/train-labels-idx1-ubyte.gz RUN curl -o /tmp/mnist-data/train-images-idx3-ubyte.gz \ - http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz + https://storage.googleapis.com/cvdf-datasets/mnist/train-images-idx3-ubyte.gz RUN curl -o /tmp/mnist-data/t10k-labels-idx1-ubyte.gz \ - http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz + https://storage.googleapis.com/cvdf-datasets/mnist/t10k-labels-idx1-ubyte.gz RUN curl -o /tmp/mnist-data/t10k-images-idx3-ubyte.gz \ - http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz + https://storage.googleapis.com/cvdf-datasets/mnist/t10k-images-idx3-ubyte.gz # Download Census data for Wide & Deep test RUN mkdir -p /tmp/census-data diff --git a/tensorflow/tools/dist_test/server/grpc_tensorflow_server.py b/tensorflow/tools/dist_test/server/grpc_tensorflow_server.py old mode 100755 new mode 100644 index 2d774577b6d..bd6700a0b1f --- a/tensorflow/tools/dist_test/server/grpc_tensorflow_server.py +++ b/tensorflow/tools/dist_test/server/grpc_tensorflow_server.py @@ -36,6 +36,7 @@ from __future__ import print_function import argparse import sys +from tensorflow.core.protobuf import config_pb2 from tensorflow.core.protobuf import tensorflow_server_pb2 from tensorflow.python.platform import app from tensorflow.python.training import server_lib @@ -103,8 +104,11 @@ def main(unused_args): raise ValueError("Invalid task_id: %d" % FLAGS.task_id) server_def.task_index = FLAGS.task_id + config = config_pb2.ConfigProto(gpu_options=config_pb2.GPUOptions( + per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction)) + # Create GRPC Server instance - server = server_lib.Server(server_def) + server = server_lib.Server(server_def, config=config) # join() is blocking, unlike start() server.join() @@ -137,6 +141,11 @@ if __name__ == "__main__": default=0, help="Task index, e.g., 0" ) + parser.add_argument( + "--gpu_memory_fraction", + type=float, + default=1.0, + help="Fraction of GPU memory allocated",) parser.add_argument( "--verbose", type="bool", @@ -145,5 +154,6 @@ if __name__ == "__main__": default=False, help="Verbose mode" ) + FLAGS, unparsed = parser.parse_known_args() app.run(main=main, 
argv=[sys.argv[0]] + unparsed) diff --git a/tensorflow/tools/docker/Dockerfile b/tensorflow/tools/docker/Dockerfile index 4f00696be59..5b3f1f936a4 100644 --- a/tensorflow/tools/docker/Dockerfile +++ b/tensorflow/tools/docker/Dockerfile @@ -1,4 +1,4 @@ -FROM ubuntu:14.04 +FROM ubuntu:16.04 MAINTAINER Craig Citro @@ -66,4 +66,4 @@ EXPOSE 8888 WORKDIR "/notebooks" -CMD ["/run_jupyter.sh"] +CMD ["/run_jupyter.sh", "--allow-root"] diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel index 8cd6ee6f331..38a67f80aae 100644 --- a/tensorflow/tools/docker/Dockerfile.devel +++ b/tensorflow/tools/docker/Dockerfile.devel @@ -1,4 +1,4 @@ -FROM ubuntu:14.04 +FROM ubuntu:16.04 MAINTAINER Craig Citro @@ -17,6 +17,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ unzip \ zip \ zlib1g-dev \ + openjdk-8-jdk \ + openjdk-8-jre-headless \ && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* @@ -46,34 +48,21 @@ COPY run_jupyter.sh / # Set up Bazel. -# We need to add a custom PPA to pick up JDK8, since trusty doesn't -# have an openjdk8 backport. openjdk-r is maintained by a reliable contributor: -# Matthias Klose (https://launchpad.net/~doko). It will do until -# we either update the base image beyond 14.04 or openjdk-8 is -# finally backported to trusty; see e.g. -# https://bugs.launchpad.net/trusty-backports/+bug/1368094 -RUN add-apt-repository -y ppa:openjdk-r/ppa && \ - apt-get update && \ - apt-get install -y --no-install-recommends openjdk-8-jdk openjdk-8-jre-headless && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - # Running bazel inside a `docker build` command causes trouble, cf: # https://github.com/bazelbuild/bazel/issues/134 # The easiest solution is to set up a bazelrc file forcing --batch. -RUN echo "startup --batch" >>/root/.bazelrc +RUN echo "startup --batch" >>/etc/bazel.bazelrc # Similarly, we need to workaround sandboxing issues: # https://github.com/bazelbuild/bazel/issues/418 RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \ - >>/root/.bazelrc -ENV BAZELRC /root/.bazelrc + >>/etc/bazel.bazelrc # Install the most recent bazel release. 
-ENV BAZEL_VERSION 0.4.2 +ENV BAZEL_VERSION 0.5.0 WORKDIR / RUN mkdir /bazel && \ cd /bazel && \ - curl -fSsL -O https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-installer-linux-x86_64.sh && \ - curl -fSsL -o /bazel/LICENSE.txt https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE.txt && \ + curl -H "User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36" -fSsL -O https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-installer-linux-x86_64.sh && \ + curl -H "User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36" -fSsL -o /bazel/LICENSE.txt https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE && \ chmod +x bazel-*.sh && \ ./bazel-$BAZEL_VERSION-installer-linux-x86_64.sh && \ cd / && \ @@ -83,7 +72,7 @@ RUN mkdir /bazel && \ RUN git clone https://github.com/tensorflow/tensorflow.git && \ cd tensorflow && \ - git checkout r1.0 + git checkout r1.2 WORKDIR /tensorflow # TODO(craigcitro): Don't install the pip package, since it makes it @@ -93,7 +82,8 @@ WORKDIR /tensorflow ENV CI_BUILD_PYTHON python RUN tensorflow/tools/ci_build/builds/configured CPU \ - bazel build -c opt tensorflow/tools/pip_package:build_pip_package && \ + bazel build -c opt --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" \ + tensorflow/tools/pip_package:build_pip_package && \ bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/pip && \ pip --no-cache-dir install --upgrade /tmp/pip/tensorflow-*.whl && \ rm -rf /tmp/pip && \ diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu index a3ccf919179..d0a038a9db6 100644 --- a/tensorflow/tools/docker/Dockerfile.devel-gpu +++ b/tensorflow/tools/docker/Dockerfile.devel-gpu @@ -1,4 +1,4 @@ -FROM nvidia/cuda:8.0-cudnn5-devel +FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu16.04 MAINTAINER Craig Citro @@ -17,6 +17,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ unzip \ zip \ zlib1g-dev \ + openjdk-8-jdk \ + openjdk-8-jre-headless \ && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* @@ -46,34 +48,21 @@ COPY run_jupyter.sh / # Set up Bazel. -# We need to add a custom PPA to pick up JDK8, since trusty doesn't -# have an openjdk8 backport. openjdk-r is maintained by a reliable contributor: -# Matthias Klose (https://launchpad.net/~doko). It will do until -# we either update the base image beyond 14.04 or openjdk-8 is -# finally backported to trusty; see e.g. -# https://bugs.launchpad.net/trusty-backports/+bug/1368094 -RUN add-apt-repository -y ppa:openjdk-r/ppa && \ - apt-get update && \ - apt-get install -y --no-install-recommends openjdk-8-jdk openjdk-8-jre-headless && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - # Running bazel inside a `docker build` command causes trouble, cf: # https://github.com/bazelbuild/bazel/issues/134 # The easiest solution is to set up a bazelrc file forcing --batch. -RUN echo "startup --batch" >>/root/.bazelrc +RUN echo "startup --batch" >>/etc/bazel.bazelrc # Similarly, we need to workaround sandboxing issues: # https://github.com/bazelbuild/bazel/issues/418 RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \ - >>/root/.bazelrc -ENV BAZELRC /root/.bazelrc + >>/etc/bazel.bazelrc # Install the most recent bazel release. 
-ENV BAZEL_VERSION 0.4.2 +ENV BAZEL_VERSION 0.5.0 WORKDIR / RUN mkdir /bazel && \ cd /bazel && \ - curl -fSsL -O https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-installer-linux-x86_64.sh && \ - curl -fSsL -o /bazel/LICENSE.txt https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE.txt && \ + curl -H "User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36" -fSsL -O https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-installer-linux-x86_64.sh && \ + curl -H "User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36" -fSsL -o /bazel/LICENSE.txt https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE && \ chmod +x bazel-*.sh && \ ./bazel-$BAZEL_VERSION-installer-linux-x86_64.sh && \ cd / && \ @@ -83,7 +72,7 @@ RUN mkdir /bazel && \ RUN git clone https://github.com/tensorflow/tensorflow.git && \ cd tensorflow && \ - git checkout r1.0 + git checkout r1.2 WORKDIR /tensorflow # Configure the build for our CUDA configuration. @@ -93,7 +82,8 @@ ENV TF_NEED_CUDA 1 ENV TF_CUDA_COMPUTE_CAPABILITIES=3.0,3.5,5.2,6.0,6.1 RUN tensorflow/tools/ci_build/builds/configured GPU \ - bazel build -c opt --config=cuda tensorflow/tools/pip_package:build_pip_package && \ + bazel build -c opt --config=cuda --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" \ + tensorflow/tools/pip_package:build_pip_package && \ bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/pip && \ pip --no-cache-dir install --upgrade /tmp/pip/tensorflow-*.whl && \ rm -rf /tmp/pip && \ diff --git a/tensorflow/tools/docker/Dockerfile.gpu b/tensorflow/tools/docker/Dockerfile.gpu index 77113c1d828..3ba1e963f92 100644 --- a/tensorflow/tools/docker/Dockerfile.gpu +++ b/tensorflow/tools/docker/Dockerfile.gpu @@ -1,4 +1,4 @@ -FROM nvidia/cuda:8.0-cudnn5-devel +FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu16.04 MAINTAINER Craig Citro @@ -69,4 +69,4 @@ EXPOSE 8888 WORKDIR "/notebooks" -CMD ["/run_jupyter.sh"] +CMD ["/run_jupyter.sh", "--allow-root"] diff --git a/tensorflow/tools/docker/README.md b/tensorflow/tools/docker/README.md index 77fd8fc0d4f..6d5a9bdc4ce 100644 --- a/tensorflow/tools/docker/README.md +++ b/tensorflow/tools/docker/README.md @@ -10,16 +10,16 @@ General installation instructions are quick links here: * [OSX](https://www.docker.com/products/docker#/mac) -* [ubuntu](https://docs.docker.com/engine/installation/linux/ubuntulinux/) +* [Ubuntu](https://docs.docker.com/engine/installation/linux/ubuntulinux/) ## Which containers exist? -We currently maintain three Docker container images: +We currently maintain two Docker container images: * `gcr.io/tensorflow/tensorflow` - TensorFlow with all dependencies - CPU only! * `gcr.io/tensorflow/tensorflow:latest-gpu` - TensorFlow with all dependencies - and support for Nvidia Cuda + and support for NVidia CUDA Note: We also publish the same containers into [Docker Hub](https://hub.docker.com/r/tensorflow/tensorflow/tags/). @@ -37,9 +37,9 @@ For GPU support install NVidia drivers (ideally latest) and $ nvidia-docker run -it -p 8888:8888 gcr.io/tensorflow/tensorflow:latest-gpu -Note: If you would have a problem running nvidia-docker you may try the old way -we have used. But it is not recomended. If you find a bug in nvidia-docker report -it there please and try using the nvidia-docker as described above. 
+Note: If you have trouble running nvidia-docker, you may try the old method +we have used, but it is not recommended. If you find a bug in nvidia-docker, please report +it there and try using nvidia-docker as described above. $ export CUDA_SO=$(\ls /usr/lib/x86_64-linux-gnu/libcuda.* | xargs -I{} echo '-v {}:{}') $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') @@ -49,11 +49,35 @@ it there please and try using the nvidia-docker as described above. ## More containers See all available [tags](https://hub.docker.com/r/tensorflow/tensorflow/tags/) -for additional containers like release candidates or nighlty builds. +for additional containers, such as release candidates or nightly builds. ## Rebuilding the containers -Just pick the dockerfile corresponding to the container you want to build, and run +Building TensorFlow Docker containers should be done through the +[parameterized_docker_build.sh](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/docker/parameterized_docker_build.sh) +script. The raw Dockerfiles should not be used directly as they contain strings +to be replaced by the script during the build. - $ docker build --pull -t $USER/tensorflow-suffix -f Dockerfile.suffix . +To use the script, specify the container type (`CPU` vs. `GPU`), the desired +Python version (`PYTHON2` vs. `PYTHON3`), and whether the developer Docker image +is to be built (`NO` vs. `YES`). In addition, you need to specify the central +location from which the TensorFlow pip package will be downloaded. + +For example, to build a CPU-only non-developer Docker image for Python 2, using +TensorFlow's nightly pip package: + +``` bash +export TF_DOCKER_BUILD_IS_DEVEL=NO +export TF_DOCKER_BUILD_TYPE=CPU +export TF_DOCKER_BUILD_PYTHON_VERSION=PYTHON2 + +export NIGHTLY_VERSION="1.head" +export TF_DOCKER_BUILD_CENTRAL_PIP=$(echo ${TF_DOCKER_BUILD_PYTHON_VERSION} | sed s^PYTHON2^http://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=${TF_DOCKER_BUILD_PYTHON_VERSION},label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-${NIGHTLY_VERSION}-cp27-cp27mu-manylinux1_x86_64.whl^ | sed s^PYTHON3^http://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-${NIGHTLY_VERSION}-cp35-cp35m-manylinux1_x86_64.whl^) + +tensorflow/tools/docker/parameterized_docker_build.sh +``` + +If successful, the image will be tagged as `${USER}/tensorflow:latest` by default. + +Rebuilding GPU images requires [nvidia-docker](https://github.com/NVIDIA/nvidia-docker). 
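The README example above generalizes to GPU images. A minimal sketch, assuming the same script and flag conventions and leaving the wheel URL as an explicit placeholder (it varies by nightly job and is not a verified link):

```bash
# Hypothetical GPU build using the parameterized_docker_build.sh flags
# documented above. TF_DOCKER_BUILD_CENTRAL_PIP must point at a GPU wheel;
# the value below is a placeholder, not a real URL.
export TF_DOCKER_BUILD_IS_DEVEL=NO
export TF_DOCKER_BUILD_TYPE=GPU
export TF_DOCKER_BUILD_PYTHON_VERSION=PYTHON2
export TF_DOCKER_BUILD_CENTRAL_PIP="<url-of-a-tensorflow-gpu-wheel>"

tensorflow/tools/docker/parameterized_docker_build.sh
```

The resulting image would then be started with `nvidia-docker run`, as shown earlier in this README.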
diff --git a/tensorflow/tools/docker/jupyter_notebook_config.py b/tensorflow/tools/docker/jupyter_notebook_config.py index 6b1ebc3ee0a..747beb8251e 100644 --- a/tensorflow/tools/docker/jupyter_notebook_config.py +++ b/tensorflow/tools/docker/jupyter_notebook_config.py @@ -22,5 +22,10 @@ c.MultiKernelManager.default_kernel_name = 'python2' # sets a password if PASSWORD is set in the environment if 'PASSWORD' in os.environ: - c.NotebookApp.password = passwd(os.environ['PASSWORD']) + password = os.environ['PASSWORD'] + if password: + c.NotebookApp.password = passwd(password) + else: + c.NotebookApp.password = '' + c.NotebookApp.token = '' del os.environ['PASSWORD'] diff --git a/tensorflow/tools/docker/notebooks/1_hello_tensorflow.ipynb b/tensorflow/tools/docker/notebooks/1_hello_tensorflow.ipynb index c0b9f10b2eb..0633b03259a 100644 --- a/tensorflow/tools/docker/notebooks/1_hello_tensorflow.ipynb +++ b/tensorflow/tools/docker/notebooks/1_hello_tensorflow.ipynb @@ -72,7 +72,6 @@ }, "outputs": [ { - "metadata": {}, "name": "stdout", "output_type": "stream", "text": [ @@ -136,7 +135,6 @@ }, "outputs": [ { - "metadata": {}, "name": "stdout", "output_type": "stream", "text": [ @@ -181,7 +179,6 @@ }, "outputs": [ { - "metadata": {}, "name": "stdout", "output_type": "stream", "text": [ @@ -278,7 +275,6 @@ }, "outputs": [ { - "metadata": {}, "name": "stdout", "output_type": "stream", "text": [ @@ -343,7 +339,6 @@ }, "outputs": [ { - "metadata": {}, "name": "stdout", "output_type": "stream", "text": [ @@ -425,7 +420,6 @@ }, "outputs": [ { - "metadata": {}, "name": "stdout", "output_type": "stream", "text": [ @@ -512,7 +506,6 @@ }, "outputs": [ { - "metadata": {}, "name": "stdout", "output_type": "stream", "text": [ @@ -604,7 +597,6 @@ }, "outputs": [ { - "metadata": {}, "name": "stdout", "output_type": "stream", "text": [ diff --git a/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb b/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb index b35b14df1fd..c9f2b1ab9ef 100644 --- a/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb +++ b/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb @@ -134,7 +134,7 @@ "import os\n", "from six.moves.urllib.request import urlretrieve\n", "\n", - "SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/'\n", + "SOURCE_URL = 'https://storage.googleapis.com/cvdf-datasets/mnist/'\n", "WORK_DIRECTORY = \"/tmp/mnist-data\"\n", "\n", "def maybe_download(filename):\n", diff --git a/tensorflow/tools/docker/parameterized_docker_build.sh b/tensorflow/tools/docker/parameterized_docker_build.sh index 35c12184700..ea88d8165f4 100755 --- a/tensorflow/tools/docker/parameterized_docker_build.sh +++ b/tensorflow/tools/docker/parameterized_docker_build.sh @@ -64,7 +64,7 @@ # # TF_DOCKER_BUILD_OPTIONS # (Optional) -# Specifices the desired build options. Defaults to OPT. +# Specifies the desired build options. Defaults to OPT. # Script directory SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" @@ -233,13 +233,16 @@ if [[ "${TF_DOCKER_BUILD_IS_DEVEL}" == "no" ]]; then # Modify python/pip version if necessary. 
if [[ "${TF_DOCKER_BUILD_PYTHON_VERSION}" == "python3" ]]; then - sed -i -e 's/python /python3 /g' "${DOCKERFILE}" && \ + if sed -i -e 's/python /python3 /g' "${DOCKERFILE}" && \ sed -i -e 's/python-dev/python3-dev/g' "${DOCKERFILE}" && \ sed -i -e 's/pip /pip3 /g' "${DOCKERFILE}" && \ - sed -i -e 's^# RUN ln -s /usr/bin/python3 /usr/bin/python#^RUN ln -s /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}" && \ - echo "Modified Dockerfile for python version "\ -"${TF_DOCKER_BUILD_PYTHON_VERSION} at: ${DOCKERFILE}" || \ - die "FAILED to modify ${DOCKERFILE} for python3" + sed -i -e 's^# RUN ln -s /usr/bin/python3 /usr/bin/python#^RUN ln -s /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}" + then + echo "Modified Dockerfile for python version "\ +"${TF_DOCKER_BUILD_PYTHON_VERSION} at: ${DOCKERFILE}" + else + die "FAILED to modify ${DOCKERFILE} for python3" + fi fi else DOCKERFILE="${TMP_DIR}/Dockerfile" @@ -250,14 +253,17 @@ else # Modify python/pip version if necessary. if [[ "${TF_DOCKER_BUILD_PYTHON_VERSION}" == "python3" ]]; then - sed -i -e 's/python-dev/python-dev python3-dev/g' "${DOCKERFILE}" && \ + if sed -i -e 's/python-dev/python-dev python3-dev/g' "${DOCKERFILE}" && \ sed -i -e 's/python /python3 /g' "${DOCKERFILE}" && \ sed -i -e 's^/tmp/pip^/tmp/pip3^g' "${DOCKERFILE}" && \ sed -i -e 's/pip /pip3 /g' "${DOCKERFILE}" && \ sed -i -e 's/ENV CI_BUILD_PYTHON python/ENV CI_BUILD_PYTHON python3/g' "${DOCKERFILE}" && \ - sed -i -e 's^# RUN ln -s /usr/bin/python3 /usr/bin/python#^RUN ln -s /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}" && \ - echo "Modified Dockerfile further for python version ${TF_DOCKER_BUILD_PYTHON_VERSION} at: ${DOCKERFILE}" || \ - die "FAILED to modify ${DOCKERFILE} for python3" + sed -i -e 's^# RUN ln -s /usr/bin/python3 /usr/bin/python#^RUN ln -s /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}" + then + echo "Modified Dockerfile further for python version ${TF_DOCKER_BUILD_PYTHON_VERSION} at: ${DOCKERFILE}" + else + die "FAILED to modify ${DOCKERFILE} for python3" + fi fi fi @@ -266,7 +272,7 @@ fi IMG="${USER}/tensorflow:${FINAL_TAG}" echo "Building docker image with image name and tag: ${IMG}" -"${DOCKER_BINARY}" build --no-cache -t "${IMG}" -f "${DOCKERFILE}" "${TMP_DIR}" +"${DOCKER_BINARY}" build --no-cache --pull -t "${IMG}" -f "${DOCKERFILE}" "${TMP_DIR}" if [[ $? == "0" ]]; then echo "${DOCKER_BINARY} build of ${IMG} succeeded" else @@ -277,7 +283,7 @@ fi # Make sure that there is no other containers of the same image running # TODO(cais): Move to an earlier place. -if [[ ! -z $("${DOCKER_BINARY}" ps | grep "${IMG}") ]]; then +if "${DOCKER_BINARY}" ps | grep -q "${IMG}"; then die "ERROR: It appears that there are docker containers of the image "\ "${IMG} running. Please stop them before proceeding" fi @@ -310,16 +316,22 @@ if [[ "${TF_DOCKER_BUILD_IS_DEVEL}" == "no" ]]; then # on the running docker container echo "" echo "Performing basic sanity checks on the running container..." 
- wget -qO- "http://127.0.0.1:${CONTAINER_PORT}/tree" &> /dev/null && \ - echo " PASS: wget tree" || \ - mark_check_failed " FAIL: wget tree" + if wget -qO- "http://127.0.0.1:${CONTAINER_PORT}/tree" &> /dev/null + then + echo " PASS: wget tree" + else + mark_check_failed " FAIL: wget tree" + fi for NB in ${TMP_DIR}/notebooks/*.ipynb; do NB_BASENAME=$(basename "${NB}") NB_URL="http://127.0.0.1:${CONTAINER_PORT}/notebooks/${NB_BASENAME}" - wget -qO- "${NB_URL}" -o "${TMP_DIR}/${NB_BASENAME}" &> /dev/null && \ - echo " PASS: wget ${NB_URL}" || \ - mark_check_failed " FAIL: wget ${NB_URL}" + if wget -qO- "${NB_URL}" -o "${TMP_DIR}/${NB_BASENAME}" &> /dev/null + then + echo " PASS: wget ${NB_URL}" + else + mark_check_failed " FAIL: wget ${NB_URL}" + fi done fi diff --git a/tensorflow/tools/docs/BUILD b/tensorflow/tools/docs/BUILD index f321354eb58..8e27b133c2f 100644 --- a/tensorflow/tools/docs/BUILD +++ b/tensorflow/tools/docs/BUILD @@ -9,12 +9,7 @@ package( default_visibility = ["//tensorflow:__subpackages__"], ) -py_binary( - name = "gen_cc_md", - srcs = ["gen_cc_md.py"], - srcs_version = "PY2AND3", - deps = ["//tensorflow:tensorflow_py"], -) +load("//tensorflow:tensorflow.bzl", "py_test") py_library( name = "doc_generator_visitor", @@ -39,87 +34,110 @@ py_test( py_library( name = "parser", - srcs = [ - "parser.py", - ], + srcs = ["parser.py"], srcs_version = "PY2AND3", + visibility = ["//visibility:public"], ) py_test( name = "parser_test", size = "small", - srcs = [ - "parser_test.py", - ], + srcs = ["parser_test.py"], srcs_version = "PY2AND3", + tags = ["manual"], deps = [ ":parser", "//tensorflow/python:platform_test", ], ) +py_library( + name = "pretty_docs", + srcs = ["pretty_docs.py"], + srcs_version = "PY2AND3", +) + +py_binary( + name = "generate_lib", + srcs = ["generate_lib.py"], + srcs_version = "PY2AND3", + visibility = ["//visibility:public"], + deps = [ + ":doc_generator_visitor", + ":parser", + ":pretty_docs", + ":py_guide_parser", + "//tensorflow/contrib/ffmpeg:ffmpeg_ops_py", + "//tensorflow/tools/common:public_api", + "//tensorflow/tools/common:traverse", + ], +) + +py_test( + name = "generate_lib_test", + size = "small", + srcs = ["generate_lib_test.py"], + srcs_version = "PY2AND3", + tags = ["manual"], + deps = [ + ":generate_lib", + ":parser", + "//tensorflow:tensorflow_py", + "//tensorflow/python:platform_test", + "//tensorflow/python/debug:debug_py", + ], +) + py_binary( name = "generate", srcs = ["generate.py"], srcs_version = "PY2AND3", deps = [ + ":generate_lib", "//tensorflow:tensorflow_py", - "//tensorflow/tools/common:public_api", - "//tensorflow/tools/common:traverse", - "//tensorflow/tools/docs:doc_generator_visitor", - "//tensorflow/tools/docs:parser", + "//tensorflow/python/debug:debug_py", ], ) py_test( - name = "generate_test", + name = "build_docs_test", size = "small", - srcs = [ - "generate_test.py", - ], + srcs = ["build_docs_test.py"], + data = ["//tensorflow:docs_src"], srcs_version = "PY2AND3", tags = ["manual"], deps = [ - ":generate", - "//tensorflow/python:platform_test", + ":generate_lib", + "//tensorflow:tensorflow_py", + "//tensorflow/python/debug:debug_py", ], ) py_binary( - name = "make_py_guides", - srcs = ["make_py_guides.py"], + name = "generate_1_0", + srcs = ["generate_1_0.py"], srcs_version = "PY2AND3", deps = [ - "//tensorflow/tools/docs:generate", - "//tensorflow/tools/docs:parser", + ":generate_lib", + "//tensorflow:tensorflow_py", + "//tensorflow/python/debug:debug_py", ], ) -filegroup( - name = "doxy_config", - srcs = 
["tf-doxy_for_md-config"], +py_library( + name = "py_guide_parser", + srcs = ["py_guide_parser.py"], + srcs_version = "PY2AND3", ) -sh_binary( - name = "gen_docs", - srcs = ["gen_docs.sh"], - data = [ - ":doxy_config", - ":gen_cc_md", - "//tensorflow/python:gen_docs_combined", - ], -) - -sh_test( - name = "gen_docs_test", +py_test( + name = "py_guide_parser_test", size = "small", - srcs = [ - "gen_docs_test.sh", - ], - data = [ - ":gen_docs", - "//tensorflow/core:all_files", - "//tensorflow/python:all_files", + srcs = ["py_guide_parser_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":py_guide_parser", + "//tensorflow/python:client_testlib", ], ) diff --git a/tensorflow/tools/docs/build_docs_test.py b/tensorflow/tools/docs/build_docs_test.py new file mode 100644 index 00000000000..d28dd93b9a8 --- /dev/null +++ b/tensorflow/tools/docs/build_docs_test.py @@ -0,0 +1,51 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Run the python doc generator and fail if there are any broken links.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +import tensorflow as tf +from tensorflow.python import debug as tf_debug +from tensorflow.python.platform import googletest +from tensorflow.python.platform import resource_loader +from tensorflow.tools.docs import generate_lib + + +class Flags(object): + resource_root = resource_loader.get_root_dir_with_all_resources() + src_dir = os.path.join(resource_root, 'third_party/tensorflow/docs_src') + base_dir = os.path.join(resource_root, 'third_party/tensorflow/') + output_dir = googletest.GetTempDir() + + +class BuildDocsTest(googletest.TestCase): + + def testBuildDocs(self): + doc_generator = generate_lib.DocGenerator() + + doc_generator.set_py_modules([('tf', tf), ('tfdbg', tf_debug)]) + + status = doc_generator.build(Flags()) + + if status: + self.fail('Found %s Errors!' % status) + + +if __name__ == '__main__': + googletest.main() diff --git a/tensorflow/tools/docs/doc_generator_visitor.py b/tensorflow/tools/docs/doc_generator_visitor.py index d4ff33a0726..259a4694fdc 100644 --- a/tensorflow/tools/docs/doc_generator_visitor.py +++ b/tensorflow/tools/docs/doc_generator_visitor.py @@ -18,17 +18,36 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import inspect - import six +from tensorflow.python.util import tf_inspect + class DocGeneratorVisitor(object): """A visitor that generates docs for a python object when __call__ed.""" - def __init__(self): + def __init__(self, root_name=''): + """Make a visitor. + + As this visitor is starting its traversal at a module or class, it will not + be told the name of that object during traversal. `root_name` is the name it + should use for that object, effectively prefixing all names with + "root_name.". 
+ + Args: + root_name: The name of the root module/class. + """ + self.set_root_name(root_name) self._index = {} self._tree = {} + self._reverse_index = None + self._duplicates = None + self._duplicate_of = None + + def set_root_name(self, root_name): + """Sets the root name for subsequent __call__s.""" + self._root_name = root_name or '' + self._prefix = (root_name + '.') if root_name else '' @property def index(self): @@ -53,6 +72,56 @@ class DocGeneratorVisitor(object): """ return self._tree + @property + def reverse_index(self): + """A map from `id(object)` to the preferred fully qualified name. + + This map only contains non-primitive objects (no numbers or strings) present + in `index` (for primitive objects, `id()` doesn't quite do the right thing). + + It is computed when it, `duplicate_of`, or `duplicates` are first accessed. + + Returns: + The `id(object)` to full name map. + """ + self._maybe_find_duplicates() + return self._reverse_index + + @property + def duplicate_of(self): + """A map from duplicate full names to a preferred fully qualified name. + + This map only contains names that are not themselves a preferred name. + + It is computed when it, `reverse_index`, or `duplicates` are first accessed. + + Returns: + The map from duplicate name to preferred name. + """ + self._maybe_find_duplicates() + return self._duplicate_of + + @property + def duplicates(self): + """A map from preferred full names to a list of all names for this symbol. + + This function returns a map from preferred (master) name for a symbol to a + lexicographically sorted list of all aliases for that name (incl. the master + name). Symbols without duplicate names do not appear in this map. + + It is computed when it, `reverse_index`, or `duplicate_of` are first + accessed. + + Returns: + The map from master name to list of all duplicate names. + """ + self._maybe_find_duplicates() + return self._duplicates + + def _add_prefix(self, name): + """Adds the root name to a name.""" + return self._prefix + name if name else self._root_name + def __call__(self, parent_name, parent, children): """Visitor interface, see `tensorflow/tools/common:traverse` for details. @@ -64,42 +133,48 @@ class DocGeneratorVisitor(object): parent_name: The fully qualified name of a symbol found during traversal. parent: The Python object referenced by `parent_name`. children: A list of `(name, py_object)` pairs enumerating, in alphabetical - order, the children (as determined by `inspect.getmembers`) of `parent`. - `name` is the local name of `py_object` in `parent`. + order, the children (as determined by `tf_inspect.getmembers`) of + `parent`. `name` is the local name of `py_object` in `parent`. Raises: RuntimeError: If this visitor is called with a `parent` that is not a class or module. 
""" + parent_name = self._add_prefix(parent_name) self._index[parent_name] = parent self._tree[parent_name] = [] - if inspect.ismodule(parent): - print('module %s: %r' % (parent_name, parent)) - elif inspect.isclass(parent): - print('class %s: %r' % (parent_name, parent)) - else: - raise RuntimeError('Unexpected type in visitor -- %s: %r' % - (parent_name, parent)) + if not (tf_inspect.ismodule(parent) or tf_inspect.isclass(parent)): + raise RuntimeError('Unexpected type in visitor -- %s: %r' % (parent_name, + parent)) + + for i, (name, child) in enumerate(list(children)): + # Don't document __metaclass__ + if name in ['__metaclass__']: + del children[i] + continue - for name, child in children: full_name = '.'.join([parent_name, name]) if parent_name else name self._index[full_name] = child self._tree[parent_name].append(name) - def find_duplicates(self): + def _maybe_find_duplicates(self): """Compute data structures containing information about duplicates. Find duplicates in `index` and decide on one to be the "master" name. - Returns a map `duplicate_of` from aliases to their master name (the master - name itself has no entry in this map), and a map `duplicates` from master - names to a lexicographically sorted list of all aliases for that name (incl. - the master name). + Computes a reverse_index mapping each object id to its master name. - Returns: - A tuple `(duplicate_of, duplicates)` as described above. + Also computes a map `duplicate_of` from aliases to their master name (the + master name itself has no entry in this map), and a map `duplicates` from + master names to a lexicographically sorted list of all aliases for that name + (incl. the master name). + + All these are computed and set as fields if they haven't already. """ + if self._reverse_index is not None: + return + # Maps the id of a symbol to its fully qualified name. For symbols that have # several aliases, this map contains the first one found. # We use id(py_object) to get a hashable value for py_object. Note all @@ -110,15 +185,13 @@ class DocGeneratorVisitor(object): # maps the first name found to a list of all duplicate names. raw_duplicates = {} for full_name, py_object in six.iteritems(self._index): - # We cannot use the duplicate mechanism for constants, since e.g., + # We cannot use the duplicate mechanism for some constants, since e.g., # id(c1) == id(c2) with c1=1, c2=1. This is unproblematic since constants # have no usable docstring and won't be documented automatically. - if (inspect.ismodule(py_object) or - inspect.isclass(py_object) or - inspect.isfunction(py_object) or - inspect.isroutine(py_object) or - inspect.ismethod(py_object) or - isinstance(py_object, property)): + if (py_object is not None and + not isinstance(py_object, six.integer_types + six.string_types + + (six.binary_type, six.text_type, float, complex, bool)) + and py_object is not ()): object_id = id(py_object) if object_id in reverse_index: master_name = reverse_index[object_id] @@ -148,4 +221,9 @@ class DocGeneratorVisitor(object): if name != master_name: duplicate_of[name] = master_name - return duplicate_of, duplicates + # Set the reverse index to the canonical name. 
+ reverse_index[id(self._index[master_name])] = master_name + + self._duplicate_of = duplicate_of + self._duplicates = duplicates + self._reverse_index = reverse_index diff --git a/tensorflow/tools/docs/doc_generator_visitor_test.py b/tensorflow/tools/docs/doc_generator_visitor_test.py index bbaa1c6474c..cf5be45f40e 100644 --- a/tensorflow/tools/docs/doc_generator_visitor_test.py +++ b/tensorflow/tools/docs/doc_generator_visitor_test.py @@ -75,8 +75,6 @@ class DocGeneratorVisitorTest(googletest.TestCase): [('index', doc_generator_visitor.DocGeneratorVisitor.index), ('index2', doc_generator_visitor.DocGeneratorVisitor.index)]) - duplicate_of, duplicates = visitor.find_duplicates() - # The shorter path should be master, or if equal, the lexicographically # first will be. self.assertEqual( @@ -91,7 +89,7 @@ class DocGeneratorVisitorTest(googletest.TestCase): 'DocGeneratorVisitor2.index', 'DocGeneratorVisitor2.index2' ]), - }, duplicates) + }, visitor.duplicates) self.assertEqual({ 'submodule.DocGeneratorVisitor': 'DocGeneratorVisitor2', 'submodule.DocGeneratorVisitor.index': 'DocGeneratorVisitor2.index', @@ -100,8 +98,12 @@ class DocGeneratorVisitorTest(googletest.TestCase): 'submodule2.DocGeneratorVisitor.index': 'DocGeneratorVisitor2.index', 'submodule2.DocGeneratorVisitor.index2': 'DocGeneratorVisitor2.index', 'DocGeneratorVisitor2.index2': 'DocGeneratorVisitor2.index' - }, duplicate_of) - + }, visitor.duplicate_of) + self.assertEqual({ + id(doc_generator_visitor.DocGeneratorVisitor): 'DocGeneratorVisitor2', + id(doc_generator_visitor.DocGeneratorVisitor.index): + 'DocGeneratorVisitor2.index', + }, visitor.reverse_index) if __name__ == '__main__': googletest.main() diff --git a/tensorflow/tools/docs/gen_cc_md.py b/tensorflow/tools/docs/gen_cc_md.py deleted file mode 100644 index 931df3230b4..00000000000 --- a/tensorflow/tools/docs/gen_cc_md.py +++ /dev/null @@ -1,314 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Convert Doxygen .xml files to MarkDown (.md files).""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import argparse -import os -import re - -from BeautifulSoup import BeautifulStoneSoup -import tensorflow as tf - -ANCHOR_RE = re.compile(r'\W+') - -PAGE_TEMPLATE = '''# `{0} {1}` - -{2} - -###Member Details - -{3}''' - -INDEX_TEMPLATE = '''# TensorFlow C++ Session API reference documentation - -TensorFlow's public C++ API includes only the API for executing graphs, as of -version 0.5. To control the execution of a graph from C++: - -1. Build the computation graph using the [Python API](../python/). -1. Use [`tf.train.write_graph()`](../python/train.md#write_graph) to -write the graph to a file. -1. Load the graph using the C++ Session API. 
For example: - - ```c++ - // Reads a model graph definition from disk, and creates a session object you - // can use to run it. - Status LoadGraph(string graph_file_name, Session** session) { - GraphDef graph_def; - TF_RETURN_IF_ERROR( - ReadBinaryProto(Env::Default(), graph_file_name, &graph_def)); - TF_RETURN_IF_ERROR(NewSession(SessionOptions(), session)); - TF_RETURN_IF_ERROR((*session)->Create(graph_def)); - return Status::OK(); - } -``` - -1. Run the graph with a call to `session->Run()` - -## Env - -@@Env -@@RandomAccessFile -@@WritableFile -@@EnvWrapper - -## Session - -@@Session -@@SessionOptions - -## Status - -@@Status -@@Status::State - -## Tensor - -@@Tensor -@@TensorShape -@@TensorShapeDim -@@TensorShapeUtils -@@PartialTensorShape -@@PartialTensorShapeUtils - -## Thread - -@@Thread -@@ThreadOptions -''' - -FLAGS = None - - -def member_definition(member_elt): - def_text = '' - - def_elt = member_elt.find('definition') - if def_elt: - def_text = def_elt.text - - return def_text - - -def member_sig(member_elt): - def_text = member_definition(member_elt) - - argstring_text = '' - argstring = member_elt.find('argsstring') - if argstring: - argstring_text = argstring.text - - sig = def_text + argstring_text - return sig - - -def anchorize(name): - return ANCHOR_RE.sub('_', name) - - -def element_text(member_elt, elt_name): - """Extract all `para` text from (`elt_name` in) `member_elt`.""" - text = [] - if elt_name: - elt = member_elt.find(elt_name) - else: - elt = member_elt - - if elt: - paras = elt.findAll('para') - for p in paras: - text.append(p.getText(separator=u' ').strip()) - return '\n\n'.join(text) - - -def full_member_entry(member_elt): - """Generate the description of `member_elt` for "Member Details".""" - anchor = '{#' + anchorize(member_definition(member_elt)) + '}' - full_entry = '#### `%s` %s' % (member_sig(member_elt), anchor) - - complete_descr = element_text(member_elt, 'briefdescription') + '\n\n' - complete_descr += element_text(member_elt, 'detaileddescription') - - if complete_descr: - full_entry += '\n\n' + complete_descr - - return full_entry - - -def brief_member_entry(member_elt): - """Generate the description of `member_elt` for the "Member Summary".""" - brief_item = '' - brief_descr = element_text(member_elt, 'briefdescription') - if brief_descr: - brief_item = '\n * ' + brief_descr - sig = member_sig(member_elt) - memdef = member_definition(member_elt) - linkified_sig = '[`{0}`](#{1})'.format(sig, anchorize(memdef)) - - return '* ' + linkified_sig + brief_item - - -def all_briefs(members): - briefs = [brief_member_entry(member_elt) for member_elt in members] - return '\n'.join(briefs) - - -def all_fulls(members): - fulls = [full_member_entry(member_elt) for member_elt in members] - return '\n\n'.join(fulls) - - -def page_overview(class_elt): - """Returns the contents of the .md file for `class_elt`.""" - overview_brief = '' - overview_details = '' - - briefs = class_elt.findAll('briefdescription', recursive=False) - if briefs: - overview_brief = element_text(briefs[0], None) - - details = class_elt.findAll('detaileddescription', recursive=False) - if details: - overview_details = element_text(details[0], None) - - return overview_brief + '\n\n' + overview_details - - -def page_with_name(pages, name): - def match(n): - for i in xrange(len(pages)): - if pages[i].get_name() == n: - return i - return None - return match(name) or match('tensorflow::' + name) - - -def get_all_indexed_pages(): - all_pages = set() - lines = INDEX_TEMPLATE.split('\n') - for i 
in range(len(lines)): - if lines[i].startswith('@@'): - name = lines[i][2:] - all_pages.add(name) - return all_pages - - -def index_page(pages): - """Create the index page linking to `pages` using INDEX_TEMPLATE.""" - pages = pages[:] - lines = INDEX_TEMPLATE.split('\n') - all_md_files = [] - for i in range(len(lines)): - if lines[i].startswith('@@'): - name = lines[i][2:] - page_index = page_with_name(pages, name) - if page_index is None: - raise ValueError('Missing page with name: ' + name) - lines[i] = '* [{0}]({1})'.format( - pages[page_index].get_name(), pages[page_index].get_md_filename()) - all_md_files.append(pages[page_index].get_md_filename()) - pages.pop(page_index) - - return '\n'.join(lines) - - -def page_in_name_list(page, names): - for name in names: - if page.get_name() == name or page.get_name() == 'tensorflow::' + name: - return True - return False - - -class Page(object): - """Holds the MarkDown converted contents of a .xml page.""" - - def __init__(self, xml_path, deftype): - self.type = deftype - xml_file = open(xml_path) - xml = xml_file.read() - xml = xml.replace('', '`').replace('', '`') - # TODO(josh11b): Should not use HTML entities inside ```...```. - soup = BeautifulStoneSoup( - xml, convertEntities=BeautifulStoneSoup.HTML_ENTITIES) - self.name = soup.find('compoundname').text - print('Making page with name ' + self.name + ' (from ' + xml_path + ')') - members = soup('memberdef', prot='public') - fulls = all_fulls(members) - self.overview = page_overview(soup.find('compounddef')) - self.page_text = PAGE_TEMPLATE.format( - self.type, self.name, self.overview, fulls) - - def get_text(self): - return self.page_text - - def get_name(self): - return self.name - - def get_short_name(self): - parse = self.get_name().split('::') - return parse[len(parse)-1] - - def get_type(self): - return self.type - - def get_md_filename(self): - capitalized_type = self.get_type()[0].upper() + self.get_type()[1:] - return capitalized_type + anchorize(self.get_short_name()) + '.md' - - -def main(unused_argv): - print('Converting in ' + FLAGS.src_dir) - pages = [] - all_pages = get_all_indexed_pages() - xml_files = os.listdir(FLAGS.src_dir) - for fname in xml_files: - if len(fname) < 6: continue - newpage = None - if fname[0:5] == 'class': - newpage = Page(os.path.join(FLAGS.src_dir, fname), 'class') - elif fname[0:6] == 'struct': - newpage = Page(os.path.join(FLAGS.src_dir, fname), 'struct') - if newpage is not None and page_in_name_list(newpage, all_pages): - pages.append(newpage) - md_filename = newpage.get_md_filename() - print('Writing ' + md_filename) - md_file = open(os.path.join(FLAGS.out_dir, md_filename), 'w') - print(newpage.get_text(), file=md_file) - - index_text = index_page(pages) - index_md_file = open(os.path.join(FLAGS.out_dir, 'index.md'), 'w') - print(index_text, file=index_md_file) - return 0 - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument( - '--src_dir', - type=str, - default=None, - help='Directory containing the doxygen output.' - ) - parser.add_argument( - '--out_dir', - type=str, - default=None, - help='Directory to which docs should be written.' - ) - FLAGS = parser.parse_args() - - tf.app.run() diff --git a/tensorflow/tools/docs/gen_docs.sh b/tensorflow/tools/docs/gen_docs.sh deleted file mode 100755 index 4f529270ab4..00000000000 --- a/tensorflow/tools/docs/gen_docs.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -# This script needs to be run from the tensorflow/tools/docs directory -# Pass -a to also rebuild C++ docs. This requires doxygen. - -set -e - -DOC_DIR="g3doc/api_docs" -DOXYGEN_BIN=${DOXYGEN:-doxygen} -DOXYGEN_CONFIG="tools/docs/tf-doxy_for_md-config" -# The TMP_DIR is set inside DOXYGEN_CONFIG and cannot be changed independently -TMP_DIR=/tmp/tensorflow-docs/xml - -if [ ! -f gen_docs.sh ]; then - echo "This script must be run from inside the tensorflow/tools/docs directory." - exit 1 -fi - -# go to the tensorflow/ directory -pushd ../.. -BASE=$(pwd) - -# Make Python docs -bazel run -- //tensorflow/python:gen_docs_combined \ - --out_dir=$BASE/$DOC_DIR/python - -# Check if we should build c++ docs (if -a is given) -if [ x$1 == x-a ]; then - mkdir -p $TMP_DIR - $DOXYGEN_BIN "$BASE/$DOXYGEN_CONFIG" - bazel run -- //tensorflow/tools/docs:gen_cc_md \ - --out_dir=$BASE/$DOC_DIR/cc \ - --src_dir=$TMP_DIR -fi - -popd diff --git a/tensorflow/tools/docs/gen_docs_test.sh b/tensorflow/tools/docs/gen_docs_test.sh deleted file mode 100755 index c8c1955aa06..00000000000 --- a/tensorflow/tools/docs/gen_docs_test.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -set -eux - -if [ -d $TEST_SRCDIR/org_tensorflow ]; then - TFDIR=$TEST_SRCDIR/org_tensorflow/tensorflow -else - # Support 0.2.1- runfiles. 
- TFDIR=$TEST_SRCDIR/tensorflow -fi -DOXYGEN=doxygen -DOXYGEN_CONFIG="tf-doxy_for_md-config" -TMP_DIR=/tmp/tensorflow-docs -mkdir -p $TMP_DIR/python -mkdir -p $TMP_DIR/xml -mkdir -p $TMP_DIR/cc - -pushd $TFDIR -python/gen_docs_combined --out_dir=$TMP_DIR/python - -# TODO(wicke): this does not work well inside the build/test jail -#$DOXYGEN "tools/docs/$DOXYGEN_CONFIG" -#tools/docs/gen_cc_md \ -# --out_dir=$TMP_DIR/cc \ -# --src_dir=$TMP_DIR/xml -popd -echo "PASS" diff --git a/tensorflow/tools/docs/generate.py b/tensorflow/tools/docs/generate.py index 8f2958d6a6a..fc93085e3e0 100644 --- a/tensorflow/tools/docs/generate.py +++ b/tensorflow/tools/docs/generate.py @@ -18,216 +18,32 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import argparse -import inspect import os +import sys -import six import tensorflow as tf -from tensorflow.tools.common import public_api -from tensorflow.tools.common import traverse -from tensorflow.tools.docs import doc_generator_visitor -from tensorflow.tools.docs import parser - - -def write_docs(output_dir, base_dir, duplicate_of, duplicates, index, tree): - """Write previously extracted docs to disk. - - Write a docs page for each symbol in `index` to a tree of docs at - `output_dir`. - - Symbols with multiple aliases will have only one page written about them, - which is referenced for all aliases. `duplicate_of` and `duplicates` are used - to determine which docs pages to write. - - Args: - output_dir: Directory to write documentation markdown files to. Will be - created if it doesn't exist. - base_dir: Base directory of the code being documented. This prefix is - stripped from all file paths that are part of the documentation. - duplicate_of: A `dict` mapping fully qualified names to "master" names. This - is used to resolve "@{symbol}" references to the "master" name. - duplicates: A `dict` mapping fully qualified names to a set of all - aliases of this name. This is used to automatically generate a list of all - aliases for each name. - index: A `dict` mapping fully qualified names to the corresponding Python - objects. Used to produce docs for child objects, and to check the validity - of "@{symbol}" references. - tree: A `dict` mapping a fully qualified name to the names of all its - members. Used to populate the members section of a class or module page. - """ - # Make output_dir. - try: - if not os.path.exists(output_dir): - os.makedirs(output_dir) - except OSError as e: - print('Creating output dir "%s" failed: %s' % (output_dir, e)) - raise - - # Parse and write Markdown pages, resolving cross-links (@{symbol}). - for full_name, py_object in six.iteritems(index): - - if full_name in duplicate_of: - print('Not writing docs for %s, duplicate of %s.' % ( - full_name, duplicate_of[full_name])) - continue - - # Methods and some routines are documented only as part of their class. - if not (inspect.ismodule(py_object) or - inspect.isclass(py_object) or - inspect.isfunction(py_object)): - print('Not writing docs for %s, not a class, module, or function.' % ( - full_name)) - continue - - print('Writing docs for %s (%r).' % (full_name, py_object)) - - # Generate docs for `py_object`, resolving references. - markdown = parser.generate_markdown(full_name, py_object, - duplicate_of=duplicate_of, - duplicates=duplicates, - index=index, - tree=tree, - base_dir=base_dir) - - # TODO(deannarubin): use _tree to generate sidebar information. 
- - path = os.path.join(output_dir, parser.documentation_path(full_name)) - directory = os.path.dirname(path) - try: - if not os.path.exists(directory): - os.makedirs(directory) - with open(path, 'w') as f: - f.write(markdown) - except OSError as e: - print('Cannot write documentation for %s to %s: %s' % (full_name, - directory, e)) - raise - # TODO(deannarubin): write sidebar file? - - # Write a global index containing all full names with links. - with open(os.path.join(output_dir, 'full_index.md'), 'w') as f: - f.write(parser.generate_global_index('TensorFlow', 'tensorflow', - index, duplicate_of)) - - -def extract(): - """Extract docs from tf namespace and write them to disk.""" - visitor = doc_generator_visitor.DocGeneratorVisitor() - api_visitor = public_api.PublicAPIVisitor(visitor) - - # Access something in contrib so tf.contrib is properly loaded (it's hidden - # behind lazy loading) - _ = tf.contrib.__name__ - - # Exclude some libaries in contrib from the documentation altogether. - # TODO(wicke): Shrink this list. - api_visitor.do_not_descend_map.update({ - 'contrib': [ - 'compiler', - 'factorization', - 'grid_rnn', - 'labeled_tensor', - 'ndlstm', - 'quantization', - 'session_bundle', - 'slim', - 'solvers', - 'specs', - 'tensor_forest', - 'tensorboard', - 'testing', - 'tfprof', - 'training', - ], - 'contrib.bayesflow': [ - 'entropy', 'monte_carlo', - 'special_math', 'stochastic_gradient_estimators', - 'stochastic_graph', 'stochastic_tensor', - 'stochastic_variables', 'variational_inference' - ], - 'contrib.distributions': ['bijector'], - 'contrib.graph_editor': [ - 'edit', - 'match', - 'reroute', - 'subgraph', - 'transform', - 'select', - 'util' - ], - 'contrib.layers': [ - 'feature_column', - 'summaries' - ], - 'contrib.learn': [ - 'datasets', - 'head', - 'graph_actions', - 'io', - 'models', - 'monitors', - 'ops', - 'preprocessing', - 'utils', - ], - 'contrib.util': ['loader'], - }) - - traverse.traverse(tf, api_visitor) - - return visitor - - -def write(output_dir, base_dir, visitor): - """Write documentation for an index in a `DocGeneratorVisitor` to disk. - - This function will create `output_dir` if it doesn't exist, and write - the documentation contained in `visitor`. - - Args: - output_dir: The directory to write documentation to. Must not exist. - base_dir: The base dir of the library `visitor` has traversed. This is used - to compute relative paths for file references. - visitor: A `DocGeneratorVisitor` that has traversed a library located at - `base_dir`. - """ - duplicate_of, duplicates = visitor.find_duplicates() - write_docs(output_dir, os.path.abspath(base_dir), - duplicate_of, duplicates, visitor.index, visitor.tree) - +from tensorflow.python import debug as tf_debug +from tensorflow.python.util import tf_inspect +from tensorflow.tools.docs import generate_lib if __name__ == '__main__': - argument_parser = argparse.ArgumentParser() - argument_parser.add_argument( - '--output_dir', - type=str, - default=None, - required=True, - help='Directory to write docs to. Must not exist.' - ) + doc_generator = generate_lib.DocGenerator() + doc_generator.add_output_dir_argument() + doc_generator.add_src_dir_argument() # This doc generator works on the TensorFlow codebase. Since this script lives - # at tensorflow/tools/docs, we can compute the base directory (three levels - # up), which is valid unless we're trying to apply this to a different code - # base, or are moving the script around. 
- script_dir = os.path.dirname(inspect.getfile(inspect.currentframe())) - default_base_dir = os.path.join(script_dir, '..', '..', '..') + # at tensorflow/tools/docs, and all code is defined somewhere inside + # tensorflow/, we can compute the base directory (two levels up), which is + # valid unless we're trying to apply this to a different code base, or are + # moving the script around. + script_dir = os.path.dirname(tf_inspect.getfile(tf_inspect.currentframe())) + default_base_dir = os.path.join(script_dir, '..', '..') + doc_generator.add_base_dir_argument(default_base_dir) - argument_parser.add_argument( - '--base_dir', - type=str, - default=default_base_dir, - help=('Base directory to to strip from file names referenced in docs. ' - 'Defaults to three directories up from the location of this file.') - ) + flags = doc_generator.parse_known_args() - flags, _ = argument_parser.parse_known_args() + # tf_debug is not imported with tf, it's a separate module altogether + doc_generator.set_py_modules([('tf', tf), ('tfdbg', tf_debug)]) - if os.path.exists(flags.output_dir): - raise RuntimeError('output_dir %s exists.\n' - 'Cowardly refusing to wipe it, please do that yourself.' - % flags.output_dir) - - write(flags.output_dir, flags.base_dir, extract()) + sys.exit(doc_generator.build(flags)) diff --git a/tensorflow/tools/docs/generate_1_0.py b/tensorflow/tools/docs/generate_1_0.py new file mode 100644 index 00000000000..cdc03fdcacf --- /dev/null +++ b/tensorflow/tools/docs/generate_1_0.py @@ -0,0 +1,93 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Generate docs for the TensorFlow Python API.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys + +import tensorflow as tf + +from tensorflow.python import debug as tf_debug +from tensorflow.python.util import tf_inspect +from tensorflow.tools.docs import generate_lib + +if __name__ == '__main__': + doc_generator = generate_lib.DocGenerator() + doc_generator.add_output_dir_argument() + doc_generator.add_src_dir_argument() + + # This doc generator works on the TensorFlow codebase. Since this script lives + # at tensorflow/tools/docs, and all code is defined somewhere inside + # tensorflow/, we can compute the base directory (two levels up), which is + # valid unless we're trying to apply this to a different code base, or are + # moving the script around. 
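+ # For instance (paths illustrative): with this file at
+ # <repo>/tensorflow/tools/docs/generate_1_0.py, script_dir below is
+ # <repo>/tensorflow/tools/docs, so default_base_dir resolves to
+ # <repo>/tensorflow, two levels up.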
+ script_dir = os.path.dirname(tf_inspect.getfile(tf_inspect.currentframe())) + default_base_dir = os.path.join(script_dir, '..', '..') + doc_generator.add_base_dir_argument(default_base_dir) + + flags = doc_generator.parse_known_args() + + # tf_debug is not imported with tf, it's a separate module altogether + doc_generator.set_py_modules([('tf', tf), ('tfdbg', tf_debug)]) + + doc_generator.set_do_not_descend_map({ + 'tf': ['cli', 'lib', 'wrappers'], + 'tf.contrib': [ + 'compiler', + 'factorization', + 'grid_rnn', + 'labeled_tensor', + 'ndlstm', + 'quantization', + 'session_bundle', + 'slim', + 'solvers', + 'specs', + 'tensor_forest', + 'tensorboard', + 'testing', + 'training', + 'tfprof', + ], + 'tf.contrib.bayesflow': [ + 'entropy', 'monte_carlo', 'special_math', + 'stochastic_gradient_estimators', 'stochastic_graph', + 'stochastic_tensor', 'stochastic_variables', 'variational_inference' + ], + 'tf.contrib.distributions': ['bijector'], + 'tf.contrib.ffmpeg': ['ffmpeg_ops'], + 'tf.contrib.graph_editor': [ + 'edit', 'match', 'reroute', 'subgraph', 'transform', 'select', 'util' + ], + 'tf.contrib.layers': ['feature_column', 'summaries'], + 'tf.contrib.learn': [ + 'datasets', + 'head', + 'graph_actions', + 'io', + 'models', + 'monitors', + 'ops', + 'preprocessing', + 'utils', + ], + 'tf.contrib.util': ['loader'], + }) + + sys.exit(doc_generator.build(flags)) diff --git a/tensorflow/tools/docs/generate_lib.py b/tensorflow/tools/docs/generate_lib.py new file mode 100644 index 00000000000..99872e1d844 --- /dev/null +++ b/tensorflow/tools/docs/generate_lib.py @@ -0,0 +1,511 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Generate docs for the TensorFlow Python API.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import os + +import six + +from tensorflow.python.util import tf_inspect +from tensorflow.tools.common import public_api +from tensorflow.tools.common import traverse +from tensorflow.tools.docs import doc_generator_visitor +from tensorflow.tools.docs import parser +from tensorflow.tools.docs import pretty_docs +from tensorflow.tools.docs import py_guide_parser + + +def _is_free_function(py_object, full_name, index): + """Check if input is a free function (and not a class- or static method).""" + if not tf_inspect.isfunction(py_object): + return False + + # Static methods are functions to tf_inspect (in 2.7), so check if the parent + # is a class. If there is no parent, it's not a function. + if '.' not in full_name: + return False + + parent_name = full_name.rsplit('.', 1)[0] + if tf_inspect.isclass(index[parent_name]): + return False + + return True + + +def write_docs(output_dir, parser_config, yaml_toc): + """Write previously extracted docs to disk. 
+ + Write a docs page for each symbol included in the indices of parser_config to + a tree of docs at `output_dir`. + + Symbols with multiple aliases will have only one page written about + them, which is referenced for all aliases. + + Args: + output_dir: Directory to write documentation markdown files to. Will be + created if it doesn't exist. + parser_config: A `parser.ParserConfig` object, containing all the necessary + indices. + yaml_toc: Set to `True` to generate a "_toc.yaml" file. + + Raises: + ValueError: if `output_dir` is not an absolute path + """ + # Make output_dir. + if not os.path.isabs(output_dir): + raise ValueError( + "'output_dir' must be an absolute path.\n" + " output_dir='%s'" % output_dir) + + try: + if not os.path.exists(output_dir): + os.makedirs(output_dir) + except OSError as e: + print('Creating output dir "%s" failed: %s' % (output_dir, e)) + raise + + # These dictionaries are used for table-of-contents generation below + # They will contain, after the for-loop below:: + # - module name(string):classes and functions the module contains(list) + module_children = {} + # - symbol name(string):pathname (string) + symbol_to_file = {} + + # Parse and write Markdown pages, resolving cross-links (@{symbol}). + for full_name, py_object in six.iteritems(parser_config.index): + + if full_name in parser_config.duplicate_of: + continue + + # Methods and some routines are documented only as part of their class. + if not (tf_inspect.ismodule(py_object) or tf_inspect.isclass(py_object) or + _is_free_function(py_object, full_name, parser_config.index)): + continue + + sitepath = os.path.join('api_docs/python', + parser.documentation_path(full_name)[:-3]) + + # For TOC, we need to store a mapping from full_name to the file + # we're generating + symbol_to_file[full_name] = sitepath + + # For a module, remember the module for the table-of-contents + if tf_inspect.ismodule(py_object): + if full_name in parser_config.tree: + module_children.setdefault(full_name, []) + + # For something else that's documented, + # figure out what module it lives in + else: + subname = str(full_name) + while True: + subname = subname[:subname.rindex('.')] + if tf_inspect.ismodule(parser_config.index[subname]): + module_children.setdefault(subname, []).append(full_name) + break + + print('Writing docs for %s (%r).' % (full_name, py_object)) + + # Generate docs for `py_object`, resolving references. 
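+ # For example, full_name 'tf.TestModule.TestClass' would yield a page_info
+ # whose markdown (rendered below by pretty_docs.build_md_page) lands at
+ # tf/TestModule/TestClass.md under output_dir; the name is illustrative,
+ # borrowed from the accompanying test.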
+ page_info = parser.docs_for_object(full_name, py_object, parser_config) + + path = os.path.join(output_dir, parser.documentation_path(full_name)) + directory = os.path.dirname(path) + try: + if not os.path.exists(directory): + os.makedirs(directory) + with open(path, 'w') as f: + f.write(pretty_docs.build_md_page(page_info)) + except OSError as e: + print('Cannot write documentation for %s to %s: %s' % (full_name, + directory, e)) + raise + + if yaml_toc: + # Generate table of contents + + # Put modules in alphabetical order, case-insensitive + modules = sorted(module_children.keys(), key=lambda a: a.upper()) + + leftnav_path = os.path.join(output_dir, '_toc.yaml') + with open(leftnav_path, 'w') as f: + + # Generate header + f.write('# Automatically generated file; please do not edit\ntoc:\n') + for module in modules: + f.write(' - title: ' + module + '\n' + ' section:\n' + ' - title: Overview\n' + + ' path: /TARGET_DOC_ROOT/VERSION/' + symbol_to_file[module] + + '\n') + + symbols_in_module = module_children.get(module, []) + # Sort case-insensitive, if equal sort case sensitive (upper first) + symbols_in_module.sort(key=lambda a: (a.upper(), a)) + + for full_name in symbols_in_module: + f.write(' - title: ' + full_name[len(module) + 1:] + '\n' + ' path: /TARGET_DOC_ROOT/VERSION/' + + symbol_to_file[full_name] + '\n') + + # Write a global index containing all full names with links. + with open(os.path.join(output_dir, 'index.md'), 'w') as f: + f.write( + parser.generate_global_index('TensorFlow', parser_config.index, + parser_config.reference_resolver)) + + +def add_dict_to_dict(add_from, add_to): + for key in add_from: + if key in add_to: + add_to[key].extend(add_from[key]) + else: + add_to[key] = add_from[key] + + +# Exclude some libaries in contrib from the documentation altogether. +def _get_default_private_map(): + return {} + + +# Exclude members of some libaries. +def _get_default_do_not_descend_map(): + # TODO(wicke): Shrink this list once the modules get sealed. + return { + 'tf': ['cli', 'lib', 'wrappers'], + 'tf.contrib': [ + 'compiler', + 'factorization', + 'grid_rnn', + 'labeled_tensor', + 'ndlstm', + 'quantization', + 'session_bundle', + 'slim', + 'solvers', + 'specs', + 'tensor_forest', + 'tensorboard', + 'testing', + 'tfprof', + ], + 'tf.contrib.bayesflow': [ + 'special_math', 'stochastic_gradient_estimators', + 'stochastic_variables' + ], + 'tf.contrib.ffmpeg': ['ffmpeg_ops'], + 'tf.contrib.graph_editor': [ + 'edit', 'match', 'reroute', 'subgraph', 'transform', 'select', 'util' + ], + 'tf.contrib.keras': ['api', 'python'], + 'tf.contrib.layers': ['feature_column', 'summaries'], + 'tf.contrib.learn': [ + 'datasets', + 'head', + 'graph_actions', + 'io', + 'models', + 'monitors', + 'ops', + 'preprocessing', + 'utils', + ], + 'tf.contrib.util': ['loader'], + } + + +def extract(py_modules, private_map, do_not_descend_map): + """Extract docs from tf namespace and write them to disk.""" + # Traverse the first module. 
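+ # E.g., with py_modules == [('tf', tf), ('tfdbg', tf_debug)], the visitor
+ # is rooted at 'tf' first; the loop below re-roots it at 'tfdbg' so that
+ # module's symbols are indexed under 'tfdbg.*'.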
+ visitor = doc_generator_visitor.DocGeneratorVisitor(py_modules[0][0]) + api_visitor = public_api.PublicAPIVisitor(visitor) + api_visitor.set_root_name(py_modules[0][0]) + add_dict_to_dict(private_map, api_visitor.private_map) + add_dict_to_dict(do_not_descend_map, api_visitor.do_not_descend_map) + + traverse.traverse(py_modules[0][1], api_visitor) + + # Traverse all py_modules after the first: + for module_name, module in py_modules[1:]: + visitor.set_root_name(module_name) + api_visitor.set_root_name(module_name) + traverse.traverse(module, api_visitor) + + return visitor + + +class _GetMarkdownTitle(py_guide_parser.PyGuideParser): + """Extract the title from a .md file.""" + + def __init__(self): + self.title = None + py_guide_parser.PyGuideParser.__init__(self) + + def process_title(self, _, title): + if self.title is None: # only use the first title + self.title = title + + +class _DocInfo(object): + """A simple struct for holding a doc's url and title.""" + + def __init__(self, url, title): + self.url = url + self.title = title + + +def build_doc_index(src_dir): + """Build an index from a keyword designating a doc to _DocInfo objects.""" + doc_index = {} + if not os.path.isabs(src_dir): + raise ValueError("'src_dir' must be an absolute path.\n" + " src_dir='%s'" % src_dir) + + if not os.path.exists(src_dir): + raise ValueError("'src_dir' path must exist.\n" + " src_dir='%s'" % src_dir) + + for dirpath, _, filenames in os.walk(src_dir): + suffix = os.path.relpath(path=dirpath, start=src_dir) + for base_name in filenames: + if not base_name.endswith('.md'): + continue + title_parser = _GetMarkdownTitle() + title_parser.process(os.path.join(dirpath, base_name)) + key_parts = os.path.join(suffix, base_name[:-3]).split('/') + if key_parts[-1] == 'index': + key_parts = key_parts[:-1] + doc_info = _DocInfo(os.path.join(suffix, base_name), title_parser.title) + doc_index[key_parts[-1]] = doc_info + if len(key_parts) > 1: + doc_index['/'.join(key_parts[-2:])] = doc_info + + return doc_index + + +class _GuideRef(object): + + def __init__(self, base_name, title, section_title, section_tag): + self.url = 'api_guides/python/' + (('%s#%s' % (base_name, section_tag)) + if section_tag else base_name) + self.link_text = (('%s > %s' % (title, section_title)) + if section_title else title) + + def make_md_link(self, url_prefix): + return '[%s](%s%s)' % (self.link_text, url_prefix, self.url) + + +class _GenerateGuideIndex(py_guide_parser.PyGuideParser): + """Turn guide files into an index from symbol name to a list of _GuideRefs.""" + + def __init__(self): + self.index = {} + py_guide_parser.PyGuideParser.__init__(self) + + def process(self, full_path, base_name): + """Index a file, reading from `full_path`, with `base_name` as the link.""" + self.full_path = full_path + self.base_name = base_name + self.title = None + self.section_title = None + self.section_tag = None + py_guide_parser.PyGuideParser.process(self, full_path) + + def process_title(self, _, title): + if self.title is None: # only use the first title + self.title = title + + def process_section(self, _, section_title, tag): + self.section_title = section_title + self.section_tag = tag + + def process_line(self, _, line): + """Index @{symbol} references as in the current file & section.""" + for match in parser.SYMBOL_REFERENCE_RE.finditer(line): + val = self.index.get(match.group(1), []) + val.append( + _GuideRef(self.base_name, self.title, self.section_title, + self.section_tag)) + self.index[match.group(1)] = val + + +def 
_build_guide_index(guide_src_dir): + """Return dict: symbol name -> _GuideRef from the files in `guide_src_dir`.""" + index_generator = _GenerateGuideIndex() + if os.path.exists(guide_src_dir): + for full_path, base_name in py_guide_parser.md_files_in_dir(guide_src_dir): + index_generator.process(full_path, base_name) + return index_generator.index + + +class _UpdateTags(py_guide_parser.PyGuideParser): + """Rewrites a Python guide so that each section has an explicit tag.""" + + def process_section(self, line_number, section_title, tag): + self.replace_line(line_number, '
<h2 id="%s">%s</h2>
' % (tag, section_title)) + + +EXCLUDED = set(['__init__.py', 'OWNERS', 'README.txt']) + + +def _other_docs(src_dir, output_dir, reference_resolver): + """Convert all the files in `src_dir` and write results to `output_dir`.""" + header = '\n' + + # Iterate through all the source files and process them. + tag_updater = _UpdateTags() + for dirpath, _, filenames in os.walk(src_dir): + # How to get from `dirpath` to api_docs/python/ + relative_path_to_root = os.path.relpath( + path=os.path.join(src_dir, 'api_docs/python'), start=dirpath) + + # Make the directory under output_dir. + new_dir = os.path.join(output_dir, + os.path.relpath(path=dirpath, start=src_dir)) + try: + if not os.path.exists(new_dir): + os.makedirs(new_dir) + except OSError as e: + print('Creating output dir "%s" failed: %s' % (new_dir, e)) + raise + + for base_name in filenames: + if base_name in EXCLUDED: + print('Skipping excluded file %s...' % base_name) + continue + full_in_path = os.path.join(dirpath, base_name) + suffix = os.path.relpath(path=full_in_path, start=src_dir) + full_out_path = os.path.join(output_dir, suffix) + if not base_name.endswith('.md'): + print('Copying non-md file %s...' % suffix) + open(full_out_path, 'w').write(open(full_in_path).read()) + continue + if dirpath.endswith('/api_guides/python'): + print('Processing Python guide %s...' % base_name) + md_string = tag_updater.process(full_in_path) + else: + print('Processing doc %s...' % suffix) + md_string = open(full_in_path).read() + + output = reference_resolver.replace_references(md_string, + relative_path_to_root) + with open(full_out_path, 'w') as f: + f.write(header + output) + + print('Done.') + + +class DocGenerator(object): + """Main entry point for generating docs.""" + + def __init__(self): + self.argument_parser = argparse.ArgumentParser() + self._py_modules = None + self._private_map = _get_default_private_map() + self._do_not_descend_map = _get_default_do_not_descend_map() + self.yaml_toc = True + + def add_output_dir_argument(self): + self.argument_parser.add_argument( + '--output_dir', + type=str, + default=None, + required=True, + help='Directory to write docs to.') + + def add_src_dir_argument(self): + self.argument_parser.add_argument( + '--src_dir', + type=str, + default=None, + required=True, + help='Directory with the source docs.') + + def add_base_dir_argument(self, default_base_dir): + self.argument_parser.add_argument( + '--base_dir', + type=str, + default=default_base_dir, + help='Base directory to to strip from file names referenced in docs.') + + def parse_known_args(self): + flags, _ = self.argument_parser.parse_known_args() + return flags + + def add_to_private_map(self, d): + add_dict_to_dict(d, self._private_map) + + def add_to_do_not_descend_map(self, d): + add_dict_to_dict(d, self._do_not_descend_map) + + def set_private_map(self, d): + self._private_map = d + + def set_do_not_descend_map(self, d): + self._do_not_descend_map = d + + def set_py_modules(self, py_modules): + self._py_modules = py_modules + + def py_module_names(self): + if self._py_modules is None: + raise RuntimeError( + 'Must call set_py_modules() before running py_module_names().') + return [name for (name, _) in self._py_modules] + + def make_reference_resolver(self, visitor, doc_index): + return parser.ReferenceResolver.from_visitor( + visitor, doc_index, py_module_names=self.py_module_names()) + + def make_parser_config(self, visitor, reference_resolver, guide_index, + base_dir): + return parser.ParserConfig( + 
reference_resolver=reference_resolver, + duplicates=visitor.duplicates, + duplicate_of=visitor.duplicate_of, + tree=visitor.tree, + index=visitor.index, + reverse_index=visitor.reverse_index, + guide_index=guide_index, + base_dir=base_dir) + + def run_extraction(self): + return extract( + self._py_modules, self._private_map, self._do_not_descend_map) + + def build(self, flags): + """Actually build the docs.""" + doc_index = build_doc_index(flags.src_dir) + visitor = self.run_extraction() + reference_resolver = self.make_reference_resolver(visitor, doc_index) + + guide_index = _build_guide_index( + os.path.join(flags.src_dir, 'api_guides/python')) + + parser_config = self.make_parser_config(visitor, reference_resolver, + guide_index, flags.base_dir) + output_dir = os.path.join(flags.output_dir, 'api_docs/python') + + write_docs(output_dir, parser_config, yaml_toc=self.yaml_toc) + _other_docs(flags.src_dir, flags.output_dir, reference_resolver) + + if parser.all_errors: + print('Errors during processing:\n ' + '\n '.join(parser.all_errors)) + return 1 + return 0 diff --git a/tensorflow/tools/docs/generate_lib_test.py b/tensorflow/tools/docs/generate_lib_test.py new file mode 100644 index 00000000000..6e5deb6a36e --- /dev/null +++ b/tensorflow/tools/docs/generate_lib_test.py @@ -0,0 +1,151 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for doc generator traversal.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys + +import tensorflow as tf + +from tensorflow.python import debug as tf_debug +from tensorflow.python.platform import googletest +from tensorflow.tools.docs import generate_lib +from tensorflow.tools.docs import parser + + +def test_function(): + """Docstring for test_function.""" + pass + + +class TestClass(object): + """Docstring for TestClass itself.""" + + class ChildClass(object): + """Docstring for a child class.""" + + class GrandChildClass(object): + """Docstring for a child of a child class.""" + pass + + +class DummyVisitor(object): + + def __init__(self, index, duplicate_of): + self.index = index + self.duplicate_of = duplicate_of + + +class GenerateTest(googletest.TestCase): + + def test_extraction(self): + py_modules = [('tf', tf), ('tfdbg', tf_debug)] + + try: + generate_lib.extract(py_modules, + generate_lib._get_default_private_map(), + generate_lib._get_default_do_not_descend_map()) + except RuntimeError: + print('*****************************************************************') + print('If this test fails, you have most likely introduced an unsealed') + print('module. Make sure to use remove_undocumented or similar utilities') + print('to avoid leaking symbols. 
See below for more information on the') + print('failure.') + print('*****************************************************************') + raise + + def test_write(self): + module = sys.modules[__name__] + + index = { + 'tf': sys, # Can be any module, this test doesn't care about content. + 'tf.TestModule': module, + 'tf.test_function': test_function, + 'tf.TestModule.test_function': test_function, + 'tf.TestModule.TestClass': TestClass, + 'tf.TestModule.TestClass.ChildClass': TestClass.ChildClass, + 'tf.TestModule.TestClass.ChildClass.GrandChildClass': + TestClass.ChildClass.GrandChildClass, + } + + tree = { + 'tf': ['TestModule', 'test_function'], + 'tf.TestModule': ['test_function', 'TestClass'], + 'tf.TestModule.TestClass': ['ChildClass'], + 'tf.TestModule.TestClass.ChildClass': ['GrandChildClass'], + 'tf.TestModule.TestClass.ChildClass.GrandChildClass': [] + } + + duplicate_of = {'tf.test_function': 'tf.TestModule.test_function'} + + duplicates = { + 'tf.TestModule.test_function': [ + 'tf.test_function', 'tf.TestModule.test_function' + ] + } + + base_dir = os.path.dirname(__file__) + + visitor = DummyVisitor(index, duplicate_of) + + reference_resolver = parser.ReferenceResolver.from_visitor( + visitor=visitor, doc_index={}, py_module_names=['tf']) + + parser_config = parser.ParserConfig( + reference_resolver=reference_resolver, + duplicates=duplicates, + duplicate_of=duplicate_of, + tree=tree, + index=index, + reverse_index={}, + guide_index={}, + base_dir=base_dir) + + output_dir = googletest.GetTempDir() + + generate_lib.write_docs(output_dir, parser_config, yaml_toc=True) + + # Make sure that the right files are written to disk. + self.assertTrue(os.path.exists(os.path.join(output_dir, 'index.md'))) + self.assertTrue(os.path.exists(os.path.join(output_dir, 'tf.md'))) + self.assertTrue(os.path.exists(os.path.join(output_dir, '_toc.yaml'))) + self.assertTrue( + os.path.exists(os.path.join(output_dir, 'tf/TestModule.md'))) + self.assertFalse( + os.path.exists(os.path.join(output_dir, 'tf/test_function.md'))) + self.assertTrue( + os.path.exists( + os.path.join(output_dir, 'tf/TestModule/TestClass.md'))) + self.assertTrue( + os.path.exists( + os.path.join(output_dir, + 'tf/TestModule/TestClass/ChildClass.md'))) + self.assertTrue( + os.path.exists( + os.path.join( + output_dir, + 'tf/TestModule/TestClass/ChildClass/GrandChildClass.md'))) + # Make sure that duplicates are not written + self.assertTrue( + os.path.exists( + os.path.join(output_dir, 'tf/TestModule/test_function.md'))) + + +if __name__ == '__main__': + googletest.main() diff --git a/tensorflow/tools/docs/generate_test.py b/tensorflow/tools/docs/generate_test.py deleted file mode 100644 index 4594676109c..00000000000 --- a/tensorflow/tools/docs/generate_test.py +++ /dev/null @@ -1,114 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Tests for doc generator traversal.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import sys -import tempfile - -from tensorflow.python.platform import googletest -from tensorflow.tools.docs import generate - - -def test_function(): - """Docstring for test_function.""" - pass - - -class TestClass(object): - """Docstring for TestClass itself.""" - - class ChildClass(object): - """Docstring for a child class.""" - - class GrandChildClass(object): - """Docstring for a child of a child class.""" - pass - - -class GenerateTest(googletest.TestCase): - - def test_extraction(self): - try: - generate.extract() - except RuntimeError: - print('*****************************************************************') - print('If this test fails, you have most likely introduced an unsealed') - print('module. Make sure to use remove_undocumented or similar utilities') - print('to avoid leaking symbols. See below for more information on the') - print('failure.') - print('*****************************************************************') - raise - - def test_write(self): - module = sys.modules[__name__] - - index = { - '': sys, # Can be any module, this test doesn't care about content. - 'TestModule': module, - 'test_function': test_function, - 'TestModule.test_function': test_function, - 'TestModule.TestClass': TestClass, - 'TestModule.TestClass.ChildClass': TestClass.ChildClass, - 'TestModule.TestClass.ChildClass.GrandChildClass': - TestClass.ChildClass.GrandChildClass, - } - - tree = { - '': ['TestModule', 'test_function'], - 'TestModule': ['test_function', 'TestClass'], - 'TestModule.TestClass': ['ChildClass'], - 'TestModule.TestClass.ChildClass': ['GrandChildClass'], - 'TestModule.TestClass.ChildClass.GrandChildClass': [] - } - - duplicate_of = { - 'TestModule.test_function': 'test_function' - } - - duplicates = { - 'test_function': ['test_function', 'TestModule.test_function'] - } - - output_dir = tempfile.mkdtemp() - base_dir = os.path.dirname(__file__) - - generate.write_docs(output_dir, base_dir, - duplicate_of, duplicates, - index, tree) - - # Make sure that the right files are written to disk. - self.assertTrue(os.path.exists(os.path.join(output_dir, 'index.md'))) - self.assertTrue(os.path.exists(os.path.join(output_dir, 'full_index.md'))) - self.assertTrue(os.path.exists(os.path.join(output_dir, 'TestModule.md'))) - self.assertTrue(os.path.exists(os.path.join( - output_dir, 'test_function.md'))) - self.assertTrue(os.path.exists(os.path.join( - output_dir, 'TestModule/TestClass.md'))) - self.assertTrue(os.path.exists(os.path.join( - output_dir, 'TestModule/TestClass/ChildClass.md'))) - self.assertTrue(os.path.exists(os.path.join( - output_dir, 'TestModule/TestClass/ChildClass/GrandChildClass.md'))) - # Make sure that duplicates are not written - self.assertFalse(os.path.exists(os.path.join( - output_dir, 'TestModule/test_function.md'))) - - -if __name__ == '__main__': - googletest.main() diff --git a/tensorflow/tools/docs/make_py_guides.py b/tensorflow/tools/docs/make_py_guides.py deleted file mode 100644 index a5264f2f8dc..00000000000 --- a/tensorflow/tools/docs/make_py_guides.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -"""Convert @{symbol} to MarkDown links in the Python API guides.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import argparse -import os - -from tensorflow.tools.docs import generate -from tensorflow.tools.docs import parser - - -def _md_files_in_dir(input_dir): - all_in_dir = [(os.path.join(input_dir, f), f) for f in os.listdir(input_dir)] - return [(full, f) for full, f in all_in_dir - if os.path.isfile(full) and f.endswith('.md')] - - -def _main(input_dir, output_dir): - """Convert all the files in `input_dir` and write results to `output_dir`.""" - visitor = generate.extract() - duplicate_of, unused_duplicates = visitor.find_duplicates() - - # Make output_dir. - try: - if not os.path.exists(output_dir): - os.makedirs(output_dir) - except OSError as e: - print('Creating output dir "%s" failed: %s' % (output_dir, e)) - raise - - # How to get from api_guides/python/ to api_docs/python/ - relative_path_to_root = '../../api_docs/python/' - - # Iterate through all the source files and process them. - for full_path, base_name in _md_files_in_dir(input_dir): - print('Processing %s...' % base_name) - md_string = open(full_path).read() - output = parser.replace_references( - md_string, relative_path_to_root, duplicate_of) - open(os.path.join(output_dir, base_name), 'w').write(output) - print('Done.') - - -if __name__ == '__main__': - argument_parser = argparse.ArgumentParser() - argument_parser.add_argument( - '--input_dir', - type=str, - default=None, - required=True, - help='Directory to copy docs from.' - ) - argument_parser.add_argument( - '--output_dir', - type=str, - default=None, - required=True, - help='Directory to write docs to. Will be created, must not exist.' - ) - flags, _ = argument_parser.parse_known_args() - if os.path.exists(flags.output_dir): - raise RuntimeError('output_dir %s exists.\n' - 'Cowardly refusing to wipe it, please do that yourself.' - % flags.output_dir) - - _main(flags.input_dir, flags.output_dir) diff --git a/tensorflow/tools/docs/parser.py b/tensorflow/tools/docs/parser.py index 3a1a7cb82e9..7ae1d2abd9a 100644 --- a/tensorflow/tools/docs/parser.py +++ b/tensorflow/tools/docs/parser.py @@ -18,16 +18,31 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import ast +import collections import functools -import inspect +import json import os import re +import codegen import six +from google.protobuf.message import Message as ProtoMessage +from tensorflow.python.util import tf_inspect + + # A regular expression capturing a python indentifier. IDENTIFIER_RE = '[a-zA-Z_][a-zA-Z0-9_]*' +# Log of all reported errors +all_errors = [] + + +def log_error(s): + all_errors.append(s) + print('ERROR:', s) + def documentation_path(full_name): """Returns the file path for the documentation for the given API symbol. 
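The path mapping changed by the hunk below is simple enough to pin down with a standalone sketch (mirroring the `documentation_path` body in this diff; the sample symbol is illustrative):

```python
import os

def documentation_path(full_name):
  # Dots in the fully qualified name become directory separators,
  # and '.md' is appended: 'tf.nn.relu' -> 'tf/nn/relu.md'.
  dirs = full_name.split('.')
  return os.path.join(*dirs) + '.md'

assert documentation_path('tf.nn.relu') == os.path.join('tf', 'nn', 'relu.md')
```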
@@ -35,8 +50,7 @@ def documentation_path(full_name): Given the fully qualified name of a library symbol, compute the path to which to write the documentation for that symbol (relative to a base directory). Documentation files are organized into directories that mirror the python - module/class structure. The path for the top-level module (whose full name is - '') is 'index.md'. + module/class structure. Args: full_name: Fully qualified name of a library symbol. @@ -44,12 +58,7 @@ def documentation_path(full_name): Returns: The file path to which to write the documentation for `full_name`. """ - # The main page is special, since it has no name in here. - if not full_name: - dirs = ['index'] - else: - dirs = full_name.split('.') - + dirs = full_name.split('.') return os.path.join(*dirs) + '.md' @@ -63,181 +72,466 @@ def _get_raw_docstring(py_object): Returns: The docstring, or the empty string if no docstring was found. """ - # For object instances, inspect.getdoc does give us the docstring of their + # For object instances, tf_inspect.getdoc does give us the docstring of their # type, which is not what we want. Only return the docstring if it is useful. - if (inspect.isclass(py_object) or inspect.ismethod(py_object) or - inspect.isfunction(py_object) or inspect.ismodule(py_object) or + if (tf_inspect.isclass(py_object) or tf_inspect.ismethod(py_object) or + tf_inspect.isfunction(py_object) or tf_inspect.ismodule(py_object) or isinstance(py_object, property)): - return inspect.getdoc(py_object) or '' + return tf_inspect.getdoc(py_object) or '' else: return '' -def _get_brief_docstring(py_object): - """Gets the one line docstring of a python object.""" - return _get_raw_docstring(py_object).split('\n')[0] +# A regular expression for capturing a @{symbol} reference. +SYMBOL_REFERENCE_RE = re.compile(r'@\{([^}]+)\}') -def _reference_to_link(ref_full_name, relative_path_to_root, duplicate_of): - """Resolve a "@{symbol}" reference to a relative path, respecting duplicates. - - The input to this function should already be stripped of the '@' and '{}', and - its output is only the link, not the full Markdown. +class ReferenceResolver(object): + """Class for replacing @{...} references with Markdown links. Args: - ref_full_name: The fully qualified name of the symbol to link to. - relative_path_to_root: The relative path from the location of the current - document to the root of the API documentation. - duplicate_of: A map from duplicate full names to master names. - - Returns: - A relative path that links from the documentation page of `from_full_name` - to the documentation page of `ref_full_name`. + duplicate_of: A map from duplicate names to preferred names of API + symbols. + doc_index: A `dict` mapping symbol name strings to objects with `url` + and `title` fields. Used to resolve @{$doc} references in docstrings. + index: A map from all full names to python objects. + py_module_names: A list of string names of Python modules. 
""" - master_name = duplicate_of.get(ref_full_name, ref_full_name) - ref_path = documentation_path(master_name) - return os.path.join(relative_path_to_root, ref_path) + + def __init__(self, duplicate_of, doc_index, is_class, is_module, + py_module_names): + self._duplicate_of = duplicate_of + self._doc_index = doc_index + self._is_class = is_class + self._is_module = is_module + self._all_names = set(is_class.keys()) + self._py_module_names = py_module_names + + @classmethod + def from_visitor(cls, visitor, doc_index, **kwargs): + """A factory function for building a ReferenceResolver from a visitor. + + Args: + visitor: an instance of `DocGeneratorVisitor` + doc_index: a dictionary mapping document names to references objects with + "title" and "url" fields + **kwargs: all remaining args are passed to the constructor + Returns: + an instance of `ReferenceResolver` () + """ + is_class = { + name: tf_inspect.isclass(visitor.index[name]) + for name, obj in visitor.index.items() + } + + is_module = { + name: tf_inspect.ismodule(visitor.index[name]) + for name, obj in visitor.index.items() + } + + return cls( + duplicate_of=visitor.duplicate_of, + doc_index=doc_index, + is_class=is_class, + is_module=is_module, + **kwargs) + + @classmethod + def from_json_file(cls, filepath, doc_index): + with open(filepath) as f: + json_dict = json.load(f) + + return cls(doc_index=doc_index, **json_dict) + + def to_json_file(self, filepath): + """Converts the RefenceResolver to json and writes it to the specified file. + + Args: + filepath: The file path to write the json to. + """ + json_dict = {} + for key, value in self.__dict__.items(): + # Drop these two fields. `_doc_index` is not serializable. `_all_names` is + # generated by the constructor. + if key in ('_doc_index', '_all_names'): + continue + + # Strip off any leading underscores on field names as these are not + # recognized by the constructor. + json_dict[key.lstrip('_')] = value + + with open(filepath, 'w') as f: + json.dump(json_dict, f) + + def replace_references(self, string, relative_path_to_root): + """Replace "@{symbol}" references with links to symbol's documentation page. + + This functions finds all occurrences of "@{symbol}" in `string` + and replaces them with markdown links to the documentation page + for "symbol". + + `relative_path_to_root` is the relative path from the document + that contains the "@{symbol}" reference to the root of the API + documentation that is linked to. If the containing page is part of + the same API docset, `relative_path_to_root` can be set to + `os.path.dirname(documentation_path(name))`, where `name` is the + python name of the object whose documentation page the reference + lives on. + + Args: + string: A string in which "@{symbol}" references should be replaced. + relative_path_to_root: The relative path from the containing document to + the root of the API documentation that is being linked to. + + Returns: + `string`, with "@{symbol}" references replaced by Markdown links. + """ + return re.sub(SYMBOL_REFERENCE_RE, + lambda match: self._one_ref(match.group(1), # pylint: disable=g-long-lambda + relative_path_to_root), + string) + + def python_link(self, link_text, ref_full_name, relative_path_to_root, + code_ref=True): + """Resolve a "@{python symbol}" reference to a Markdown link. + + This will pick the canonical location for duplicate symbols. The + input to this function should already be stripped of the '@' and + '{}'. This function returns a Markdown link. 
If `code_ref` is + true, it is assumed that this is a code reference, so the link + text will be rendered as code (using backticks). + `link_text` should refer to a library symbol, starting with 'tf.'. + + Args: + link_text: The text of the Markdown link. + ref_full_name: The fully qualified name of the symbol to link to. + relative_path_to_root: The relative path from the location of the current + document to the root of the API documentation. + code_ref: If true (the default), put `link_text` in `...`. + + Returns: + A markdown link to the documentation page of `ref_full_name`. + """ + link = self.reference_to_url(ref_full_name, relative_path_to_root) + if code_ref: + return '[`%s`](%s)' % (link_text, link) + else: + return '[%s](%s)' % (link_text, link) + + def py_master_name(self, full_name): + """Return the master name for a Python symbol name.""" + return self._duplicate_of.get(full_name, full_name) + + def reference_to_url(self, ref_full_name, relative_path_to_root): + """Resolve a "@{python symbol}" reference to a relative path. + + The input to this function should already be stripped of the '@' + and '{}', and its output is only the link, not the full Markdown. + + If `ref_full_name` is the name of a class member, method, or property, the + link will point to the page of the containing class, and it will include the + method name as an anchor. For example, `tf.module.MyClass.my_method` will be + translated into a link to + `os.join.path(relative_path_to_root, 'tf/module/MyClass.md#my_method')`. + + Args: + ref_full_name: The fully qualified name of the symbol to link to. + relative_path_to_root: The relative path from the location of the current + document to the root of the API documentation. + + Returns: + A relative path that links from the documentation page of `from_full_name` + to the documentation page of `ref_full_name`. + + Raises: + RuntimeError: If `ref_full_name` is not documented. + """ + master_name = self._duplicate_of.get(ref_full_name, ref_full_name) + + # Check whether this link exists + if master_name not in self._all_names: + # TODO(josh11b): Make error reporting more uniform. + print('ERROR: Cannot make link to %s (original: %s): Not in index.' % + (master_name, ref_full_name)) + return 'BROKEN_LINK' + + # If this is a member of a class, link to the class page with an anchor. + ref_path = None + if not (self._is_class[master_name] or self._is_module[master_name]): + idents = master_name.split('.') + if len(idents) > 1: + class_name = '.'.join(idents[:-1]) + assert class_name in self._all_names + if self._is_class[class_name]: + ref_path = documentation_path(class_name) + '#%s' % idents[-1] + + if not ref_path: + ref_path = documentation_path(master_name) + + return os.path.join(relative_path_to_root, ref_path) + + def _one_ref(self, string, relative_path_to_root): + """Return a link for a single "@{symbol}" reference.""" + # Look for link text after $. + dollar = string.rfind('$') + if dollar > 0: # Ignore $ in first character + link_text = string[dollar + 1:] + string = string[:dollar] + manual_link_text = True + else: + link_text = string + manual_link_text = False + + # Handle different types of references. 
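+ # Illustrative inputs: '@{$python/math_ops}' is a doc reference,
+ # '@{tensorflow::Tensor}' a C++ symbol, '@{tf.add}' a Python symbol, and
+ # '@{tf.add$the add op}' carries manual link text after the trailing '$'
+ # (the exact symbol names here are hypothetical).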
+
+  def _one_ref(self, string, relative_path_to_root):
+    """Return a link for a single "@{symbol}" reference."""
+    # Look for link text after $.
+    dollar = string.rfind('$')
+    if dollar > 0:  # Ignore a $ in the first character
+      link_text = string[dollar + 1:]
+      string = string[:dollar]
+      manual_link_text = True
+    else:
+      link_text = string
+      manual_link_text = False
+
+    # Handle different types of references.
+    if string.startswith('$'):  # Doc reference
+      return self._doc_link(
+          string, link_text, manual_link_text, relative_path_to_root)
+
+    elif string.startswith('tensorflow::'):
+      # C++ symbol
+      return self._cc_link(
+          string, link_text, manual_link_text, relative_path_to_root)
+
+    else:
+      is_python = False
+      for py_module_name in self._py_module_names:
+        if string == py_module_name or string.startswith(py_module_name + '.'):
+          is_python = True
+          break
+      if is_python:  # Python symbol
+        return self.python_link(link_text, string, relative_path_to_root,
+                                code_ref=not manual_link_text)
+
+    # Error!
+    log_error('Did not understand "@{%s}"' % string)
+    return 'ERROR:%s' % string
+
+  def _doc_link(self, string, link_text, manual_link_text,
+                relative_path_to_root):
+    """Generate a link for a @{$...} reference."""
+    string = string[1:]  # remove leading $
+
+    # If string has a #, split that part into `hash_tag`
+    hash_pos = string.find('#')
+    if hash_pos > -1:
+      hash_tag = string[hash_pos:]
+      string = string[:hash_pos]
+    else:
+      hash_tag = ''
+
+    if string in self._doc_index:
+      if not manual_link_text:
+        link_text = self._doc_index[string].title
+      url = os.path.normpath(os.path.join(
+          relative_path_to_root, '../..', self._doc_index[string].url))
+      return '[%s](%s%s)' % (link_text, url, hash_tag)
+    return self._doc_missing(string, hash_tag, link_text, manual_link_text,
+                             relative_path_to_root)
+
+  def _doc_missing(self, string, unused_hash_tag, link_text,
+                   unused_manual_link_text, unused_relative_path_to_root):
+    """Generate an error for unrecognized @{$...} references."""
+    log_error('Handle doc reference "@{$%s}"' % string)
+    return link_text
+
+  def _cc_link(self, string, link_text, unused_manual_link_text,
+               relative_path_to_root):
+    """Generate a link for a @{tensorflow::...} reference."""
+    # TODO(josh11b): Fix this hard-coding of paths.
+    if string == 'tensorflow::ClientSession':
+      ret = 'class/tensorflow/client-session.md'
+    elif string == 'tensorflow::Scope':
+      ret = 'class/tensorflow/scope.md'
+    elif string == 'tensorflow::Status':
+      ret = 'class/tensorflow/status.md'
+    elif string == 'tensorflow::Tensor':
+      ret = 'class/tensorflow/tensor.md'
+    elif string == 'tensorflow::ops::Const':
+      ret = 'namespace/tensorflow/ops.md#const'
+    else:
+      log_error('Handle C++ reference "@{%s}"' % string)
+      return 'TODO_C++:%s' % string
+    # relative_path_to_root gets you to api_docs/python, we go from there
+    # to api_docs/cc, and then add ret.
+    cc_relative_path = os.path.normpath(os.path.join(
+        relative_path_to_root, '../cc', ret))
+    return '[`%s`](%s)' % (link_text, cc_relative_path)
 
 
-def _markdown_link(link_text, ref_full_name, relative_path_to_root,
-                   duplicate_of):
-  """Resolve a "@{symbol}" reference to a Markdown link, respecting duplicates.
-
-  The input to this function should already be stripped of the '@' and '{}'.
-  This function returns a Markdown link. It is assumed that this is a code
-  reference, so the link text will always be rendered as code (using backticks).
-
-  `link_text` should refer to a library symbol. You can either refer to it with
-  or without the `tf.` prefix.
+# TODO(aselle): Collect these into a big list for all modules and functions
+# and make a rosetta stone page.
+def _handle_compatibility(doc):
+  """Parse and remove compatibility blocks from the main docstring.
 
   Args:
-    link_text: The text of the Markdown link.
-    ref_full_name: The fully qualified name of the symbol to link to
-      (may optionally include 'tf.').
-    relative_path_to_root: The relative path from the location of the current
-      document to the root of the API documentation.
-    duplicate_of: A map from duplicate full names to master names.
+    doc: The docstring that contains compatibility notes.
 
   Returns:
-    A markdown link from the documentation page of `from_full_name`
-    to the documentation page of `ref_full_name`.
+    A tuple of the modified docstring and a dict mapping compatibility
+    note type to the text of the note.
   """
-  if ref_full_name.startswith('tf.'):
-    ref_full_name = ref_full_name[3:]
-
-  return '[`%s`](%s)' % (
-      link_text,
-      _reference_to_link(ref_full_name, relative_path_to_root, duplicate_of))
+  compatibility_notes = {}
+  match_compatibility = re.compile(r'[ \t]*@compatibility\((\w+)\)\s*\n'
+                                   r'((?:[^@\n]*\n)+)'
+                                   r'\s*@end_compatibility')
+  for f in match_compatibility.finditer(doc):
+    compatibility_notes[f.group(1)] = f.group(2)
+  return match_compatibility.subn('', doc)[0], compatibility_notes
 
 
-def replace_references(string, relative_path_to_root, duplicate_of):
-  """Replace "@{symbol}" references with links to symbol's documentation page.
-
-  This functions finds all occurrences of "@{symbol}" in `string` and replaces
-  them with markdown links to the documentation page for "symbol".
-
-  `relative_path_to_root` is the relative path from the document that contains
-  the "@{symbol}" reference to the root of the API documentation that is linked
-  to. If the containing page is part of the same API docset,
-  `relative_path_to_root` can be set to
-  `os.path.dirname(documentation_path(name))`, where `name` is the python name
-  of the object whose documentation page the reference lives on.
+def _gen_pairs(items):
+  """Given a list of items [a,b,a,b...], generate pairs [(a,b),(a,b)...].
 
   Args:
-    string: A string in which "@{symbol}" references should be replaced.
-    relative_path_to_root: The relative path from the contianing document to the
-      root of the API documentation that is being linked to.
-    duplicate_of: A map from duplicate names to preferred names of API symbols.
+    items: A list of items (length must be even)
+
+  Yields:
+    The original items, in pairs
+  """
+  assert len(items) % 2 == 0
+  items = iter(items)
+  for first in items:
+    # Terminate via the for loop rather than letting StopIteration escape the
+    # generator (which PEP 479 turns into a RuntimeError); the assert above
+    # guarantees next() always finds the second item of the pair.
+    yield first, next(items)
+
+
+class _FunctionDetail(
+    collections.namedtuple('_FunctionDetail', ['keyword', 'header', 'items'])):
+  """A simple class to contain function details.
+
+  Composed of a "keyword", a possibly empty "header" string, and a possibly
+  empty list of key-value pair "items".
+  """
+  __slots__ = []
+
+  def __str__(self):
+    """Return the original string that represents the function detail."""
+    parts = [self.keyword + ':\n']
+    parts.append(self.header)
+    for key, value in self.items:
+      parts.append('  ' + key + ':')
+      parts.append(value)
+
+    return ''.join(parts)
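As a quick illustration of `_handle_compatibility`, here is a minimal stand-alone sketch; the regex is the one used above, while the sample docstring and driver code around it are invented for the example:

```python
import re

match_compatibility = re.compile(r'[ \t]*@compatibility\((\w+)\)\s*\n'
                                 r'((?:[^@\n]*\n)+)'
                                 r'\s*@end_compatibility')

doc = ('Does something.\n'
       '@compatibility(numpy)\n'
       'Equivalent to np.something.\n'
       '@end_compatibility\n')

# Collect the per-type notes, then strip the blocks from the docstring.
notes = {m.group(1): m.group(2) for m in match_compatibility.finditer(doc)}
stripped = match_compatibility.subn('', doc)[0]
print(notes)     # {'numpy': 'Equivalent to np.something.\n'}
print(stripped)  # 'Does something.\n' followed by a blank line
```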
+
+
+def _parse_function_details(docstring):
+  r"""Given a docstring, split off the header and parse the function details.
+
+  For example the docstring of tf.nn.relu:
+
+  '''Computes rectified linear: `max(features, 0)`.
+
+  Args:
+    features: A `Tensor`. Must be one of the following types: `float32`,
+      `float64`, `int32`, `int64`, `uint8`, `int16`, `int8`, `uint16`,
+      `half`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor`. Has the same type as `features`.
+  '''
+
+  This is parsed, and returned as:
+
+  ```
+  ('Computes rectified linear: `max(features, 0)`.\n\n', [
+      _FunctionDetail(
+          keyword='Args',
+          header='',
+          items=[
+              ('features', ' A `Tensor`. Must be ...'),
+              ('name', ' A name for the operation (optional).\n\n')]),
+      _FunctionDetail(
+          keyword='Returns',
+          header=' A `Tensor`. Has the same type as `features`.',
+          items=[])
+  ])
+  ```
+
+  Args:
+    docstring: The docstring to parse.
+
+  Returns:
+    A (header, function_details) pair, where header is a string and
+    function_details is a (possibly empty) list of `_FunctionDetail` objects.
   """
-  full_name_re = '%s(.%s)*' % (IDENTIFIER_RE, IDENTIFIER_RE)
-  symbol_reference_re = re.compile(r'@\{(' + full_name_re + r')\}')
-  match = symbol_reference_re.search(string)
-  while match:
-    symbol_name = match.group(1)
-    link_text = _markdown_link(symbol_name, symbol_name,
-                               relative_path_to_root, duplicate_of)
-    # Remove only the '@symbol' part of the match, and replace with the link.
-    string = string[:match.start()] + link_text + string[match.end():]
-    match = symbol_reference_re.search(string,
-                                       pos=match.start() + len(link_text))
-  return string
+  detail_keywords = '|'.join([
+      'Args', 'Arguments', 'Fields', 'Returns', 'Yields', 'Raises', 'Attributes'
+  ])
+  tag_re = re.compile('(?<=\n)(' + detail_keywords + '):\n', re.MULTILINE)
+  parts = tag_re.split(docstring)
+
+  # The first part is the main docstring.
+  docstring = parts[0]
+
+  # Everything else alternates keyword-content.
+  pairs = list(_gen_pairs(parts[1:]))
+
+  function_details = []
+  item_re = re.compile(r'^  (\w+):', re.MULTILINE)
+
+  for keyword, content in pairs:
+    content = item_re.split(content)
+    header = content[0]
+    items = list(_gen_pairs(content[1:]))
+
+    function_details.append(_FunctionDetail(keyword, header, items))
+
+  return docstring, function_details
 
 
-def _md_docstring(py_object, relative_path_to_root, duplicate_of):
-  """Get the docstring from an object and make it into nice Markdown.
+_DocstringInfo = collections.namedtuple('_DocstringInfo', [
+    'brief', 'docstring', 'function_details', 'compatibility'
+])
+
+
+def _parse_md_docstring(py_object, relative_path_to_root, reference_resolver):
+  """Parse the object's docstring and return a `_DocstringInfo`.
+
+  This function clears @@'s from the docstring, and replaces @{} references
+  with markdown links.
 
   For links within the same set of docs, the `relative_path_to_root` for a
-  docstring on the page for `full_name` can be set to
+  docstring on the page for `full_name` can be set to:
 
   ```python
   relative_path_to_root = os.path.relpath(
-      os.path.dirname(documentation_path(full_name)) or '.', '.')
+      path='.', start=os.path.dirname(documentation_path(full_name)) or '.')
   ```
 
   Args:
     py_object: A python object to retrieve the docs for (class, function/method,
       or module).
     relative_path_to_root: The relative path from the location of the current
-      document to the root of the API documentation. This is used to compute
-      links for "@symbol" references.
-    duplicate_of: A map from duplicate symbol names to master names. Used to
-      resolve "@symbol" references.
+      document to the root of the Python API documentation. This is used to
+      compute links for "@{symbol}" references.
+    reference_resolver: An instance of ReferenceResolver.
 
   Returns:
-    The docstring, or the empty string if no docstring was found.
+    A `_DocstringInfo` object; all fields will be empty if no docstring was
+    found.
   """
   # TODO(wicke): If this is a partial, use the .func docstring and add a note.
raw_docstring = _get_raw_docstring(py_object) - raw_lines = raw_docstring.split('\n') - # Define regular expressions used during parsing below. - symbol_list_item_re = re.compile(r'^ (%s): ' % IDENTIFIER_RE) - section_re = re.compile(r'^(\w+):\s*$') + raw_docstring = reference_resolver.replace_references( + raw_docstring, relative_path_to_root) - # Translate docstring line by line. - in_special_section = False - lines = [] + atat_re = re.compile(r' *@@[a-zA-Z_.0-9]+ *$') + raw_docstring = '\n'.join( + line for line in raw_docstring.split('\n') if not atat_re.match(line)) - def is_section_start(i): - # Previous line is empty, line i is "Word:", and next line is indented. - return (i > 0 and not raw_lines[i-1].strip() and - re.match(section_re, raw_lines[i]) and - len(raw_lines) > i+1 and raw_lines[i+1].startswith(' ')) + docstring, compatibility = _handle_compatibility(raw_docstring) + docstring, function_details = _parse_function_details(docstring) - for i, line in enumerate(raw_lines): - if not in_special_section and is_section_start(i): - in_special_section = True - lines.append('#### ' + section_re.sub(r'\1:', line)) - lines.append('') - continue - - # If the next line starts a new section, this one ends. Add an extra line. - if in_special_section and is_section_start(i+1): - in_special_section = False - lines.append('') - - if in_special_section: - # Translate symbols in 'Args:', 'Parameters:', 'Raises:', etc. sections. - lines.append(symbol_list_item_re.sub(r'* `\1`: ', line)) - else: - lines.append(line) - - docstring = '\n'.join(lines) - - # TODO(deannarubin): Improve formatting for devsite - # TODO(deannarubin): Interpret @compatibility and other formatting notes. - - return replace_references(docstring, relative_path_to_root, duplicate_of) + return _DocstringInfo( + docstring.split('\n')[0], docstring, function_details, compatibility) def _get_arg_spec(func): """Extracts signature information from a function or functools.partial object. - For functions, uses `inspect.getargspec`. For `functools.partial` objects, + For functions, uses `tf_inspect.getargspec`. For `functools.partial` objects, corrects the signature of the underlying function to take into account the removed arguments. @@ -246,11 +540,11 @@ def _get_arg_spec(func): Returns: An `ArgSpec` namedtuple `(args, varargs, keywords, defaults)`, as returned - by `inspect.getargspec`. + by `tf_inspect.getargspec`. """ # getargspec does not work for functools.partial objects directly. if isinstance(func, functools.partial): - argspec = inspect.getargspec(func.func) + argspec = tf_inspect.getargspec(func.func) # Remove the args from the original function that have been used up. first_default_arg = ( len(argspec.args or []) - len(argspec.defaults or [])) @@ -273,19 +567,24 @@ def _get_arg_spec(func): argspec_defaults.pop(i-first_default_arg) else: first_default_arg -= 1 - return inspect.ArgSpec(args=argspec_args, - varargs=argspec.varargs, - keywords=argspec.keywords, - defaults=tuple(argspec_defaults)) + return tf_inspect.ArgSpec(args=argspec_args, + varargs=argspec.varargs, + keywords=argspec.keywords, + defaults=tuple(argspec_defaults)) else: # Regular function or method, getargspec will work fine. - return inspect.getargspec(func) + return tf_inspect.getargspec(func) -def _generate_signature(func): - """Given a function, returns a string representing its args. 
+def _remove_first_line_indent(string):
+  indent = len(re.match(r'^\s*', string).group(0))
+  return '\n'.join([line[indent:] for line in string.split('\n')])
 
-  This function produces a string representing the arguments to a python
-  function, including surrounding parentheses. It uses inspect.getargspec, which
+
+def _generate_signature(func, reverse_index):
+  """Given a function, returns a list of strings representing its args.
+
+  This function produces a list of strings representing the arguments to a
+  python function. It uses tf_inspect.getargspec, which
   does not generalize well to Python 3.x, since Python 3 is more flexible in
   how *args and **kwargs are handled. This is not a problem in TF, since we
   have to remain compatible with Python 2.7 anyway.
@@ -297,16 +596,14 @@ def _generate_signature(func):
   document, it should be typeset as code (using backticks), or escaped.
 
   Args:
-    func: A function of method to extract the signature for (anything
-      `inspect.getargspec` will accept).
+    func: A function, method, or functools.partial to extract the signature for.
+    reverse_index: A map from object ids to canonical full names to use.
 
   Returns:
-    A string representing the signature of `func` as python code.
+    A list of strings representing the argument signature of `func` as python
+    code.
   """
-  # This produces poor signatures for decorated functions.
-  # TODO(wicke): We need to use something like the decorator module to fix it.
-
   args_list = []
 
   argspec = _get_arg_spec(func)
@@ -324,14 +621,49 @@ def _generate_signature(func):
 
   # Add all args with defaults.
   if argspec.defaults:
-    for arg, default in zip(
-        argspec.args[first_arg_with_default:], argspec.defaults):
-      # Some callables don't have __name__, fall back to including their repr.
-      # TODO(wicke): This could be improved at least for common cases.
-      if callable(default) and hasattr(default, '__name__'):
-        args_list.append('%s=%s' % (arg, default.__name__))
+    try:
+      source = _remove_first_line_indent(tf_inspect.getsource(func))
+      func_ast = ast.parse(source)
+      ast_defaults = func_ast.body[0].args.defaults
+    except IOError:  # If this is a builtin, getsource fails with IOError
+      # If we cannot get the source, assume the AST would be equal to the repr
+      # of the defaults.
+      ast_defaults = [None] * len(argspec.defaults)
+
+    for arg, default, ast_default in zip(
+        argspec.args[first_arg_with_default:], argspec.defaults, ast_defaults):
+      if id(default) in reverse_index:
+        default_text = reverse_index[id(default)]
+      elif ast_default is not None:
+        default_text = codegen.to_source(ast_default)
+        if default_text != repr(default):
+          # This may be an internal name. If so, handle the ones we know about.
+          # TODO(wicke): This should be replaced with a lookup in the index.
+          # TODO(wicke): (replace first ident with tf., check if in index)
+          internal_names = {
+              'ops.GraphKeys': 'tf.GraphKeys',
+              '_ops.GraphKeys': 'tf.GraphKeys',
+              'init_ops.zeros_initializer': 'tf.zeros_initializer',
+              'init_ops.ones_initializer': 'tf.ones_initializer',
+              'saver_pb2.SaverDef': 'tf.train.SaverDef',
+          }
+          full_name_re = '^%s(.%s)+' % (IDENTIFIER_RE, IDENTIFIER_RE)
+          match = re.match(full_name_re, default_text)
+          if match:
+            lookup_text = default_text
+            for internal_name, public_name in six.iteritems(internal_names):
+              if match.group(0).startswith(internal_name):
+                lookup_text = public_name + default_text[len(internal_name):]
+                break
+            if default_text is lookup_text:
+              # `lookup_text` still aliases `default_text` only if no internal
+              # name matched above, i.e. the lookup failed.
+              print('WARNING: Using default arg, failed lookup: %s, repr: %r' %
+                    (default_text, default))
+            else:
+              default_text = lookup_text
       else:
-        args_list.append('%s=%r' % (arg, default))
+        default_text = repr(default)
+
+      args_list.append('%s=%s' % (arg, default_text))
 
   # Add *args and *kwargs.
   if argspec.varargs:
     args_list.append('*' + argspec.varargs)
@@ -339,198 +671,624 @@ def _generate_signature(func):
   if argspec.keywords:
     args_list.append('**' + argspec.keywords)
 
-  return '(%s)' % ', '.join(args_list)
+  return args_list
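The AST-based default rendering above can be tried in isolation. A minimal sketch: `codegen.to_source` in the patch is a third-party helper; on modern Python, the standard-library `ast.unparse` (3.9+) plays the same role here:

```python
import ast
import inspect

def f(x, scope='default', keys=('a', 'b')):
  return x

# Render default values from the source AST rather than repr(), so that
# e.g. tuple literals appear exactly as written in the code.
source = inspect.getsource(f)
func_ast = ast.parse(source)
defaults = func_ast.body[0].args.defaults
print([ast.unparse(d) for d in defaults])  # ["'default'", "('a', 'b')"]
```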
-def _generate_markdown_for_function(full_name, duplicate_names,
-                                    function, duplicate_of):
-  """Generate Markdown docs for a function or method.
+def _get_guides_markdown(duplicate_names, guide_index, relative_path):
+  all_guides = []
+  for name in duplicate_names:
+    all_guides.extend(guide_index.get(name, []))
+  if not all_guides:
+    return ''
+  prefix = '../' * (relative_path.count('/') + 3)
+  links = sorted(set([guide_ref.make_md_link(prefix)
+                      for guide_ref in all_guides]))
+  return 'See the guide%s: %s\n\n' % (
+      's' if len(links) > 1 else '', ', '.join(links))
 
-  This function creates a documentation page for a function. It uses the
-  function name (incl. signature) as the title, followed by a list of duplicate
-  names (if there are any), and the Markdown formatted docstring of the
-  function.
 
-  Args:
-    full_name: The preferred name of the function. Used in the title. Must not
-      be present in `duplicate_of` (master names never are).
-    duplicate_names: A sorted list of alternative names (incl. `full_name`).
-    function: The python object referenced by `full_name`.
-    duplicate_of: A map of duplicate full names to master names. Used to resolve
-      @{symbol} references in the docstring.
+def _get_defining_class(py_class, name):
+  for cls in tf_inspect.getmro(py_class):
+    if name in cls.__dict__:
+      return cls
+  return None
 
-  Returns:
-    A string that can be written to a documentation file for this function.
+
+class _LinkInfo(
    collections.namedtuple(
        '_LinkInfo', ['short_name', 'full_name', 'obj', 'doc', 'url'])):
+
+  __slots__ = []
+
+  def is_link(self):
+    return True
+
+
+class _OtherMemberInfo(
+    collections.namedtuple('_OtherMemberInfo',
+                           ['short_name', 'full_name', 'obj', 'doc'])):
+
+  __slots__ = []
+
+  def is_link(self):
+    return False
+
+
+_PropertyInfo = collections.namedtuple(
+    '_PropertyInfo', ['short_name', 'full_name', 'obj', 'doc'])
+
+_MethodInfo = collections.namedtuple(
+    '_MethodInfo', ['short_name', 'full_name', 'obj', 'doc', 'signature'])
+
+
+class _FunctionPageInfo(object):
+  """Collects docs for a function page."""
+
+  def __init__(self, full_name):
+    self._full_name = full_name
+    self._defined_in = None
+    self._aliases = None
+    self._doc = None
+    self._guides = None
+
+    self._signature = None
+
+  def for_function(self):
+    return True
+
+  def for_class(self):
+    return False
+
+  def for_module(self):
+    return False
+
+  @property
+  def full_name(self):
+    return self._full_name
+
+  @property
+  def short_name(self):
+    return self._full_name.split('.')[-1]
+
+  @property
+  def defined_in(self):
+    return self._defined_in
+
+  def set_defined_in(self, defined_in):
+    assert self.defined_in is None
+    self._defined_in = defined_in
+
+  @property
+  def aliases(self):
+    return self._aliases
+
+  def set_aliases(self, aliases):
+    assert self.aliases is None
+    self._aliases = aliases
+
+  @property
+  def doc(self):
+    return self._doc
+
+  def set_doc(self, doc):
+    assert self.doc is None
+    self._doc = doc
+
+  @property
+  def guides(self):
+    return self._guides
+
+  def set_guides(self, guides):
+    assert self.guides is None
+    self._guides = guides
+
+  @property
+  def signature(self):
+    return self._signature
+
+  def set_signature(self, function, reverse_index):
+    """Attach the function's signature.
+
+    Args:
+      function: The python function being documented.
+      reverse_index: A map from object ids in the index to full names.
+    """
+
+    assert self.signature is None
+    self._signature = _generate_signature(function, reverse_index)
+
+
+class _ClassPageInfo(object):
+  """Collects docs for a class page.
+
+  Attributes:
+    full_name: The fully qualified name of the object at the master
+      location. Aka `master_name`. For example: `tf.nn.sigmoid`.
+    short_name: The last component of the `full_name`. For example: `sigmoid`.
+    defined_in: The path to the file where this object is defined.
+    aliases: The list of all fully qualified names for the locations where the
+      object is visible in the public api. This includes the master location.
+    doc: A `_DocstringInfo` object representing the object's docstring (can be
+      created with `_parse_md_docstring`).
+    guides: A markdown string of back links pointing to the api_guides that
+      reference this object.
+    bases: A list of `_LinkInfo` objects pointing to the docs for the parent
+      classes.
+    properties: A list of `_PropertyInfo` objects documenting the class'
+      properties (attributes that use `@property`).
+    methods: A list of `_MethodInfo` objects documenting the class' methods.
+    classes: A list of `_LinkInfo` objects pointing to docs for any nested
+      classes.
+    other_members: A list of `_OtherMemberInfo` objects documenting any other
+      objects defined inside the class object (mostly enum style fields).
   """
-  # TODO(wicke): Make sure this works for partials.
-  relative_path = os.path.relpath(
-      os.path.dirname(documentation_path(full_name)) or '.', '.')
-  docstring = _md_docstring(function, relative_path, duplicate_of)
-  signature = _generate_signature(function)
-  if duplicate_names:
-    aliases = '\n'.join(['### `%s`' % (name + signature)
-                         for name in duplicate_names])
-    aliases += '\n\n'
-  else:
-    aliases = ''
+  def __init__(self, full_name):
+    self._full_name = full_name
+    self._defined_in = None
+    self._aliases = None
+    self._doc = None
+    self._guides = None
 
-  return '#`%s%s`\n\n%s%s' % (full_name, signature, aliases, docstring)
+    self._bases = None
+    self._properties = []
+    self._methods = []
+    self._classes = []
+    self._other_members = []
+
+  def for_function(self):
+    """Returns true if this object documents a function."""
+    return False
+
+  def for_class(self):
+    """Returns true if this object documents a class."""
+    return True
+
+  def for_module(self):
+    """Returns true if this object documents a module."""
+    return False
+
+  @property
+  def full_name(self):
+    """Returns the documented object's fully qualified name."""
+    return self._full_name
+
+  @property
+  def short_name(self):
+    """Returns the documented object's short name."""
+    return self._full_name.split('.')[-1]
+
+  @property
+  def defined_in(self):
+    """Returns the path to the file where the documented object is defined."""
+    return self._defined_in
+
+  def set_defined_in(self, defined_in):
+    """Sets the `defined_in` path."""
+    assert self.defined_in is None
+    self._defined_in = defined_in
+
+  @property
+  def aliases(self):
+    """Returns a list of all full names for the documented object."""
+    return self._aliases
+
+  def set_aliases(self, aliases):
+    """Sets the `aliases` list.
+
+    Args:
+      aliases: A list of strings containing all of the object's full names.
+    """
+    assert self.aliases is None
+    self._aliases = aliases
+
+  @property
+  def doc(self):
+    """Returns a `_DocstringInfo` created from the object's docstring."""
+    return self._doc
+
+  def set_doc(self, doc):
+    """Sets the `doc` field.
+
+    Args:
+      doc: An instance of `_DocstringInfo`.
+    """
+    assert self.doc is None
+    self._doc = doc
+
+  @property
+  def guides(self):
+    """Returns a markdown string containing backlinks to relevant api_guides."""
+    return self._guides
+
+  def set_guides(self, guides):
+    """Sets the `guides` field.
+
+    Args:
+      guides: A markdown string containing backlinks to all the api_guides that
+        link to the documented object.
+    """
+    assert self.guides is None
+    self._guides = guides
+
+  @property
+  def bases(self):
+    """Returns a list of `_LinkInfo` objects pointing to the class' parents."""
+    return self._bases
+
+  def _set_bases(self, relative_path, parser_config):
+    """Builds the `bases` attribute, to document this class' parent-classes.
+
+    This method sets `bases` to a list of `_LinkInfo` objects pointing to the
+    doc pages for the class' parents.
+
+    Args:
+      relative_path: The relative path from the doc this object describes to
+        the documentation root.
+      parser_config: An instance of `ParserConfig`.
+ """ + bases = [] + obj = parser_config.py_name_to_object(self.full_name) + for base in obj.__bases__: + base_full_name = parser_config.reverse_index.get(id(base), None) + if base_full_name is None: + continue + base_doc = _parse_md_docstring(base, relative_path, + parser_config.reference_resolver) + base_url = parser_config.reference_resolver.reference_to_url( + base_full_name, relative_path) + + link_info = _LinkInfo(short_name=base_full_name.split('.')[-1], + full_name=base_full_name, obj=base, + doc=base_doc, url=base_url) + bases.append(link_info) + + self._bases = bases + + @property + def properties(self): + """Returns a list of `_PropertyInfo` describing the class' properties.""" + return self._properties + + def _add_property(self, short_name, full_name, obj, doc): + """Adds a `_PropertyInfo` entry to the `properties` list. + + Args: + short_name: The property's short name. + full_name: The property's fully qualified name. + obj: The property object itself + doc: The property's parsed docstring, a `_DocstringInfo`. + """ + property_info = _PropertyInfo(short_name, full_name, obj, doc) + self._properties.append(property_info) + + @property + def methods(self): + """Returns a list of `_MethodInfo` describing the class' methods.""" + return self._methods + + def _add_method(self, short_name, full_name, obj, doc, signature): + """Adds a `_MethodInfo` entry to the `methods` list. + + Args: + short_name: The method's short name. + full_name: The method's fully qualified name. + obj: The method object itself + doc: The method's parsed docstring, a `_DocstringInfo` + signature: The method's parsed signature (see: `_generate_signature`) + """ + method_info = _MethodInfo(short_name, full_name, obj, doc, signature) + self._methods.append(method_info) + + @property + def classes(self): + """Returns a list of `_LinkInfo` pointing to any nested classes.""" + return self._classes + + def _add_class(self, short_name, full_name, obj, doc, url): + """Adds a `_LinkInfo` for a nested class to `classes` list. + + Args: + short_name: The class' short name. + full_name: The class' fully qualified name. + obj: The class object itself + doc: The class' parsed docstring, a `_DocstringInfo` + url: A url pointing to where the nested class is documented. + """ + page_info = _LinkInfo(short_name, full_name, obj, doc, url) + + self._classes.append(page_info) + + @property + def other_members(self): + """Returns a list of `_OtherMemberInfo` describing any other contents.""" + return self._other_members + + def _add_other_member(self, short_name, full_name, obj, doc): + """Adds an `_OtherMemberInfo` entry to the `other_members` list. + + Args: + short_name: The class' short name. + full_name: The class' fully qualified name. + obj: The class object itself + doc: The class' parsed docstring, a `_DocstringInfo` + """ + other_member_info = _OtherMemberInfo(short_name, full_name, obj, doc) + self._other_members.append(other_member_info) + + def collect_docs_for_class(self, py_class, parser_config): + """Collects information necessary specifically for a class's doc page. + + Mainly, this is details about the class's members. + + Args: + py_class: The class object being documented + parser_config: An instance of ParserConfig. 
+ """ + doc_path = documentation_path(self.full_name) + relative_path = os.path.relpath( + path='.', start=os.path.dirname(doc_path) or '.') + + self._set_bases(relative_path, parser_config) + + for short_name in parser_config.tree[self.full_name]: + # Remove builtin members that we never want to document. + if short_name in ['__class__', '__base__', '__weakref__', '__doc__', + '__module__', '__dict__', '__abstractmethods__', + '__slots__', '__getnewargs__']: + continue + + child_name = '.'.join([self.full_name, short_name]) + child = parser_config.py_name_to_object(child_name) + + # Don't document anything that is defined in object or by protobuf. + defining_class = _get_defining_class(py_class, short_name) + if (defining_class is object or + defining_class is type or defining_class is tuple or + defining_class is BaseException or defining_class is Exception or + # The following condition excludes most protobuf-defined symbols. + defining_class and defining_class.__name__ in ['CMessage', 'Message', + 'MessageMeta']): + continue + # TODO(markdaoust): Add a note in child docs showing the defining class. + + child_doc = _parse_md_docstring(child, relative_path, + parser_config.reference_resolver) + + if isinstance(child, property): + self._add_property(short_name, child_name, child, child_doc) + + elif tf_inspect.isclass(child): + if defining_class is None: + continue + url = parser_config.reference_resolver.reference_to_url( + child_name, relative_path) + self._add_class(short_name, child_name, child, child_doc, url) + + elif (tf_inspect.ismethod(child) or tf_inspect.isfunction(child) or + tf_inspect.isroutine(child)): + if defining_class is None: + continue + + # Omit methods defined by namedtuple. + original_method = defining_class.__dict__[short_name] + if (hasattr(original_method, '__module__') and + (original_method.__module__ or '').startswith('namedtuple')): + continue + + # Some methods are often overridden without documentation. Because it's + # obvious what they do, don't include them in the docs if there's no + # docstring. + if not child_doc.brief.strip() and short_name in [ + '__str__', '__repr__', '__hash__', '__del__', '__copy__']: + print('Skipping %s, defined in %s, no docstring.' % (child_name, + defining_class)) + continue + + try: + child_signature = _generate_signature(child, + parser_config.reverse_index) + except TypeError: + # If this is a (dynamically created) slot wrapper, tf_inspect will + # raise typeerror when trying to get to the code. Ignore such + # functions. + continue + + self._add_method(short_name, child_name, child, child_doc, + child_signature) + else: + # Exclude members defined by protobuf that are useless + if issubclass(py_class, ProtoMessage): + if (short_name.endswith('_FIELD_NUMBER') or + short_name in ['__slots__', 'DESCRIPTOR']): + continue + + # TODO(wicke): We may want to also remember the object itself. + self._add_other_member(short_name, child_name, child, child_doc) -def _generate_markdown_for_class(full_name, duplicate_names, py_class, - duplicate_of, index, tree): - """Generate Markdown docs for a class. +class _ModulePageInfo(object): + """Collects docs for a module page.""" - This function creates a documentation page for a class. It uses the - class name as the title, followed by a list of duplicate - names (if there are any), the Markdown formatted docstring of the - class, a list of links to all child class docs, a list of all properties - including their docstrings, a list of all methods incl. 
their docstrings, and - a list of all class member names (public fields). + def __init__(self, full_name): + self._full_name = full_name + self._defined_in = None + self._aliases = None + self._doc = None + self._guides = None - Args: - full_name: The preferred name of the class. Used in the title. Must not - be present in `duplicate_of` (master names never are). - duplicate_names: A sorted list of alternative names (incl. `full_name`). - py_class: The python object referenced by `full_name`. - duplicate_of: A map of duplicate full names to master names. Used to resolve - @{symbol} references in the docstrings. - index: A map from full names to python object references. - tree: A map from full names to the names of all documentable child objects. + self._modules = [] + self._classes = [] + self._functions = [] + self._other_members = [] - Returns: - A string that can be written to a documentation file for this class. - """ - relative_path = os.path.relpath( - os.path.dirname(documentation_path(full_name)) or '.', '.') - docstring = _md_docstring(py_class, relative_path, duplicate_of) - if duplicate_names: - aliases = '\n'.join(['### `class %s`' % name for name in duplicate_names]) - aliases += '\n\n' - else: - aliases = '' + def for_function(self): + return False - docs = '# `%s`\n\n%s%s\n\n' % (full_name, aliases, docstring) + def for_class(self): + return False - field_names = [] - properties = [] - methods = [] - class_links = [] - for member in tree[full_name]: - child_name = '.'.join([full_name, member]) - child = index[child_name] + def for_module(self): + return True - if isinstance(child, property): - properties.append((member, child)) - elif inspect.isclass(child): - class_links.append(_markdown_link('class ' + member, child_name, - relative_path, duplicate_of)) - elif inspect.ismethod(child) or inspect.isfunction(child): - methods.append((member, child)) - else: - # TODO(wicke): We may want to also remember the object itself. - field_names.append(member) + @property + def full_name(self): + return self._full_name - if class_links: - docs += '## Child Classes\n' - docs += '\n\n'.join(sorted(class_links)) - docs += '\n\n' + @property + def short_name(self): + return self._full_name.split('.')[-1] - if properties: - docs += '## Properties\n\n' - for property_name, prop in sorted(properties, key=lambda x: x[0]): - docs += '### `%s`\n\n%s\n\n' % ( - property_name, _md_docstring(prop, relative_path, duplicate_of)) - docs += '\n\n' + @property + def defined_in(self): + return self._defined_in - if methods: - docs += '## Methods\n\n' - for method_name, method in sorted(methods, key=lambda x: x[0]): - method_signature = method_name + _generate_signature(method) - docs += '### `%s`\n\n%s\n\n' % (method_signature, - _md_docstring(method, relative_path, - duplicate_of)) - docs += '\n\n' + def set_defined_in(self, defined_in): + assert self.defined_in is None + self._defined_in = defined_in - if field_names: - docs += '## Class Members\n\n' - # TODO(wicke): Document the value of the members, at least for basic types. 
- docs += '\n\n'.join(sorted(field_names)) - docs += '\n\n' + @property + def aliases(self): + return self._aliases - return docs + def set_aliases(self, aliases): + assert self.aliases is None + self._aliases = aliases + + @property + def doc(self): + return self._doc + + def set_doc(self, doc): + assert self.doc is None + self._doc = doc + + @property + def guides(self): + return self._guides + + def set_guides(self, guides): + assert self.guides is None + self._guides = guides + + @property + def modules(self): + return self._modules + + def _add_module(self, short_name, full_name, obj, doc, url): + self._modules.append(_LinkInfo(short_name, full_name, obj, doc, url)) + + @property + def classes(self): + return self._classes + + def _add_class(self, short_name, full_name, obj, doc, url): + self._classes.append(_LinkInfo(short_name, full_name, obj, doc, url)) + + @property + def functions(self): + return self._functions + + def _add_function(self, short_name, full_name, obj, doc, url): + self._functions.append(_LinkInfo(short_name, full_name, obj, doc, url)) + + @property + def other_members(self): + return self._other_members + + def _add_other_member(self, short_name, full_name, obj, doc): + self._other_members.append( + _OtherMemberInfo(short_name, full_name, obj, doc)) + + def collect_docs_for_module(self, parser_config): + """Collect information necessary specifically for a module's doc page. + + Mainly this is information about the members of the module. + + Args: + parser_config: An instance of ParserConfig. + """ + relative_path = os.path.relpath( + path='.', + start=os.path.dirname(documentation_path(self.full_name)) or '.') + + member_names = parser_config.tree.get(self.full_name, []) + for name in member_names: + + if name in ['__builtins__', '__doc__', '__file__', + '__name__', '__path__', '__package__']: + continue + + member_full_name = self.full_name + '.' + name if self.full_name else name + member = parser_config.py_name_to_object(member_full_name) + + member_doc = _parse_md_docstring(member, relative_path, + parser_config.reference_resolver) + + url = parser_config.reference_resolver.reference_to_url( + member_full_name, relative_path) + + if tf_inspect.ismodule(member): + self._add_module(name, member_full_name, member, member_doc, url) + + elif tf_inspect.isclass(member): + self._add_class(name, member_full_name, member, member_doc, url) + + elif tf_inspect.isfunction(member): + self._add_function(name, member_full_name, member, member_doc, url) + + else: + self._add_other_member(name, member_full_name, member, member_doc) -def _generate_markdown_for_module(full_name, duplicate_names, module, - duplicate_of, index, tree): - """Generate Markdown docs for a module. +class ParserConfig(object): + """Stores all indexes required to parse the docs.""" - This function creates a documentation page for a module. It uses the - module name as the title, followed by a list of duplicate - names (if there are any), the Markdown formatted docstring of the - class, and a list of links to all members of this module. + def __init__(self, reference_resolver, duplicates, duplicate_of, tree, index, + reverse_index, guide_index, base_dir): + """Object with the common config for docs_for_object() calls. - Args: - full_name: The preferred name of the module. Used in the title. Must not - be present in `duplicate_of` (master names never are). - duplicate_names: A sorted list of alternative names (incl. `full_name`). - module: The python object referenced by `full_name`. 
- duplicate_of: A map of duplicate full names to master names. Used to resolve - @{symbol} references in the docstrings. - index: A map from full names to python object references. - tree: A map from full names to the names of all documentable child objects. + Args: + reference_resolver: An instance of ReferenceResolver. + duplicates: A `dict` mapping fully qualified names to a set of all + aliases of this name. This is used to automatically generate a list of + all aliases for each name. + duplicate_of: A map from duplicate names to preferred names of API + symbols. + tree: A `dict` mapping a fully qualified name to the names of all its + members. Used to populate the members section of a class or module page. + index: A `dict` mapping full names to objects. + reverse_index: A `dict` mapping object ids to full names. - Returns: - A string that can be written to a documentation file for this module. - """ - relative_path = os.path.relpath( - os.path.dirname(documentation_path(full_name)) or '.', '.') - docstring = _md_docstring(module, relative_path, duplicate_of) - if duplicate_names: - aliases = '\n'.join(['### Module `%s`' % name for name in duplicate_names]) - aliases += '\n\n' - else: - aliases = '' + guide_index: A `dict` mapping symbol name strings to objects with a + `make_md_link()` method. - member_names = tree.get(full_name, []) + base_dir: A base path that is stripped from file locations written to the + docs. + """ + self.reference_resolver = reference_resolver + self.duplicates = duplicates + self.duplicate_of = duplicate_of + self.tree = tree + self.reverse_index = reverse_index + self.index = index + self.guide_index = guide_index + self.base_dir = base_dir + self.defined_in_prefix = 'tensorflow/' + self.code_url_prefix = ( + 'https://www.tensorflow.org/code/tensorflow/') # pylint: disable=line-too-long - # Make links to all members. - member_links = [] - for name in member_names: - member_full_name = full_name + '.' + name if full_name else name - member = index[member_full_name] - - if inspect.isclass(member): - link_text = 'class ' + name - elif inspect.isfunction(member): - link_text = name + _generate_signature(member) - else: - link_text = name - - member_links.append(_markdown_link(link_text, member_full_name, - relative_path, duplicate_of)) - - # TODO(deannarubin): Make this list into a table and add the brief docstring. - # (use _get_brief_docstring) - - return '# Module `%s`\n\n%s%s\n\n## Members\n\n%s' % ( - full_name, aliases, docstring, '\n\n'.join(member_links)) + def py_name_to_object(self, full_name): + """Return the Python object for a Python symbol name.""" + return self.index[full_name] -_CODE_URL_PREFIX = ( - 'https://www.tensorflow.org/code/') +def docs_for_object(full_name, py_object, parser_config): + """Return a PageInfo object describing a given object from the TF API. - -def generate_markdown(full_name, py_object, - duplicate_of, duplicates, - index, tree, base_dir): - """Generate Markdown docs for a given object that's part of the TF API. - - This function uses _md_docstring to obtain the docs pertaining to + This function uses _parse_md_docstring to parse the docs pertaining to `object`. - This function resolves '@symbol' references in the docstrings into links to + This function resolves '@{symbol}' references in the docstrings into links to the appropriate location. It also adds a list of alternative names for the symbol automatically. 
@@ -538,28 +1296,16 @@ def generate_markdown(full_name, py_object, `documentation_path`, and that relative links to files within the documentation are resolvable. - The output is Markdown that can be written to file and published. - Args: - full_name: The fully qualified name (excl. "tf.") of the symbol to be + full_name: The fully qualified name of the symbol to be documented. py_object: The Python object to be documented. Its documentation is sourced from `py_object`'s docstring. - duplicate_of: A `dict` mapping fully qualified names to "master" names. This - is used to resolve "@{symbol}" references to the "master" name. - duplicates: A `dict` mapping fully qualified names to a set of all - aliases of this name. This is used to automatically generate a list of all - aliases for each name. - index: A `dict` mapping fully qualified names to the corresponding Python - objects. Used to produce docs for child objects, and to check the validity - of "@{symbol}" references. - tree: A `dict` mapping a fully qualified name to the names of all its - members. Used to populate the members section of a class or module page. - base_dir: A base path that is stripped from file locations written to the - docs. + parser_config: A ParserConfig object. Returns: - A string containing the Markdown docs for `py_object`. + Either a `_FunctionPageInfo`, `_ClassPageInfo`, or a `_ModulePageInfo` + depending on the type of the python object being documented. Raises: RuntimeError: If an object is encountered for which we don't know how @@ -567,47 +1313,183 @@ def generate_markdown(full_name, py_object, """ # Which other aliases exist for the object referenced by full_name? - master_name = duplicate_of.get(full_name, full_name) - duplicate_names = duplicates.get(master_name, [full_name]) + master_name = parser_config.reference_resolver.py_master_name(full_name) + duplicate_names = parser_config.duplicates.get(master_name, [full_name]) # TODO(wicke): Once other pieces are ready, enable this also for partials. - if (inspect.ismethod(py_object) or inspect.isfunction(py_object) or + if (tf_inspect.ismethod(py_object) or tf_inspect.isfunction(py_object) or # Some methods in classes from extensions come in as routines. 
-      inspect.isroutine(py_object)):
-    markdown = _generate_markdown_for_function(master_name, duplicate_names,
-                                               py_object, duplicate_of)
-  elif inspect.isclass(py_object):
-    markdown = _generate_markdown_for_class(master_name, duplicate_names,
-                                            py_object, duplicate_of,
-                                            index, tree)
-  elif inspect.ismodule(py_object):
-    markdown = _generate_markdown_for_module(master_name, duplicate_names,
-                                             py_object, duplicate_of,
-                                             index, tree)
+      tf_inspect.isroutine(py_object)):
+    page_info = _FunctionPageInfo(master_name)
+    page_info.set_signature(py_object, parser_config.reverse_index)
+
+  elif tf_inspect.isclass(py_object):
+    page_info = _ClassPageInfo(master_name)
+    page_info.collect_docs_for_class(py_object, parser_config)
+
+  elif tf_inspect.ismodule(py_object):
+    page_info = _ModulePageInfo(master_name)
+    page_info.collect_docs_for_module(parser_config)
+
   else:
     raise RuntimeError('Cannot make docs for object %s: %r' % (full_name,
                                                                py_object))
 
-  # Every page gets a note on the bottom about where this object is defined
+  relative_path = os.path.relpath(
+      path='.', start=os.path.dirname(documentation_path(full_name)) or '.')
+
+  page_info.set_doc(_parse_md_docstring(
+      py_object, relative_path, parser_config.reference_resolver))
+
+  page_info.set_aliases(duplicate_names)
+
+  page_info.set_guides(_get_guides_markdown(
+      duplicate_names, parser_config.guide_index, relative_path))
+
+  page_info.set_defined_in(_get_defined_in(py_object, parser_config))
+
+  return page_info
+
+
+class _PythonBuiltin(object):
+  """This class indicates that the object in question is a python builtin.
+
+  This can be used for the `defined_in` slot of the `PageInfo` objects.
+  """
+
+  def is_builtin(self):
+    return True
+
+  def is_python_file(self):
+    return False
+
+  def is_generated_file(self):
+    return False
+
+  def __str__(self):
+    return 'This is an alias for a Python built-in.\n\n'
+
+
+class _PythonFile(object):
+  """This class indicates that the object is defined in a regular python file.
+
+  This can be used for the `defined_in` slot of the `PageInfo` objects.
+  """
+
+  def __init__(self, path, parser_config):
+    self.path = path
+    self.path_prefix = parser_config.defined_in_prefix
+    self.code_url_prefix = parser_config.code_url_prefix
+
+  def is_builtin(self):
+    return False
+
+  def is_python_file(self):
+    return True
+
+  def is_generated_file(self):
+    return False
+
+  def __str__(self):
+    return 'Defined in [`{prefix}{path}`]({code_prefix}{path}).\n\n'.format(
+        path=self.path, prefix=self.path_prefix,
+        code_prefix=self.code_url_prefix)
+
+
+class _ProtoFile(object):
+  """This class indicates that the object is defined in a .proto file.
+
+  This can be used for the `defined_in` slot of the `PageInfo` objects.
+  """
+
+  def __init__(self, path, parser_config):
+    self.path = path
+    self.path_prefix = parser_config.defined_in_prefix
+    self.code_url_prefix = parser_config.code_url_prefix
+
+  def is_builtin(self):
+    return False
+
+  def is_python_file(self):
+    return False
+
+  def is_generated_file(self):
+    return False
+
+  def __str__(self):
+    return 'Defined in [`{prefix}{path}`]({code_prefix}{path}).\n\n'.format(
+        path=self.path, prefix=self.path_prefix,
+        code_prefix=self.code_url_prefix)
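For orientation, this is roughly the note that `_PythonFile.__str__` renders; the path is a made-up example, and the prefix strings mirror the `ParserConfig` defaults set earlier in this patch:

```python
# Hypothetical example values; `defined_in_prefix` and `code_url_prefix`
# default to these strings in ParserConfig above.
path = 'python/ops/nn.py'
prefix = 'tensorflow/'
code_prefix = 'https://www.tensorflow.org/code/tensorflow/'

print('Defined in [`{prefix}{path}`]({code_prefix}{path}).\n\n'.format(
    path=path, prefix=prefix, code_prefix=code_prefix))
# Defined in [`tensorflow/python/ops/nn.py`](https://www.tensorflow.org/code/tensorflow/python/ops/nn.py).
```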
+ """ + + def __init__(self, path, parser_config): + self.path = path + self.path_prefix = parser_config.defined_in_prefix + + def is_builtin(self): + return False + + def is_python_file(self): + return False + + def is_generated_file(self): + return True + + def __str__(self): + return 'Defined in `%s%s`.\n\n' % (self.path_prefix, self.path) + + +def _get_defined_in(py_object, parser_config): + """Returns a description of where the passed in python object was defined. + + Arguments: + py_object: The Python object. + parser_config: A ParserConfig object. + + Returns: + Either a `_PythonBuiltin`, `_PythonFile`, or a `_GeneratedFile` + """ + # Every page gets a note about where this object is defined # TODO(wicke): If py_object is decorated, get the decorated object instead. # TODO(wicke): Only use decorators that support this in TF. try: - path = os.path.relpath(inspect.getfile(py_object), base_dir) - - # TODO(wicke): If this is a generated file, point to the source instead. - - # Never include links outside this code base. - if not path.startswith('..'): - markdown += '\n\nDefined in [`%s`](%s%s).\n\n' % ( - path, _CODE_URL_PREFIX, path) + path = os.path.relpath(path=tf_inspect.getfile(py_object), + start=parser_config.base_dir) except TypeError: # getfile throws TypeError if py_object is a builtin. - markdown += '\n\nThis is an alias for a Python built-in.' + return _PythonBuiltin() - return markdown + # TODO(wicke): If this is a generated file, link to the source instead. + # TODO(wicke): Move all generated files to a generated/ directory. + # TODO(wicke): And make their source file predictable from the file name. + + # In case this is compiled, point to the original + if path.endswith('.pyc'): + path = path[:-1] + + # Never include links outside this code base. + if path.startswith('..'): + return None + + if re.match(r'.*/gen_[^/]*\.py$', path): + return _GeneratedFile(path, parser_config) + elif re.match(r'.*_pb2\.py$', path): + # The _pb2.py files all appear right next to their defining .proto file. + return _ProtoFile(path[:-7] + '.proto', parser_config) + else: + return _PythonFile(path, parser_config) -def generate_global_index(library_name, root_name, index, duplicate_of): +# TODO(markdaoust): This should just parse, pretty_docs should generate the md. +def generate_global_index(library_name, index, reference_resolver): """Given a dict of full names to python objects, generate an index page. The index page generated contains a list of links for all symbols in `index` @@ -615,37 +1497,31 @@ def generate_global_index(library_name, root_name, index, duplicate_of): Args: library_name: The name for the documented library to use in the title. - root_name: The name to use for the root module. index: A dict mapping full names to python objects. - duplicate_of: A map of duplicate names to preferred names. + reference_resolver: An instance of ReferenceResolver. Returns: A string containing an index page as Markdown. """ symbol_links = [] for full_name, py_object in six.iteritems(index): - index_name = full_name or root_name - if (inspect.ismodule(py_object) or inspect.isfunction(py_object) or - inspect.isclass(py_object)): + if (tf_inspect.ismodule(py_object) or tf_inspect.isfunction(py_object) or + tf_inspect.isclass(py_object)): # In Python 3, unbound methods are functions, so eliminate those. 
diff --git a/tensorflow/tools/docs/parser_test.py b/tensorflow/tools/docs/parser_test.py
index 521e2d4ed3b..3e02160130f 100644
--- a/tensorflow/tools/docs/parser_test.py
+++ b/tensorflow/tools/docs/parser_test.py
@@ -19,19 +19,14 @@ from __future__ import division
 from __future__ import print_function
 
 import functools
-import inspect
 import os
 import sys
 
 from tensorflow.python.platform import googletest
+from tensorflow.python.util import tf_inspect
 from tensorflow.tools.docs import parser
 
 
-def test_function_for_markdown_reference(unused_arg):
-  """Docstring with reference to @{test_function}."""
-  pass
-
-
 def test_function(unused_arg, unused_kwarg='default'):
   """Docstring for test function."""
   pass
@@ -42,19 +37,6 @@ def test_function_with_args_kwargs(unused_arg, *unused_args, **unused_kwargs):
   pass
 
 
-def test_function_with_fancy_docstring(arg):
-  """Function with a fancy docstring.
-
-  Args:
-    arg: An argument.
-
-  Returns:
-    arg: the input, and
-    arg: the input, again.
-  """
-  return arg, arg
-
-
 class TestClass(object):
   """Docstring for TestClass itself."""
 
@@ -74,26 +56,70 @@ class TestClass(object):
   CLASS_MEMBER = 'a class member'
 
 
+class DummyVisitor(object):
+
+  def __init__(self, index, duplicate_of):
+    self.index = index
+    self.duplicate_of = duplicate_of
+
+
 class ParserTest(googletest.TestCase):
 
   def test_documentation_path(self):
     self.assertEqual('test.md', parser.documentation_path('test'))
     self.assertEqual('test/module.md', parser.documentation_path('test.module'))
 
-  def test_documentation_path_empty(self):
-    self.assertEqual('index.md', parser.documentation_path(''))
-
   def test_replace_references(self):
-    string = 'A @{reference}, another @{tf.reference}, and a @{third}.'
- duplicate_of = {'third': 'fourth'} - result = parser.replace_references(string, '../..', duplicate_of) + class HasOneMember(object): + + def foo(self): + pass + + string = ('A @{tf.reference}, another @{tf.reference}, ' + 'a member @{tf.reference.foo}, and a @{tf.third}.') + duplicate_of = {'tf.third': 'tf.fourth'} + index = {'tf.reference': HasOneMember, + 'tf.reference.foo': HasOneMember.foo, + 'tf.third': HasOneMember, + 'tf.fourth': HasOneMember} + + visitor = DummyVisitor(index, duplicate_of) + + reference_resolver = parser.ReferenceResolver.from_visitor( + visitor=visitor, doc_index={}, py_module_names=['tf']) + + result = reference_resolver.replace_references(string, '../..') self.assertEqual( - 'A [`reference`](../../reference.md), another ' - '[`tf.reference`](../../reference.md), ' - 'and a [`third`](../../fourth.md).', + 'A [`tf.reference`](../../tf/reference.md), another ' + '[`tf.reference`](../../tf/reference.md), ' + 'a member [`tf.reference.foo`](../../tf/reference.md#foo), ' + 'and a [`tf.third`](../../tf/fourth.md).', result) - def test_generate_markdown_for_class(self): + def test_doc_replace_references(self): + string = '@{$doc1} @{$doc1#abc} @{$doc1$link} @{$doc1#def$zelda} @{$do/c2}' + + class DocInfo(object): + pass + doc1 = DocInfo() + doc1.title = 'Title1' + doc1.url = 'URL1' + doc2 = DocInfo() + doc2.title = 'Two words' + doc2.url = 'somewhere/else' + doc_index = {'doc1': doc1, 'do/c2': doc2} + + visitor = DummyVisitor(index={}, duplicate_of={}) + + reference_resolver = parser.ReferenceResolver.from_visitor( + visitor=visitor, doc_index=doc_index, py_module_names=['tf']) + result = reference_resolver.replace_references(string, 'python') + self.assertEqual( + '[Title1](../URL1) [Title1](../URL1#abc) [link](../URL1) ' + '[zelda](../URL1#def) [Two words](../somewhere/else)', + result) + + def test_docs_for_class(self): index = { 'TestClass': TestClass, @@ -103,32 +129,48 @@ class ParserTest(googletest.TestCase): 'TestClass.CLASS_MEMBER': TestClass.CLASS_MEMBER } + visitor = DummyVisitor(index=index, duplicate_of={}) + + reference_resolver = parser.ReferenceResolver.from_visitor( + visitor=visitor, doc_index={}, py_module_names=['tf']) + tree = { 'TestClass': ['a_method', 'a_property', 'ChildClass', 'CLASS_MEMBER'] } + parser_config = parser.ParserConfig( + reference_resolver=reference_resolver, + duplicates={}, + duplicate_of={}, + tree=tree, + index=index, + reverse_index={}, + guide_index={}, + base_dir='/') - docs = parser.generate_markdown(full_name='TestClass', py_object=TestClass, - duplicate_of={}, duplicates={}, - index=index, tree=tree, base_dir='/') + page_info = parser.docs_for_object( + full_name='TestClass', py_object=TestClass, parser_config=parser_config) - # Make sure all required docstrings are present. - self.assertTrue(inspect.getdoc(TestClass) in docs) - self.assertTrue(inspect.getdoc(TestClass.a_method) in docs) - self.assertTrue(inspect.getdoc(TestClass.a_property) in docs) + # Make sure the brief docstring is present + self.assertEqual( + tf_inspect.getdoc(TestClass).split('\n')[0], page_info.doc.brief) + + # Make sure the method is present + self.assertEqual(TestClass.a_method, page_info.methods[0].obj) # Make sure that the signature is extracted properly and omits self. 
- self.assertTrue('a_method(arg=\'default\')' in docs) + self.assertEqual(["arg='default'"], page_info.methods[0].signature) + + # Make sure the property is present + self.assertIs(TestClass.a_property, page_info.properties[0].obj) # Make sure there is a link to the child class and it points the right way. - self.assertTrue('[`class ChildClass`](./TestClass/ChildClass.md)' in docs) - - # Make sure CLASS_MEMBER is mentioned. - self.assertTrue('CLASS_MEMBER' in docs) + self.assertIs(TestClass.ChildClass, page_info.classes[0].obj) # Make sure this file is contained as the definition location. - self.assertTrue(os.path.relpath(__file__, '/') in docs) + self.assertEqual(os.path.relpath(__file__, '/'), page_info.defined_in.path) - def test_generate_markdown_for_module(self): + def test_docs_for_module(self): + # Get the current module. module = sys.modules[__name__] index = { @@ -139,127 +181,180 @@ class ParserTest(googletest.TestCase): 'TestModule.TestClass': TestClass, } + visitor = DummyVisitor(index=index, duplicate_of={}) + + reference_resolver = parser.ReferenceResolver.from_visitor( + visitor=visitor, doc_index={}, py_module_names=['tf']) + tree = { 'TestModule': ['TestClass', 'test_function', 'test_function_with_args_kwargs'] } + parser_config = parser.ParserConfig( + reference_resolver=reference_resolver, + duplicates={}, + duplicate_of={}, + tree=tree, + index=index, + reverse_index={}, + guide_index={}, + base_dir='/') - docs = parser.generate_markdown(full_name='TestModule', py_object=module, - duplicate_of={}, duplicates={}, - index=index, tree=tree, base_dir='/') + page_info = parser.docs_for_object( + full_name='TestModule', py_object=module, parser_config=parser_config) - # Make sure all required docstrings are present. - self.assertTrue(inspect.getdoc(module) in docs) + # Make sure the brief docstring is present + self.assertEqual(tf_inspect.getdoc(module).split('\n')[0], + page_info.doc.brief) - # Make sure that links to the members are there (not asserting on exact link - # text for functions). - self.assertTrue('./TestModule/test_function.md' in docs) - self.assertTrue('./TestModule/test_function_with_args_kwargs.md' in docs) + # Make sure that the members are there + funcs = {f_info.obj for f_info in page_info.functions} + self.assertEqual({test_function, test_function_with_args_kwargs}, funcs) - # Make sure there is a link to the child class and it points the right way. - self.assertTrue('[`class TestClass`](./TestModule/TestClass.md)' in docs) + classes = {cls_info.obj for cls_info in page_info.classes} + self.assertEqual({TestClass}, classes) # Make sure this file is contained as the definition location. 
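Each page-info also records where the documented object was defined, via `page_info.defined_in`; for these tests that is this very test file, relative to the `base_dir='/'` given to `ParserConfig`. A short sketch mirroring the assertion that follows (no API beyond what the diff itself shows):

```python
import os

# defined_in.path is the documented object's source file, relative to
# parser_config.base_dir ('/' in these tests).
assert page_info.defined_in.path == os.path.relpath(__file__, '/')
```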
- self.assertTrue(os.path.relpath(__file__, '/') in docs) + self.assertEqual(os.path.relpath(__file__, '/'), page_info.defined_in.path) - def test_generate_markdown_for_function(self): + def test_docs_for_function(self): index = { 'test_function': test_function } + visitor = DummyVisitor(index=index, duplicate_of={}) + + reference_resolver = parser.ReferenceResolver.from_visitor( + visitor=visitor, doc_index={}, py_module_names=['tf']) + tree = { '': ['test_function'] } + parser_config = parser.ParserConfig( + reference_resolver=reference_resolver, + duplicates={}, + duplicate_of={}, + tree=tree, + index=index, + reverse_index={}, + guide_index={}, + base_dir='/') - docs = parser.generate_markdown(full_name='test_function', - py_object=test_function, - duplicate_of={}, duplicates={}, - index=index, tree=tree, base_dir='/') + page_info = parser.docs_for_object( + full_name='test_function', + py_object=test_function, + parser_config=parser_config) - # Make sure docstring shows up. - self.assertTrue(inspect.getdoc(test_function) in docs) + # Make sure the brief docstring is present + self.assertEqual( + tf_inspect.getdoc(test_function).split('\n')[0], page_info.doc.brief) # Make sure the extracted signature is good. - self.assertTrue( - 'test_function(unused_arg, unused_kwarg=\'default\')' in docs) + self.assertEqual(['unused_arg', "unused_kwarg='default'"], + page_info.signature) # Make sure this file is contained as the definition location. - self.assertTrue(os.path.relpath(__file__, '/') in docs) + self.assertEqual(os.path.relpath(__file__, '/'), page_info.defined_in.path) - def test_generate_markdown_for_function_with_kwargs(self): + def test_docs_for_function_with_kwargs(self): index = { 'test_function_with_args_kwargs': test_function_with_args_kwargs } + visitor = DummyVisitor(index=index, duplicate_of={}) + + reference_resolver = parser.ReferenceResolver.from_visitor( + visitor=visitor, doc_index={}, py_module_names=['tf']) + tree = { '': ['test_function_with_args_kwargs'] } + parser_config = parser.ParserConfig( + reference_resolver=reference_resolver, + duplicates={}, + duplicate_of={}, + tree=tree, + index=index, + reverse_index={}, + guide_index={}, + base_dir='/') - docs = parser.generate_markdown(full_name='test_function_with_args_kwargs', - py_object=test_function_with_args_kwargs, - duplicate_of={}, duplicates={}, - index=index, tree=tree, base_dir='/') + page_info = parser.docs_for_object( + full_name='test_function_with_args_kwargs', + py_object=test_function_with_args_kwargs, + parser_config=parser_config) - # Make sure docstring shows up. - self.assertTrue(inspect.getdoc(test_function_with_args_kwargs) in docs) + # Make sure the brief docstring is present + self.assertEqual( + tf_inspect.getdoc(test_function_with_args_kwargs).split('\n')[0], + page_info.doc.brief) # Make sure the extracted signature is good. - self.assertTrue( - 'test_function_with_args_kwargs(unused_arg,' - ' *unused_args, **unused_kwargs)' in docs) + self.assertEqual(['unused_arg', '*unused_args', '**unused_kwargs'], + page_info.signature) - def test_references_replaced_in_generated_markdown(self): + def test_parse_md_docstring(self): + + def test_function_with_fancy_docstring(arg): + """Function with a fancy docstring. + + And a bunch of references: @{tf.reference}, another @{tf.reference}, + a member @{tf.reference.foo}, and a @{tf.third}. + + Args: + arg: An argument. + + Raises: + an exception + + Returns: + arg: the input, and + arg: the input, again. 
+ + @compatibility(numpy) + NumPy has nothing as awesome as this function. + @end_compatibility + + @compatibility(theano) + Theano has nothing as awesome as this function. + + Check it out. + @end_compatibility + + """ + return arg, arg + + class HasOneMember(object): + + def foo(self): + pass + + duplicate_of = {'tf.third': 'tf.fourth'} index = { - 'test_function_for_markdown_reference': - test_function_for_markdown_reference + 'tf.fancy': test_function_with_fancy_docstring, + 'tf.reference': HasOneMember, + 'tf.reference.foo': HasOneMember.foo, + 'tf.third': HasOneMember, + 'tf.fourth': HasOneMember } - tree = { - '': ['test_function_for_markdown_reference'] - } + visitor = DummyVisitor(index=index, duplicate_of=duplicate_of) - docs = parser.generate_markdown( - full_name='test_function_for_markdown_reference', - py_object=test_function_for_markdown_reference, - duplicate_of={}, duplicates={}, - index=index, tree=tree, base_dir='/') + reference_resolver = parser.ReferenceResolver.from_visitor( + visitor=visitor, doc_index={}, py_module_names=['tf']) - # Make sure docstring shows up and is properly processed. - expected_docs = parser.replace_references( - inspect.getdoc(test_function_for_markdown_reference), - relative_path_to_root='.', duplicate_of={}) + doc_info = parser._parse_md_docstring(test_function_with_fancy_docstring, + '../..', reference_resolver) - self.assertTrue(expected_docs in docs) + self.assertNotIn('@', doc_info.docstring) + self.assertNotIn('compatibility', doc_info.docstring) + self.assertNotIn('Raises:', doc_info.docstring) - def test_docstring_special_section(self): - index = { - 'test_function': test_function_with_fancy_docstring - } + self.assertEqual(len(doc_info.function_details), 3) + self.assertEqual(set(doc_info.compatibility.keys()), {'numpy', 'theano'}) - tree = { - '': 'test_function' - } - - docs = parser.generate_markdown( - full_name='test_function', - py_object=test_function_with_fancy_docstring, - duplicate_of={}, duplicates={}, - index=index, tree=tree, base_dir='/') - - expected = '\n'.join([ - 'Function with a fancy docstring.', - '', - '#### Args:', - '', - '* `arg`: An argument.', - '', - '', - '#### Returns:', - '', - '* `arg`: the input, and', - '* `arg`: the input, again.', - '']) - self.assertTrue(expected in docs) + self.assertEqual(doc_info.compatibility['numpy'], + 'NumPy has nothing as awesome as this function.\n') def test_generate_index(self): module = sys.modules[__name__] @@ -273,27 +368,30 @@ class ParserTest(googletest.TestCase): 'TestModule.TestClass.a_property': TestClass.a_property, 'TestModule.TestClass.ChildClass': TestClass.ChildClass, } - duplicate_of = { 'TestModule.test_function': 'test_function' } - docs = parser.generate_global_index('TestLibrary', 'test', - index=index, - duplicate_of=duplicate_of) + visitor = DummyVisitor(index=index, duplicate_of=duplicate_of) + + reference_resolver = parser.ReferenceResolver.from_visitor( + visitor=visitor, doc_index={}, py_module_names=['tf']) + + docs = parser.generate_global_index('TestLibrary', index=index, + reference_resolver=reference_resolver) # Make sure duplicates and non-top-level symbols are in the index, but # methods and properties are not. 
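For context, the `generate_global_index` hunk earlier in this diff shows that the index page is plain markdown: a `# All symbols in <library>` heading followed by one `* <link>` bullet per symbol, with methods and properties skipped. A minimal sketch of the updated call, which now takes a `ReferenceResolver` in place of the old `duplicate_of` mapping (all names come from the test setup above):

```python
# generate_global_index emits a heading plus '* <link>' lines; duplicates
# and non-top-level symbols are listed, methods and properties are not.
docs = parser.generate_global_index(
    'TestLibrary', index=index, reference_resolver=reference_resolver)
print(docs.splitlines()[0])  # '# All symbols in TestLibrary'
```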
- self.assertTrue('a_method' not in docs) - self.assertTrue('a_property' not in docs) - self.assertTrue('TestModule.TestClass' in docs) - self.assertTrue('TestModule.TestClass.ChildClass' in docs) - self.assertTrue('TestModule.test_function' in docs) + self.assertNotIn('a_method', docs) + self.assertNotIn('a_property', docs) + self.assertIn('TestModule.TestClass', docs) + self.assertIn('TestModule.TestClass.ChildClass', docs) + self.assertIn('TestModule.test_function', docs) # Leading backtick to make sure it's included top-level. # This depends on formatting, but should be stable. - self.assertTrue('`test_function' in docs) + self.assertIn('`test_function', docs) - def test_argspec_for_functoos_partial(self): + def test_argspec_for_functools_partial(self): # pylint: disable=unused-argument def test_function_for_partial1(arg1, arg2, kwarg1=1, kwarg2=2): @@ -305,45 +403,117 @@ class ParserTest(googletest.TestCase): # pylint: disable=protected-access # Make sure everything works for regular functions. - expected = inspect.ArgSpec(['arg1', 'arg2', 'kwarg1', 'kwarg2'], None, None, - (1, 2)) + expected = tf_inspect.ArgSpec(['arg1', 'arg2', 'kwarg1', 'kwarg2'], None, + None, (1, 2)) self.assertEqual(expected, parser._get_arg_spec(test_function_for_partial1)) # Make sure doing nothing works. - expected = inspect.ArgSpec(['arg1', 'arg2', 'kwarg1', 'kwarg2'], None, None, - (1, 2)) + expected = tf_inspect.ArgSpec(['arg1', 'arg2', 'kwarg1', 'kwarg2'], None, + None, (1, 2)) partial = functools.partial(test_function_for_partial1) self.assertEqual(expected, parser._get_arg_spec(partial)) # Make sure setting args from the front works. - expected = inspect.ArgSpec(['arg2', 'kwarg1', 'kwarg2'], None, None, (1, 2)) + expected = tf_inspect.ArgSpec(['arg2', 'kwarg1', 'kwarg2'], None, None, + (1, 2)) partial = functools.partial(test_function_for_partial1, 1) self.assertEqual(expected, parser._get_arg_spec(partial)) - expected = inspect.ArgSpec(['kwarg2',], None, None, (2,)) + expected = tf_inspect.ArgSpec(['kwarg2',], None, None, (2,)) partial = functools.partial(test_function_for_partial1, 1, 2, 3) self.assertEqual(expected, parser._get_arg_spec(partial)) # Make sure setting kwargs works. - expected = inspect.ArgSpec(['arg1', 'arg2', 'kwarg2'], None, None, (2,)) + expected = tf_inspect.ArgSpec(['arg1', 'arg2', 'kwarg2'], None, None, (2,)) partial = functools.partial(test_function_for_partial1, kwarg1=0) self.assertEqual(expected, parser._get_arg_spec(partial)) - expected = inspect.ArgSpec(['arg1', 'arg2', 'kwarg1'], None, None, (1,)) + expected = tf_inspect.ArgSpec(['arg1', 'arg2', 'kwarg1'], None, None, (1,)) partial = functools.partial(test_function_for_partial1, kwarg2=0) self.assertEqual(expected, parser._get_arg_spec(partial)) - expected = inspect.ArgSpec(['arg1'], None, None, ()) + expected = tf_inspect.ArgSpec(['arg1'], None, None, ()) partial = functools.partial(test_function_for_partial1, arg2=0, kwarg1=0, kwarg2=0) self.assertEqual(expected, parser._get_arg_spec(partial)) # Make sure *args, *kwargs is accounted for. 
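`parser._get_arg_spec` unwraps `functools.partial` objects, dropping bound positionals and already-supplied keywords from the reported spec. A sketch of the `*args`/`**kwargs` case asserted just below; the signature of `test_function_for_partial2` is inferred from the expected `ArgSpec` (its definition is elided in this hunk):

```python
import functools

def test_function_for_partial2(arg1, arg2, *my_args, **my_kwargs):  # inferred
  pass

# Binding both positionals leaves only the varargs/varkw names:
partial = functools.partial(test_function_for_partial2, 0, 1)
spec = parser._get_arg_spec(partial)
# spec == tf_inspect.ArgSpec(args=[], varargs='my_args',
#                            keywords='my_kwargs', defaults=())
```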
- expected = inspect.ArgSpec([], 'my_args', 'my_kwargs', ()) + expected = tf_inspect.ArgSpec([], 'my_args', 'my_kwargs', ()) partial = functools.partial(test_function_for_partial2, 0, 1) self.assertEqual(expected, parser._get_arg_spec(partial)) # pylint: enable=protected-access + def testSaveReferenceResolver(self): + you_cant_serialize_this = object() + + duplicate_of = {'AClass': ['AClass2']} + doc_index = {'doc': you_cant_serialize_this} + is_class = { + 'tf': False, + 'tf.AClass': True, + 'tf.AClass2': True, + 'tf.function': False + } + is_module = { + 'tf': True, + 'tf.AClass': False, + 'tf.AClass2': False, + 'tf.function': False + } + py_module_names = ['tf', 'tfdbg'] + + resolver = parser.ReferenceResolver(duplicate_of, doc_index, is_class, + is_module, py_module_names) + + outdir = googletest.GetTempDir() + + filepath = os.path.join(outdir, 'resolver.json') + + resolver.to_json_file(filepath) + resolver2 = parser.ReferenceResolver.from_json_file(filepath, doc_index) + + # There are no __slots__, so all fields are visible in __dict__. + self.assertEqual(resolver.__dict__, resolver2.__dict__) + +RELU_DOC = """Computes rectified linear: `max(features, 0)` + +Args: + features: A `Tensor`. Must be one of the following types: `float32`, + `float64`, `int32`, `int64`, `uint8`, `int16`, `int8`, `uint16`, + `half`. + name: A name for the operation (optional) + +Returns: + A `Tensor`. Has the same type as `features` +""" + + +class TestParseFunctionDetails(googletest.TestCase): + + def testParseFunctionDetails(self): + docstring, function_details = parser._parse_function_details(RELU_DOC) + + self.assertEqual(len(function_details), 2) + args = function_details[0] + self.assertEqual(args.keyword, 'Args') + self.assertEmpty(args.header) + self.assertEqual(len(args.items), 2) + self.assertEqual(args.items[0][0], 'features') + self.assertEqual(args.items[1][0], 'name') + self.assertEqual(args.items[1][1], + ' A name for the operation (optional)\n\n') + returns = function_details[1] + self.assertEqual(returns.keyword, 'Returns') + + relu_doc_lines = RELU_DOC.split('\n') + self.assertEqual(docstring, relu_doc_lines[0] + '\n\n') + self.assertEqual(returns.header, relu_doc_lines[-2] + '\n') + + self.assertEqual( + RELU_DOC, + docstring + ''.join(str(detail) for detail in function_details)) + + if __name__ == '__main__': googletest.main() diff --git a/tensorflow/tools/docs/pretty_docs.py b/tensorflow/tools/docs/pretty_docs.py new file mode 100644 index 00000000000..365008c3f09 --- /dev/null +++ b/tensorflow/tools/docs/pretty_docs.py @@ -0,0 +1,344 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""A module for converting parsed doc content into markdown pages. + +The adjacent `parser` module creates `PageInfo` objects, containing all data +necessary to document an element of the TensorFlow API. 
+ +This module contains one public function, which handles the conversion of these +`PageInfo` objects into a markdown string: + + md_page = build_md_page(page_info) +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import itertools + + +def build_md_page(page_info): + """Given a PageInfo object, return markdown for the page. + + Args: + page_info: must be a `parser.FunctionPageInfo`, `parser.ClassPageInfo`, or + `parser.ModulePageInfo` + + Returns: + Markdown for the page + + Raises: + ValueError: if `page_info` is an instance of an unrecognized class + """ + if page_info.for_function(): + return _build_function_page(page_info) + + if page_info.for_class(): + return _build_class_page(page_info) + + if page_info.for_module(): + return _build_module_page(page_info) + + raise ValueError('Unknown Page Info Type: %s' % type(page_info)) + + +def _build_function_page(page_info): + """Given a FunctionPageInfo object, return the page as an md string.""" + parts = [_Metadata(page_info.full_name).build_html()] + parts.append('# %s\n\n' % page_info.full_name) + + if len(page_info.aliases) > 1: + parts.append('### Aliases:\n\n') + parts.extend('* `%s`\n' % name for name in page_info.aliases) + parts.append('\n') + + if page_info.signature is not None: + parts.append(_build_signature(page_info)) + + if page_info.defined_in: + parts.append('\n\n') + parts.append(str(page_info.defined_in)) + + parts.append(page_info.guides) + parts.append(page_info.doc.docstring) + parts.append(_build_function_details(page_info.doc.function_details)) + parts.append(_build_compatibility(page_info.doc.compatibility)) + + return ''.join(parts) + + +def _build_class_page(page_info): + """Given a ClassPageInfo object, return the page as an md string.""" + meta_data = _Metadata(page_info.full_name) + for item in itertools.chain( + page_info.classes, + page_info.properties, + page_info.methods, + page_info.other_members): + meta_data.append(item) + + parts = [meta_data.build_html()] + + parts.append('# {page_info.full_name}\n\n'.format(page_info=page_info)) + + parts.append('## Class `%s`\n\n' % page_info.full_name.split('.')[-1]) + if page_info.bases: + parts.append('Inherits From: ') + + link_template = '[`{short_name}`]({url})' + parts.append(', '.join( + link_template.format(**base.__dict__) for base in page_info.bases)) + + parts.append('\n\n') + + if len(page_info.aliases) > 1: + parts.append('### Aliases:\n\n') + parts.extend('* Class `%s`\n' % name for name in page_info.aliases) + parts.append('\n') + + if page_info.defined_in is not None: + parts.append('\n\n') + parts.append(str(page_info.defined_in)) + + parts.append(page_info.guides) + parts.append(page_info.doc.docstring) + parts.append(_build_function_details(page_info.doc.function_details)) + assert not page_info.doc.compatibility + parts.append('\n\n') + + if page_info.classes: + parts.append('## Child Classes\n') + + link_template = ('[`class {class_info.short_name}`]' + '({class_info.url})\n\n') + class_links = sorted( + link_template.format(class_info=class_info) + for class_info in page_info.classes) + + parts.extend(class_links) + + if page_info.properties: + parts.append('## Properties\n\n') + for prop_info in sorted(page_info.properties): + h3 = '<h3 id="{short_name}"><code>{short_name}</code></h3>\n\n' + parts.append(h3.format(short_name=prop_info.short_name)) + + parts.append(prop_info.doc.docstring) + parts.append(_build_function_details(prop_info.doc.function_details)) + assert not prop_info.doc.compatibility + parts.append('\n\n') + + parts.append('\n\n') + + if page_info.methods: + parts.append('## Methods\n\n') + # Sort the methods list, but make sure constructors come first. + constructors = ['__init__', '__new__'] + inits = [method for method in page_info.methods + if method.short_name in constructors] + others = [method for method in page_info.methods + if method.short_name not in constructors] + + for method_info in sorted(inits) + sorted(others): + h3 = ('<h3 id="{short_name}">' + '<code>{short_name}</code>' + '</h3>\n\n') + parts.append(h3.format(**method_info.__dict__)) + + if method_info.signature is not None: + parts.append(_build_signature(method_info)) + + parts.append(method_info.doc.docstring) + parts.append(_build_function_details(method_info.doc.function_details)) + parts.append(_build_compatibility(method_info.doc.compatibility)) + parts.append('\n\n') + parts.append('\n\n') + + if page_info.other_members: + parts.append('## Class Members\n\n') + + # TODO(markdaoust): Document the value of the members, + # at least for basic types. + + h3 = '<h3 id="{short_name}"><code>{short_name}</code></h3>\n\n' + others_member_headings = (h3.format(short_name=info.short_name) + for info in sorted(page_info.other_members)) + parts.extend(others_member_headings) + + return ''.join(parts) + + +def _build_module_page(page_info): + """Given a ModulePageInfo object, return the page as an md string.""" + meta_data = _Metadata(page_info.full_name) + + # Objects with their own pages are not added to the metadata list for the + # module, as the only thing on the module page is a link to the object's page. + for item in page_info.other_members: + meta_data.append(item) + + parts = [meta_data.build_html()] + + parts.append( + '# Module: {full_name}\n\n'.format(full_name=page_info.full_name)) + + if len(page_info.aliases) > 1: + parts.append('### Aliases:\n\n') + parts.extend('* Module `%s`\n' % name for name in page_info.aliases) + parts.append('\n') + + if page_info.defined_in is not None: + parts.append('\n\n') + parts.append(str(page_info.defined_in)) + + parts.append(page_info.doc.docstring) + parts.append('\n\n') + + if page_info.modules: + parts.append('## Modules\n\n') + template = '[`{short_name}`]({url}) module' + + for item in page_info.modules: + parts.append(template.format(**item.__dict__)) + + if item.doc.brief: + parts.append(': ' + item.doc.brief) + + parts.append('\n\n') + + if page_info.classes: + parts.append('## Classes\n\n') + template = '[`class {short_name}`]({url})' + + for item in page_info.classes: + parts.append(template.format(**item.__dict__)) + + if item.doc.brief: + parts.append(': ' + item.doc.brief) + + parts.append('\n\n') + + if page_info.functions: + parts.append('## Functions\n\n') + template = '[`{short_name}(...)`]({url})' + + for item in page_info.functions: + parts.append(template.format(**item.__dict__)) + + if item.doc.brief: + parts.append(': ' + item.doc.brief) + + parts.append('\n\n') + + if page_info.other_members: + # TODO(markdaoust): Document the value of the members, + # at least for basic types.
+ + parts.append('## Other Members\n\n') + + for item in page_info.other_members: + parts.append('`{short_name}`\n\n'.format(**item.__dict__)) + + return ''.join(parts) + + +def _build_signature(obj_info): + """Returns an md code block showing the function signature.""" + # Special case tf.range, since it has an optional first argument + if obj_info.full_name == 'tf.range': + return ( + '``` python\n' + "range(limit, delta=1, dtype=None, name='range')\n" + "range(start, limit, delta=1, dtype=None, name='range')\n" + '```\n\n') + + signature_template = '\n'.join([ + '``` python', + '{name}({sig})', + '```\n\n']) + + if not obj_info.signature: + sig = '' + elif len(obj_info.signature) == 1: + sig = obj_info.signature[0] + else: + sig = ',\n'.join(' %s' % sig_item for sig_item in obj_info.signature) + sig = '\n'+sig+'\n' + + return signature_template.format(name=obj_info.short_name, sig=sig) + + +def _build_compatibility(compatibility): + """Return the compatibility section as an md string.""" + parts = [] + sorted_keys = sorted(compatibility.keys()) + for key in sorted_keys: + + value = compatibility[key] + parts.append('\n\n#### %s compatibility\n%s\n' % (key, value)) + + return ''.join(parts) + + +def _build_function_details(function_details): + """Return the function details section as an md string.""" + parts = [] + for detail in function_details: + sub = [] + sub.append('#### ' + detail.keyword + ':\n\n') + sub.append(detail.header) + for key, value in detail.items: + sub.append('* `%s`:%s' % (key, value)) + parts.append(''.join(sub)) + + return '\n'.join(parts) + + +class _Metadata(object): + """A class for building a page's Metadata block. + + Attributes: + name: The name of the page being described by the Metadata block. + """ + + def __init__(self, name): + """Create a Metadata builder. + + Args: + name: The name of the page being described by the Metadata block. + """ + self.name = name + self._content = [] + + def append(self, item): + """Add an item from the page to the Metadata block. + + Args: + item: The parsed page section to add. + """ + self._content.append(item.short_name) + + def build_html(self): + """Return the Metadata block as an HTML string.""" + schema = 'http://developers.google.com/ReferenceObject' + parts = ['
<div itemscope itemtype="%s">' % schema] + + parts.append('<meta itemprop="name" content="%s" />' % self.name) + for item in self._content: + parts.append('<meta itemprop="property" content="%s" />' % item) + + parts.extend(['</div>
', '', '']) + + return '\n'.join(parts) diff --git a/tensorflow/tools/docs/py_guide_parser.py b/tensorflow/tools/docs/py_guide_parser.py new file mode 100644 index 00000000000..216353ecee3 --- /dev/null +++ b/tensorflow/tools/docs/py_guide_parser.py @@ -0,0 +1,98 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Library for operating on Python API Guide files.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import re + + +def md_files_in_dir(py_guide_src_dir): + """Returns a list of filename (full_path, base) pairs for guide files.""" + all_in_dir = [(os.path.join(py_guide_src_dir, f), f) + for f in os.listdir(py_guide_src_dir)] + return [(full, f) for full, f in all_in_dir + if os.path.isfile(full) and f.endswith('.md')] + + +class PyGuideParser(object): + """Simple parsing of a guide .md file. + + Descendants can override the process_*() functions (called by process()) + to either record information from the guide, or call replace_line() + to affect the return value of process(). + """ + + def __init__(self): + self._lines = None + + def process(self, full_path): + """Read and process the file at `full_path`.""" + md_string = open(full_path).read() + self._lines = md_string.split('\n') + seen = set() + + in_blockquote = False + for i, line in enumerate(self._lines): + if '```' in line: + in_blockquote = not in_blockquote + + if not in_blockquote and line.startswith('# '): + self.process_title(i, line[2:]) + elif not in_blockquote and line.startswith('## '): + section_title = line.strip()[3:] + existing_tag = re.search(' {([^}]+)} *$', line) + if existing_tag: + tag = existing_tag.group(1) + else: + tag = re.sub('[^a-zA-Z0-9]+', '_', section_title) + if tag in seen: + suffix = 0 + while True: + candidate = '%s_%d' % (tag, suffix) + if candidate not in seen: + tag = candidate + break + seen.add(tag) + self.process_section(i, section_title, tag) + + elif in_blockquote: + self.process_in_blockquote(i, line) + else: + self.process_line(i, line) + + ret = '\n'.join(self._lines) + self._lines = None + return ret + + def replace_line(self, line_number, line): + """Replace the contents of line numbered `line_number` with `line`.""" + self._lines[line_number] = line + + def process_title(self, line_number, title): + pass + + def process_section(self, line_number, section_title, tag): + pass + + def process_in_blockquote(self, line_number, line): + pass + + def process_line(self, line_number, line): + pass diff --git a/tensorflow/tools/docs/py_guide_parser_test.py b/tensorflow/tools/docs/py_guide_parser_test.py new file mode 100644 index 00000000000..168b0535a94 --- /dev/null +++ b/tensorflow/tools/docs/py_guide_parser_test.py @@ -0,0 +1,84 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for py_guide_parser.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +from tensorflow.python.platform import test +from tensorflow.tools.docs import py_guide_parser + + +class TestPyGuideParser(py_guide_parser.PyGuideParser): + + def __init__(self): + self.calls = [] + py_guide_parser.PyGuideParser.__init__(self) + + def process_title(self, line_number, title): + self.calls.append((line_number, 't', title)) + + def process_section(self, line_number, section_title, tag): + self.calls.append((line_number, 's', '%s : %s' % (section_title, tag))) + + def process_in_blockquote(self, line_number, line): + self.calls.append((line_number, 'b', line)) + self.replace_line(line_number, line + ' BQ') + + def process_line(self, line_number, line): + self.calls.append((line_number, 'l', line)) + + +class PyGuideParserTest(test.TestCase): + + def testBasics(self): + tmp = os.path.join(test.get_temp_dir(), 'py_guide_parser_test.md') + f = open(tmp, 'w') + f.write("""# a title +a line +## a section +```shell +in a blockquote +``` +out of blockquote +""") + f.close() + parser = TestPyGuideParser() + result = parser.process(tmp) + expected = """# a title +a line +## a section +```shell BQ +in a blockquote BQ +``` +out of blockquote +""" + self.assertEqual(expected, result) + expected = [(0, 't', 'a title'), + (1, 'l', 'a line'), + (2, 's', 'a section : a_section'), + (3, 'b', '```shell'), + (4, 'b', 'in a blockquote'), + (5, 'l', '```'), + (6, 'l', 'out of blockquote'), + (7, 'l', '')] + self.assertEqual(expected, parser.calls) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/tools/docs/tf-doxy_for_md-config b/tensorflow/tools/docs/tf-doxy_for_md-config deleted file mode 100644 index b7fd6e95076..00000000000 --- a/tensorflow/tools/docs/tf-doxy_for_md-config +++ /dev/null @@ -1,2280 +0,0 @@ -# Doxyfile 1.8.5 - -# This file describes the settings to be used by the documentation system -# doxygen (www.doxygen.org) for a project. -# -# All text after a double hash (##) is considered a comment and is placed in -# front of the TAG it is preceding. -# -# All text after a single hash (#) is considered a comment and will be ignored. -# The format is: -# TAG = value [value, ...] -# For lists, items can also be appended using: -# TAG += value [value, ...] -# Values that contain spaces should be placed between quotes (\" \"). - -#--------------------------------------------------------------------------- -# Project related configuration options -#--------------------------------------------------------------------------- - -# This tag specifies the encoding used for all characters in the config file -# that follow. The default is UTF-8 which is also the encoding used for all text -# before the first occurrence of this tag. 
Doxygen uses libiconv (or the iconv -# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv -# for the list of possible encodings. -# The default value is: UTF-8. - -DOXYFILE_ENCODING = UTF-8 - -# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by -# double-quotes, unless you are using Doxywizard) that should identify the -# project for which the documentation is generated. This name is used in the -# title of most generated pages and in a few other places. -# The default value is: My Project. - -PROJECT_NAME = "TensorFlow" - -# The PROJECT_NUMBER tag can be used to enter a project or revision number. This -# could be handy for archiving the generated documentation or if some version -# control system is used. - -PROJECT_NUMBER = 0.0.0 - -# Using the PROJECT_BRIEF tag one can provide an optional one line description -# for a project that appears at the top of each page and should give viewer a -# quick idea about the purpose of the project. Keep the description short. - -PROJECT_BRIEF = - -# With the PROJECT_LOGO tag one can specify an logo or icon that is included in -# the documentation. The maximum height of the logo should not exceed 55 pixels -# and the maximum width should not exceed 200 pixels. Doxygen will copy the logo -# to the output directory. - -PROJECT_LOGO = - -# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path -# into which the generated documentation will be written. If a relative path is -# entered, it will be relative to the location where doxygen was started. If -# left blank the current directory will be used. - -OUTPUT_DIRECTORY = /tmp/tensorflow-docs/ - -# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create 4096 sub- -# directories (in 2 levels) under the output directory of each output format and -# will distribute the generated files over these directories. Enabling this -# option can be useful when feeding doxygen a huge amount of source files, where -# putting all generated files in the same directory would otherwise causes -# performance problems for the file system. -# The default value is: NO. - -CREATE_SUBDIRS = NO - -# The OUTPUT_LANGUAGE tag is used to specify the language in which all -# documentation generated by doxygen is written. Doxygen will use this -# information to generate all constant output in the proper language. -# Possible values are: Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese- -# Traditional, Croatian, Czech, Danish, Dutch, English, Esperanto, Farsi, -# Finnish, French, German, Greek, Hungarian, Italian, Japanese, Japanese-en, -# Korean, Korean-en, Latvian, Norwegian, Macedonian, Persian, Polish, -# Portuguese, Romanian, Russian, Serbian, Slovak, Slovene, Spanish, Swedish, -# Turkish, Ukrainian and Vietnamese. -# The default value is: English. - -OUTPUT_LANGUAGE = English - -# If the BRIEF_MEMBER_DESC tag is set to YES doxygen will include brief member -# descriptions after the members that are listed in the file and class -# documentation (similar to Javadoc). Set to NO to disable this. -# The default value is: YES. - -BRIEF_MEMBER_DESC = YES - -# If the REPEAT_BRIEF tag is set to YES doxygen will prepend the brief -# description of a member or function before the detailed description -# -# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the -# brief descriptions will be completely suppressed. -# The default value is: YES. 
- -REPEAT_BRIEF = YES - -# This tag implements a quasi-intelligent brief description abbreviator that is -# used to form the text in various listings. Each string in this list, if found -# as the leading text of the brief description, will be stripped from the text -# and the result, after processing the whole list, is used as the annotated -# text. Otherwise, the brief description is used as-is. If left blank, the -# following values are used ($name is automatically replaced with the name of -# the entity):The $name class, The $name widget, The $name file, is, provides, -# specifies, contains, represents, a, an and the. - -ABBREVIATE_BRIEF = - -# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then -# doxygen will generate a detailed section even if there is only a brief -# description. -# The default value is: NO. - -ALWAYS_DETAILED_SEC = NO - -# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all -# inherited members of a class in the documentation of that class as if those -# members were ordinary class members. Constructors, destructors and assignment -# operators of the base classes will not be shown. -# The default value is: NO. - -INLINE_INHERITED_MEMB = NO - -# If the FULL_PATH_NAMES tag is set to YES doxygen will prepend the full path -# before files name in the file list and in the header files. If set to NO the -# shortest path that makes the file name unique will be used -# The default value is: YES. - -FULL_PATH_NAMES = YES - -# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. -# Stripping is only done if one of the specified strings matches the left-hand -# part of the path. The tag can be used to show relative paths in the file list. -# If left blank the directory from which doxygen is run is used as the path to -# strip. -# -# Note that you can specify absolute paths here, but also relative paths, which -# will be relative from the directory where doxygen is started. -# This tag requires that the tag FULL_PATH_NAMES is set to YES. - -STRIP_FROM_PATH = - -# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the -# path mentioned in the documentation of a class, which tells the reader which -# header file to include in order to use a class. If left blank only the name of -# the header file containing the class definition is used. Otherwise one should -# specify the list of include paths that are normally passed to the compiler -# using the -I flag. - -STRIP_FROM_INC_PATH = - -# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but -# less readable) file names. This can be useful is your file systems doesn't -# support long names like on DOS, Mac, or CD-ROM. -# The default value is: NO. - -SHORT_NAMES = NO - -# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the -# first line (until the first dot) of a Javadoc-style comment as the brief -# description. If set to NO, the Javadoc-style will behave just like regular Qt- -# style comments (thus requiring an explicit @brief command for a brief -# description.) -# The default value is: NO. - -JAVADOC_AUTOBRIEF = NO - -# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first -# line (until the first dot) of a Qt-style comment as the brief description. If -# set to NO, the Qt-style will behave just like regular Qt-style comments (thus -# requiring an explicit \brief command for a brief description.) -# The default value is: NO. 
- -QT_AUTOBRIEF = NO - -# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a -# multi-line C++ special comment block (i.e. a block of //! or /// comments) as -# a brief description. This used to be the default behavior. The new default is -# to treat a multi-line C++ comment block as a detailed description. Set this -# tag to YES if you prefer the old behavior instead. -# -# Note that setting this tag to YES also means that rational rose comments are -# not recognized any more. -# The default value is: NO. - -MULTILINE_CPP_IS_BRIEF = NO - -# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the -# documentation from any documented member that it re-implements. -# The default value is: YES. - -INHERIT_DOCS = YES - -# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce a -# new page for each member. If set to NO, the documentation of a member will be -# part of the file/class/namespace that contains it. -# The default value is: NO. - -SEPARATE_MEMBER_PAGES = NO - -# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen -# uses this value to replace tabs by spaces in code fragments. -# Minimum value: 1, maximum value: 16, default value: 4. - -TAB_SIZE = 4 - -# This tag can be used to specify a number of aliases that act as commands in -# the documentation. An alias has the form: -# name=value -# For example adding -# "sideeffect=@par Side Effects:\n" -# will allow you to put the command \sideeffect (or @sideeffect) in the -# documentation, which will result in a user-defined paragraph with heading -# "Side Effects:". You can put \n's in the value part of an alias to insert -# newlines. - -ALIASES = - -# This tag can be used to specify a number of word-keyword mappings (TCL only). -# A mapping has the form "name=value". For example adding "class=itcl::class" -# will allow you to use the command class in the itcl::class meaning. - -TCL_SUBST = - -# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources -# only. Doxygen will then generate output that is more tailored for C. For -# instance, some of the names that are used will be different. The list of all -# members will be omitted, etc. -# The default value is: NO. - -OPTIMIZE_OUTPUT_FOR_C = NO - -# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or -# Python sources only. Doxygen will then generate output that is more tailored -# for that language. For instance, namespaces will be presented as packages, -# qualified scopes will look different, etc. -# The default value is: NO. - -OPTIMIZE_OUTPUT_JAVA = NO - -# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran -# sources. Doxygen will then generate output that is tailored for Fortran. -# The default value is: NO. - -OPTIMIZE_FOR_FORTRAN = NO - -# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL -# sources. Doxygen will then generate output that is tailored for VHDL. -# The default value is: NO. - -OPTIMIZE_OUTPUT_VHDL = NO - -# Doxygen selects the parser to use depending on the extension of the files it -# parses. With this tag you can assign which parser to use for a given -# extension. Doxygen has a built-in mapping, but you can override or extend it -# using this tag. The format is ext=language, where ext is a file extension, and -# language is one of the parsers supported by doxygen: IDL, Java, Javascript, -# C#, C, C++, D, PHP, Objective-C, Python, Fortran, VHDL. 
For instance to make -# doxygen treat .inc files as Fortran files (default is PHP), and .f files as C -# (default is Fortran), use: inc=Fortran f=C. -# -# Note For files without extension you can use no_extension as a placeholder. -# -# Note that for custom extensions you also need to set FILE_PATTERNS otherwise -# the files are not read by doxygen. - -EXTENSION_MAPPING = - -# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments -# according to the Markdown format, which allows for more readable -# documentation. See http://daringfireball.net/projects/markdown/ for details. -# The output of markdown processing is further processed by doxygen, so you can -# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in -# case of backward compatibilities issues. -# The default value is: YES. - -MARKDOWN_SUPPORT = YES - -# When enabled doxygen tries to link words that correspond to documented -# classes, or namespaces to their corresponding documentation. Such a link can -# be prevented in individual cases by putting a % sign in front of the word -# or globally by setting AUTOLINK_SUPPORT to NO. -# The default value is: YES. - -AUTOLINK_SUPPORT = YES - -# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want -# to include (a tag file for) the STL sources as input, then you should set this -# tag to YES in order to let doxygen match functions declarations and -# definitions whose arguments contain STL classes (e.g. func(std::string); -# versus func(std::string) {}). This also make the inheritance and collaboration -# diagrams that involve STL classes more complete and accurate. -# The default value is: NO. - -BUILTIN_STL_SUPPORT = NO - -# If you use Microsoft's C++/CLI language, you should set this option to YES to -# enable parsing support. -# The default value is: NO. - -CPP_CLI_SUPPORT = NO - -# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: -# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen -# will parse them like normal C++ but will assume all classes use public instead -# of private inheritance when no explicit protection keyword is present. -# The default value is: NO. - -SIP_SUPPORT = NO - -# For Microsoft's IDL there are propget and propput attributes to indicate -# getter and setter methods for a property. Setting this option to YES will make -# doxygen to replace the get and set methods by a property in the documentation. -# This will only work if the methods are indeed getting or setting a simple -# type. If this is not the case, or you want to show the methods anyway, you -# should set this option to NO. -# The default value is: YES. - -IDL_PROPERTY_SUPPORT = YES - -# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC -# tag is set to YES, then doxygen will reuse the documentation of the first -# member in the group (if any) for the other members of the group. By default -# all members of a group must be documented explicitly. -# The default value is: NO. - -DISTRIBUTE_GROUP_DOC = NO - -# Set the SUBGROUPING tag to YES to allow class member groups of the same type -# (for instance a group of public functions) to be put as a subgroup of that -# type (e.g. under the Public Functions section). Set it to NO to prevent -# subgrouping. Alternatively, this can be done per class using the -# \nosubgrouping command. -# The default value is: YES. 
- -SUBGROUPING = YES - -# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions -# are shown inside the group in which they are included (e.g. using \ingroup) -# instead of on a separate page (for HTML and Man pages) or section (for LaTeX -# and RTF). -# -# Note that this feature does not work in combination with -# SEPARATE_MEMBER_PAGES. -# The default value is: NO. - -INLINE_GROUPED_CLASSES = NO - -# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions -# with only public data fields or simple typedef fields will be shown inline in -# the documentation of the scope in which they are defined (i.e. file, -# namespace, or group documentation), provided this scope is documented. If set -# to NO, structs, classes, and unions are shown on a separate page (for HTML and -# Man pages) or section (for LaTeX and RTF). -# The default value is: NO. - -INLINE_SIMPLE_STRUCTS = NO - -# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or -# enum is documented as struct, union, or enum with the name of the typedef. So -# typedef struct TypeS {} TypeT, will appear in the documentation as a struct -# with name TypeT. When disabled the typedef will appear as a member of a file, -# namespace, or class. And the struct will be named TypeS. This can typically be -# useful for C code in case the coding convention dictates that all compound -# types are typedef'ed and only the typedef is referenced, never the tag name. -# The default value is: NO. - -TYPEDEF_HIDES_STRUCT = NO - -# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This -# cache is used to resolve symbols given their name and scope. Since this can be -# an expensive process and often the same symbol appears multiple times in the -# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small -# doxygen will become slower. If the cache is too large, memory is wasted. The -# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range -# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 -# symbols. At the end of a run doxygen will report the cache usage and suggest -# the optimal cache size from a speed point of view. -# Minimum value: 0, maximum value: 9, default value: 0. - -LOOKUP_CACHE_SIZE = 0 - -#--------------------------------------------------------------------------- -# Build related configuration options -#--------------------------------------------------------------------------- - -# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in -# documentation are documented, even if no documentation was available. Private -# class members and static file members will be hidden unless the -# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. -# Note: This will also disable the warnings about undocumented members that are -# normally produced when WARNINGS is set to YES. -# The default value is: NO. - -EXTRACT_ALL = NO - -# If the EXTRACT_PRIVATE tag is set to YES all private members of a class will -# be included in the documentation. -# The default value is: NO. - -EXTRACT_PRIVATE = NO - -# If the EXTRACT_PACKAGE tag is set to YES all members with package or internal -# scope will be included in the documentation. -# The default value is: NO. - -EXTRACT_PACKAGE = NO - -# If the EXTRACT_STATIC tag is set to YES all static members of a file will be -# included in the documentation. -# The default value is: NO. 
- -EXTRACT_STATIC = YES - -# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) defined -# locally in source files will be included in the documentation. If set to NO -# only classes defined in header files are included. Does not have any effect -# for Java sources. -# The default value is: YES. - -EXTRACT_LOCAL_CLASSES = YES - -# This flag is only useful for Objective-C code. When set to YES local methods, -# which are defined in the implementation section but not in the interface are -# included in the documentation. If set to NO only methods in the interface are -# included. -# The default value is: NO. - -EXTRACT_LOCAL_METHODS = NO - -# If this flag is set to YES, the members of anonymous namespaces will be -# extracted and appear in the documentation as a namespace called -# 'anonymous_namespace{file}', where file will be replaced with the base name of -# the file that contains the anonymous namespace. By default anonymous namespace -# are hidden. -# The default value is: NO. - -EXTRACT_ANON_NSPACES = NO - -# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all -# undocumented members inside documented classes or files. If set to NO these -# members will be included in the various overviews, but no documentation -# section is generated. This option has no effect if EXTRACT_ALL is enabled. -# The default value is: NO. - -HIDE_UNDOC_MEMBERS = NO - -# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all -# undocumented classes that are normally visible in the class hierarchy. If set -# to NO these classes will be included in the various overviews. This option has -# no effect if EXTRACT_ALL is enabled. -# The default value is: NO. - -HIDE_UNDOC_CLASSES = NO - -# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend -# (class|struct|union) declarations. If set to NO these declarations will be -# included in the documentation. -# The default value is: NO. - -HIDE_FRIEND_COMPOUNDS = NO - -# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any -# documentation blocks found inside the body of a function. If set to NO these -# blocks will be appended to the function's detailed documentation block. -# The default value is: NO. - -HIDE_IN_BODY_DOCS = NO - -# The INTERNAL_DOCS tag determines if documentation that is typed after a -# \internal command is included. If the tag is set to NO then the documentation -# will be excluded. Set it to YES to include the internal documentation. -# The default value is: NO. - -INTERNAL_DOCS = NO - -# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file -# names in lower-case letters. If set to YES upper-case letters are also -# allowed. This is useful if you have classes or files whose names only differ -# in case and if your file system supports case sensitive file names. Windows -# and Mac users are advised to set this option to NO. -# The default value is: system dependent. - -CASE_SENSE_NAMES = YES - -# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with -# their full class and namespace scopes in the documentation. If set to YES the -# scope will be hidden. -# The default value is: NO. - -HIDE_SCOPE_NAMES = NO - -# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of -# the files that are included by a file in the documentation of that file. -# The default value is: YES. 
- -SHOW_INCLUDE_FILES = YES - -# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include -# files with double quotes in the documentation rather than with sharp brackets. -# The default value is: NO. - -FORCE_LOCAL_INCLUDES = NO - -# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the -# documentation for inline members. -# The default value is: YES. - -INLINE_INFO = YES - -# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the -# (detailed) documentation of file and class members alphabetically by member -# name. If set to NO the members will appear in declaration order. -# The default value is: YES. - -SORT_MEMBER_DOCS = YES - -# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief -# descriptions of file, namespace and class members alphabetically by member -# name. If set to NO the members will appear in declaration order. -# The default value is: NO. - -SORT_BRIEF_DOCS = NO - -# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the -# (brief and detailed) documentation of class members so that constructors and -# destructors are listed first. If set to NO the constructors will appear in the -# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. -# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief -# member documentation. -# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting -# detailed member documentation. -# The default value is: NO. - -SORT_MEMBERS_CTORS_1ST = NO - -# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy -# of group names into alphabetical order. If set to NO the group names will -# appear in their defined order. -# The default value is: NO. - -SORT_GROUP_NAMES = NO - -# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by -# fully-qualified names, including namespaces. If set to NO, the class list will -# be sorted only by class name, not including the namespace part. -# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. -# Note: This option applies only to the class list, not to the alphabetical -# list. -# The default value is: NO. - -SORT_BY_SCOPE_NAME = NO - -# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper -# type resolution of all parameters of a function it will reject a match between -# the prototype and the implementation of a member function even if there is -# only one candidate or it is obvious which candidate to choose by doing a -# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still -# accept a match between prototype and implementation in such cases. -# The default value is: NO. - -STRICT_PROTO_MATCHING = NO - -# The GENERATE_TODOLIST tag can be used to enable ( YES) or disable ( NO) the -# todo list. This list is created by putting \todo commands in the -# documentation. -# The default value is: YES. - -GENERATE_TODOLIST = YES - -# The GENERATE_TESTLIST tag can be used to enable ( YES) or disable ( NO) the -# test list. This list is created by putting \test commands in the -# documentation. -# The default value is: YES. - -GENERATE_TESTLIST = YES - -# The GENERATE_BUGLIST tag can be used to enable ( YES) or disable ( NO) the bug -# list. This list is created by putting \bug commands in the documentation. -# The default value is: YES. 
- -GENERATE_BUGLIST = YES - -# The GENERATE_DEPRECATEDLIST tag can be used to enable ( YES) or disable ( NO) -# the deprecated list. This list is created by putting \deprecated commands in -# the documentation. -# The default value is: YES. - -GENERATE_DEPRECATEDLIST= YES - -# The ENABLED_SECTIONS tag can be used to enable conditional documentation -# sections, marked by \if ... \endif and \cond -# ... \endcond blocks. - -ENABLED_SECTIONS = - -# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the -# initial value of a variable or macro / define can have for it to appear in the -# documentation. If the initializer consists of more lines than specified here -# it will be hidden. Use a value of 0 to hide initializers completely. The -# appearance of the value of individual variables and macros / defines can be -# controlled using \showinitializer or \hideinitializer command in the -# documentation regardless of this setting. -# Minimum value: 0, maximum value: 10000, default value: 30. - -MAX_INITIALIZER_LINES = 30 - -# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at -# the bottom of the documentation of classes and structs. If set to YES the list -# will mention the files that were used to generate the documentation. -# The default value is: YES. - -SHOW_USED_FILES = YES - -# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This -# will remove the Files entry from the Quick Index and from the Folder Tree View -# (if specified). -# The default value is: YES. - -SHOW_FILES = YES - -# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces -# page. This will remove the Namespaces entry from the Quick Index and from the -# Folder Tree View (if specified). -# The default value is: YES. - -SHOW_NAMESPACES = YES - -# The FILE_VERSION_FILTER tag can be used to specify a program or script that -# doxygen should invoke to get the current version for each file (typically from -# the version control system). Doxygen will invoke the program by executing (via -# popen()) the command input-file, where command is the value of the -# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided -# by doxygen. Whatever the program writes to standard output is used as the file -# version. For an example see the documentation. - -FILE_VERSION_FILTER = - -# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed -# by doxygen. The layout file controls the global structure of the generated -# output files in an output format independent way. To create the layout file -# that represents doxygen's defaults, run doxygen with the -l option. You can -# optionally specify a file name after the option, if omitted DoxygenLayout.xml -# will be used as the name of the layout file. -# -# Note that if you run doxygen from a directory containing a file called -# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE -# tag is left empty. - -LAYOUT_FILE = - -# The CITE_BIB_FILES tag can be used to specify one or more bib files containing -# the reference definitions. This must be a list of .bib files. The .bib -# extension is automatically appended if omitted. This requires the bibtex tool -# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info. -# For LaTeX the style of the bibliography can be controlled using -# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the -# search path. 
-# also \cite for info how to create references.
-
-CITE_BIB_FILES         =
-
-#---------------------------------------------------------------------------
-# Configuration options related to warning and progress messages
-#---------------------------------------------------------------------------
-
-# The QUIET tag can be used to turn on/off the messages that are generated to
-# standard output by doxygen. If QUIET is set to YES this implies that the
-# messages are off.
-# The default value is: NO.
-
-QUIET                  = NO
-
-# The WARNINGS tag can be used to turn on/off the warning messages that are
-# generated to standard error ( stderr) by doxygen. If WARNINGS is set to YES
-# this implies that the warnings are on.
-#
-# Tip: Turn warnings on while writing the documentation.
-# The default value is: YES.
-
-WARNINGS               = YES
-
-# If the WARN_IF_UNDOCUMENTED tag is set to YES, then doxygen will generate
-# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag
-# will automatically be disabled.
-# The default value is: YES.
-
-WARN_IF_UNDOCUMENTED   = YES
-
-# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
-# potential errors in the documentation, such as not documenting some parameters
-# in a documented function, or documenting parameters that don't exist or using
-# markup commands wrongly.
-# The default value is: YES.
-
-WARN_IF_DOC_ERROR      = YES
-
-# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
-# are documented, but have no documentation for their parameters or return
-# value. If set to NO doxygen will only warn about wrong or incomplete parameter
-# documentation, but not about the absence of documentation.
-# The default value is: NO.
-
-WARN_NO_PARAMDOC       = NO
-
-# The WARN_FORMAT tag determines the format of the warning messages that doxygen
-# can produce. The string should contain the $file, $line, and $text tags, which
-# will be replaced by the file and line number from which the warning originated
-# and the warning text. Optionally the format may contain $version, which will
-# be replaced by the version of the file (if it could be obtained via
-# FILE_VERSION_FILTER)
-# The default value is: $file:$line: $text.
-
-WARN_FORMAT            = "$file:$line: $text"
-
-# The WARN_LOGFILE tag can be used to specify a file to which warning and error
-# messages should be written. If left blank the output is written to standard
-# error (stderr).
-
-WARN_LOGFILE           =
-
-#---------------------------------------------------------------------------
-# Configuration options related to the input files
-#---------------------------------------------------------------------------
-
-# The INPUT tag is used to specify the files and/or directories that contain
-# documented source files. You may enter file names like myfile.cpp or
-# directories like /usr/src/myproject. Separate the files or directories with
-# spaces.
-# Note: If this tag is empty the current directory is searched.
-
-INPUT                  = core/framework core/lib/core core/platform core/public
-
-# This tag can be used to specify the character encoding of the source files
-# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
-# libiconv (or the iconv built into libc) for the transcoding. See the libiconv
-# documentation (see: http://www.gnu.org/software/libiconv) for the list of
-# possible encodings.
-# The default value is: UTF-8.
-
-INPUT_ENCODING         = UTF-8
-
-# If the value of the INPUT tag contains directories, you can use the
-# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
-# *.h) to filter out the source-files in the directories. If left blank the
-# following patterns are tested: *.c, *.cc, *.cxx, *.cpp, *.c++, *.java, *.ii,
-# *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, *.hh, *.hxx, *.hpp,
-# *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, *.m, *.markdown,
-# *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf,
-# *.qsf, *.as and *.js.
-
-FILE_PATTERNS          =
-
-# The RECURSIVE tag can be used to specify whether or not subdirectories should
-# be searched for input files as well.
-# The default value is: NO.
-
-RECURSIVE              = NO
-
-# The EXCLUDE tag can be used to specify files and/or directories that should be
-# excluded from the INPUT source files. This way you can easily exclude a
-# subdirectory from a directory tree whose root is specified with the INPUT tag.
-#
-# Note that relative paths are relative to the directory from which doxygen is
-# run.
-
-EXCLUDE                =
-
-# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
-# directories that are symbolic links (a Unix file system feature) are excluded
-# from the input.
-# The default value is: NO.
-
-EXCLUDE_SYMLINKS       = NO
-
-# If the value of the INPUT tag contains directories, you can use the
-# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
-# certain files from those directories.
-#
-# Note that the wildcards are matched against the file with absolute path, so to
-# exclude all test directories for example use the pattern */test/*
-
-EXCLUDE_PATTERNS       =
-
-# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
-# (namespaces, classes, functions, etc.) that should be excluded from the
-# output. The symbol name can be a fully qualified name, a word, or if the
-# wildcard * is used, a substring. Examples: ANamespace, AClass,
-# AClass::ANamespace, ANamespace::*Test
-#
-# Note that the wildcards are matched against the file with absolute path, so to
-# exclude all test directories use the pattern */test/*
-
-EXCLUDE_SYMBOLS        =
-
-# The EXAMPLE_PATH tag can be used to specify one or more files or directories
-# that contain example code fragments that are included (see the \include
-# command).
-
-EXAMPLE_PATH           =
-
-# If the value of the EXAMPLE_PATH tag contains directories, you can use the
-# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and
-# *.h) to filter out the source-files in the directories. If left blank all
-# files are included.
-
-EXAMPLE_PATTERNS       =
-
-# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
-# searched for input files to be used with the \include or \dontinclude commands
-# irrespective of the value of the RECURSIVE tag.
-# The default value is: NO.
-
-EXAMPLE_RECURSIVE      = NO
-
-# The IMAGE_PATH tag can be used to specify one or more files or directories
-# that contain images that are to be included in the documentation (see the
-# \image command).
-
-IMAGE_PATH             =
-
-# The INPUT_FILTER tag can be used to specify a program that doxygen should
-# invoke to filter for each input file. Doxygen will invoke the filter program
-# by executing (via popen()) the command:
-#
-# <filter> <input-file>
-#
-# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
-# name of an input file. Doxygen will then use the output that the filter
-# program writes to standard output. If FILTER_PATTERNS is specified, this tag
-# will be ignored.
-#
-# Note that the filter must not add or remove lines; it is applied before the
-# code is scanned, but not when the output code is generated. If lines are added
-# or removed, the anchors will not be placed correctly.
-
-INPUT_FILTER           =
-
-# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
-# basis. Doxygen will compare the file name with each pattern and apply the
-# filter if there is a match. The filters are a list of the form: pattern=filter
-# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
-# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
-# patterns match the file name, INPUT_FILTER is applied.
-
-FILTER_PATTERNS        =
-
-# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
-# INPUT_FILTER ) will also be used to filter the input files that are used for
-# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
-# The default value is: NO.
-
-FILTER_SOURCE_FILES    = NO
-
-# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
-# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and
-# it is also possible to disable source filtering for a specific pattern using
-# *.ext= (so without naming a filter).
-# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
-
-FILTER_SOURCE_PATTERNS =
-
-# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
-# is part of the input, its contents will be placed on the main page
-# (index.html). This can be useful if you have a project on for instance GitHub
-# and want to reuse the introduction page also for the doxygen output.
-
-USE_MDFILE_AS_MAINPAGE =
-
-#---------------------------------------------------------------------------
-# Configuration options related to source browsing
-#---------------------------------------------------------------------------
-
-# If the SOURCE_BROWSER tag is set to YES then a list of source files will be
-# generated. Documented entities will be cross-referenced with these sources.
-#
-# Note: To get rid of all source code in the generated output, make sure that
-# also VERBATIM_HEADERS is set to NO.
-# The default value is: NO.
-
-SOURCE_BROWSER         = NO
-
-# Setting the INLINE_SOURCES tag to YES will include the body of functions,
-# classes and enums directly into the documentation.
-# The default value is: NO.
-
-INLINE_SOURCES         = NO
-
-# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any
-# special comment blocks from generated source code fragments. Normal C, C++ and
-# Fortran comments will always remain visible.
-# The default value is: YES.
-
-STRIP_CODE_COMMENTS    = YES
-
-# If the REFERENCED_BY_RELATION tag is set to YES then for each documented
-# function all documented functions referencing it will be listed.
-# The default value is: NO.
-
-REFERENCED_BY_RELATION = NO
-
-# If the REFERENCES_RELATION tag is set to YES then for each documented function
-# all documented entities called/used by that function will be listed.
-# The default value is: NO.
-
-REFERENCES_RELATION    = NO
-
-# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set
-# to YES, then the hyperlinks from functions in REFERENCES_RELATION and
-# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will
-# link to the documentation.
-# The default value is: YES.
-
-REFERENCES_LINK_SOURCE = YES
-
-# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the
-# source code will show a tooltip with additional information such as prototype,
-# brief description and links to the definition and documentation. Since this
-# will make the HTML file larger and loading of large files a bit slower, you
-# can opt to disable this feature.
-# The default value is: YES.
-# This tag requires that the tag SOURCE_BROWSER is set to YES.
-
-SOURCE_TOOLTIPS        = YES
-
-# If the USE_HTAGS tag is set to YES then the references to source code will
-# point to the HTML generated by the htags(1) tool instead of doxygen built-in
-# source browser. The htags tool is part of GNU's global source tagging system
-# (see http://www.gnu.org/software/global/global.html). You will need version
-# 4.8.6 or higher.
-#
-# To use it do the following:
-# - Install the latest version of global
-# - Enable SOURCE_BROWSER and USE_HTAGS in the config file
-# - Make sure the INPUT points to the root of the source tree
-# - Run doxygen as normal
-#
-# Doxygen will invoke htags (and that will in turn invoke gtags), so these
-# tools must be available from the command line (i.e. in the search path).
-#
-# The result: instead of the source browser generated by doxygen, the links to
-# source code will now point to the output of htags.
-# The default value is: NO.
-# This tag requires that the tag SOURCE_BROWSER is set to YES.
-
-USE_HTAGS              = NO
-
-# If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a
-# verbatim copy of the header file for each class for which an include is
-# specified. Set to NO to disable this.
-# See also: Section \class.
-# The default value is: YES.
-
-VERBATIM_HEADERS       = YES
-
-#---------------------------------------------------------------------------
-# Configuration options related to the alphabetical class index
-#---------------------------------------------------------------------------
-
-# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all
-# compounds will be generated. Enable this if the project contains a lot of
-# classes, structs, unions or interfaces.
-# The default value is: YES.
-
-ALPHABETICAL_INDEX     = YES
-
-# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
-# which the alphabetical index list will be split.
-# Minimum value: 1, maximum value: 20, default value: 5.
-# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
-
-COLS_IN_ALPHA_INDEX    = 5
-
-# In case all classes in a project start with a common prefix, all classes will
-# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
-# can be used to specify a prefix (or a list of prefixes) that should be ignored
-# while generating the index headers.
-# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
-
-IGNORE_PREFIX          =
-
-#---------------------------------------------------------------------------
-# Configuration options related to the HTML output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_HTML tag is set to YES doxygen will generate HTML output
-# The default value is: YES.
-
-GENERATE_HTML          = NO
-
-# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it.
-# The default directory is: html.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_OUTPUT            = html
-
-# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each
-# generated HTML page (for example: .htm, .php, .asp).
-# The default value is: .html.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_FILE_EXTENSION    = .html
-
-# The HTML_HEADER tag can be used to specify a user-defined HTML header file for
-# each generated HTML page. If the tag is left blank doxygen will generate a
-# standard header.
-#
-# To get valid HTML the header file should include any scripts and style sheets
-# that doxygen needs, which depend on the configuration options used (e.g. the
-# setting GENERATE_TREEVIEW). It is highly recommended to start with a default
-# header using
-# doxygen -w html new_header.html new_footer.html new_stylesheet.css
-# YourConfigFile
-# and then modify the file new_header.html. See also section "Doxygen usage"
-# for information on how to generate the default header that doxygen normally
-# uses.
-# Note: The header is subject to change so you typically have to regenerate the
-# default header when upgrading to a newer version of doxygen. For a description
-# of the possible markers and block names see the documentation.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_HEADER            =
-
-# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
-# generated HTML page. If the tag is left blank doxygen will generate a standard
-# footer. See HTML_HEADER for more information on how to generate a default
-# footer and what special commands can be used inside the footer. See also
-# section "Doxygen usage" for information on how to generate the default footer
-# that doxygen normally uses.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_FOOTER            =
-
-# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
-# sheet that is used by each HTML page. It can be used to fine-tune the look of
-# the HTML output. If left blank doxygen will generate a default style sheet.
-# See also section "Doxygen usage" for information on how to generate the style
-# sheet that doxygen normally uses.
-# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as
-# it is more robust and this tag (HTML_STYLESHEET) will in the future become
-# obsolete.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_STYLESHEET        =
-
-# The HTML_EXTRA_STYLESHEET tag can be used to specify an additional user-
-# defined cascading style sheet that is included after the standard style sheets
-# created by doxygen. Using this option one can overrule certain style aspects.
-# This is preferred over using HTML_STYLESHEET since it does not replace the
-# standard style sheet and is therefore more robust against future updates.
-# Doxygen will copy the style sheet file to the output directory. For an example
-# see the documentation.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_EXTRA_STYLESHEET  =
-
-# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
-# other source files which should be copied to the HTML output directory. Note
-# that these files will be copied to the base HTML output directory. Use the
-# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
-# files. In the HTML_STYLESHEET file, use the file name only. Also note that the
-# files will be copied as-is; there are no commands or markers available.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_EXTRA_FILES       =
-
-# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
-# will adjust the colors in the stylesheet and background images according to
-# this color. Hue is specified as an angle on a colorwheel, see
-# http://en.wikipedia.org/wiki/Hue for more information. For instance the value
-# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
-# purple, and 360 is red again.
-# Minimum value: 0, maximum value: 359, default value: 220.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_COLORSTYLE_HUE    = 220
-
-# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
-# in the HTML output. For a value of 0 the output will use grayscales only. A
-# value of 255 will produce the most vivid colors.
-# Minimum value: 0, maximum value: 255, default value: 100.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_COLORSTYLE_SAT    = 100
-
-# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the
-# luminance component of the colors in the HTML output. Values below 100
-# gradually make the output lighter, whereas values above 100 make the output
-# darker. The value divided by 100 is the actual gamma applied, so 80 represents
-# a gamma of 0.8, the value 220 represents a gamma of 2.2, and 100 does not
-# change the gamma.
-# Minimum value: 40, maximum value: 240, default value: 80.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_COLORSTYLE_GAMMA  = 80
-
-# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
-# page will contain the date and time when the page was generated. Setting this
-# to NO can help when comparing the output of multiple runs.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_TIMESTAMP         = NO
-
-# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
-# documentation will contain sections that can be hidden and shown after the
-# page has loaded.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_DYNAMIC_SECTIONS  = NO
-
-# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
-# shown in the various tree structured indices initially; the user can expand
-# and collapse entries dynamically later on. Doxygen will expand the tree to
-# such a level that at most the specified number of entries are visible (unless
-# a fully collapsed tree already exceeds this amount). So setting the number of
-# entries 1 will produce a full collapsed tree by default. 0 is a special value
-# representing an infinite number of entries and will result in a full expanded
-# tree by default.
-# Minimum value: 0, maximum value: 9999, default value: 100.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_INDEX_NUM_ENTRIES = 100
-
-# If the GENERATE_DOCSET tag is set to YES, additional index files will be
-# generated that can be used as input for Apple's Xcode 3 integrated development
-# environment (see: http://developer.apple.com/tools/xcode/), introduced with
-# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a
-# Makefile in the HTML output directory. Running make will produce the docset in
-# that directory and running make install will install the docset in
-# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
-# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
-# for more information.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_DOCSET        = NO
-
-# This tag determines the name of the docset feed. A documentation feed provides
-# an umbrella under which multiple documentation sets from a single provider
-# (such as a company or product suite) can be grouped.
-# The default value is: Doxygen generated docs.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_FEEDNAME        = "Doxygen generated docs"
-
-# This tag specifies a string that should uniquely identify the documentation
-# set bundle. This should be a reverse domain-name style string, e.g.
-# com.mycompany.MyDocSet. Doxygen will append .docset to the name.
-# The default value is: org.doxygen.Project.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_BUNDLE_ID       = org.doxygen.Project
-
-# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
-# the documentation publisher. This should be a reverse domain-name style
-# string, e.g. com.mycompany.MyDocSet.documentation.
-# The default value is: org.doxygen.Publisher.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
-
-# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
-# The default value is: Publisher.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_PUBLISHER_NAME  = Publisher
-
-# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
-# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
-# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
-# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on
-# Windows.
-#
-# The HTML Help Workshop contains a compiler that can convert all HTML output
-# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
-# files are now used as the Windows 98 help format, and will replace the old
-# Windows help format (.hlp) on all Windows platforms in the future. Compressed
-# HTML files also contain an index, a table of contents, and you can search for
-# words in the documentation. The HTML workshop also contains a viewer for
-# compressed HTML files.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_HTMLHELP      = NO
-
-# The CHM_FILE tag can be used to specify the file name of the resulting .chm
-# file. You can add a path in front of the file if the result should not be
-# written to the html output directory.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-CHM_FILE               =
-
-# The HHC_LOCATION tag can be used to specify the location (absolute path
-# including file name) of the HTML help compiler ( hhc.exe). If non-empty
-# doxygen will try to run the HTML help compiler on the generated index.hhp.
-# The file has to be specified with full path.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-HHC_LOCATION           =
-
-# The GENERATE_CHI flag controls if a separate .chi index file is generated (
-# YES) or that it should be included in the master .chm file ( NO).
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-GENERATE_CHI           = NO
-
-# The CHM_INDEX_ENCODING is used to encode HtmlHelp index ( hhk), content ( hhc)
-# and project file content.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-CHM_INDEX_ENCODING     =
-
-# The BINARY_TOC flag controls whether a binary table of contents is generated (
-# YES) or a normal table of contents ( NO) in the .chm file.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-BINARY_TOC             = NO
-
-# The TOC_EXPAND flag can be set to YES to add extra items for group members to
-# the table of contents of the HTML help documentation and to the tree view.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-TOC_EXPAND             = NO
-
-# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
-# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that
-# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help
-# (.qch) of the generated HTML documentation.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_QHP           = NO
-
-# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify
-# the file name of the resulting .qch file. The path specified is relative to
-# the HTML output folder.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QCH_FILE               =
-
-# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
-# Project output. For more information please see Qt Help Project / Namespace
-# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace).
-# The default value is: org.doxygen.Project.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_NAMESPACE          = org.doxygen.Project
-
-# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
-# Help Project output. For more information please see Qt Help Project / Virtual
-# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual-
-# folders).
-# The default value is: doc.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_VIRTUAL_FOLDER     = doc
-
-# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
-# filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
-# filters).
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_CUST_FILTER_NAME   =
-
-# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
-# custom filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
-# filters).
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_CUST_FILTER_ATTRS  =
-
-# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
-# project's filter section matches. Qt Help Project / Filter Attributes (see:
-# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes).
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_SECT_FILTER_ATTRS  =
-
-# The QHG_LOCATION tag can be used to specify the location of Qt's
-# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
-# generated .qhp file.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHG_LOCATION           =
-
-# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
-# generated, together with the HTML files, they form an Eclipse help plugin. To
-# install this plugin and make it available under the help contents menu in
-# Eclipse, the contents of the directory containing the HTML and XML files needs
-# to be copied into the plugins directory of eclipse. The name of the directory
-# within the plugins directory should be the same as the ECLIPSE_DOC_ID value.
-# After copying Eclipse needs to be restarted before the help appears.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_ECLIPSEHELP   = NO
-
-# A unique identifier for the Eclipse help plugin. When installing the plugin
-# the directory name containing the HTML and XML files should also have this
-# name. Each documentation set should have its own identifier.
-# The default value is: org.doxygen.Project.
-# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.
-
-ECLIPSE_DOC_ID         = org.doxygen.Project
-
-# If you want full control over the layout of the generated HTML pages it might
-# be necessary to disable the index and replace it with your own. The
-# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top
-# of each HTML page. A value of NO enables the index and the value YES disables
-# it. Since the tabs in the index contain the same information as the navigation
-# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-DISABLE_INDEX          = NO
-
-# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
-# structure should be generated to display hierarchical information. If the tag
-# value is set to YES, a side panel will be generated containing a tree-like
-# index structure (just like the one that is generated for HTML Help). For this
-# to work a browser that supports JavaScript, DHTML, CSS and frames is required
-# (i.e. any modern browser). Windows users are probably better off using the
-# HTML help feature. Via custom stylesheets (see HTML_EXTRA_STYLESHEET) one can
-# further fine-tune the look of the index. As an example, the default style
-# sheet generated by doxygen has an example that shows how to put an image at
-# the root of the tree instead of the PROJECT_NAME. Since the tree basically has
-# the same information as the tab index, you could consider setting
-# DISABLE_INDEX to YES when enabling this option.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_TREEVIEW      = NO
-
-# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
-# doxygen will group on one line in the generated HTML documentation.
-#
-# Note that a value of 0 will completely suppress the enum values from appearing
-# in the overview section.
-# Minimum value: 0, maximum value: 20, default value: 4.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-ENUM_VALUES_PER_LINE   = 4
-
-# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
-# to set the initial width (in pixels) of the frame in which the tree is shown.
-# Minimum value: 0, maximum value: 1500, default value: 250.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-TREEVIEW_WIDTH         = 250
-
-# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open links to
-# external symbols imported via tag files in a separate window.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-EXT_LINKS_IN_WINDOW    = NO
-
-# Use this tag to change the font size of LaTeX formulas included as images in
-# the HTML documentation. When you change the font size after a successful
-# doxygen run you need to manually remove any form_*.png images from the HTML
-# output directory to force them to be regenerated.
-# Minimum value: 8, maximum value: 50, default value: 10.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-FORMULA_FONTSIZE       = 10
-
-# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
-# generated for formulas are transparent PNGs. Transparent PNGs are not
-# supported properly for IE 6.0, but are supported on all modern browsers.
-#
-# Note that when changing this option you need to delete any form_*.png files in
-# the HTML output directory before the changes have effect.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-FORMULA_TRANSPARENT    = YES
-
-# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
-# http://www.mathjax.org) which uses client side Javascript for the rendering
-# instead of using prerendered bitmaps. Use this if you do not have LaTeX
-# installed or if you want the formulas to look prettier in the HTML output.
-# When enabled you may also need to install MathJax separately and configure
-# the path to it using the MATHJAX_RELPATH option.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-USE_MATHJAX            = NO
-
-# When MathJax is enabled you can set the default output format to be used for
-# the MathJax output. See the MathJax site (see:
-# http://docs.mathjax.org/en/latest/output.html) for more details.
-# Possible values are: HTML-CSS (which is slower, but has the best
-# compatibility), NativeMML (i.e. MathML) and SVG.
-# The default value is: HTML-CSS.
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_FORMAT         = HTML-CSS
-
-# When MathJax is enabled you need to specify the location relative to the HTML
-# output directory using the MATHJAX_RELPATH option. The destination directory
-# should contain the MathJax.js script. For instance, if the mathjax directory
-# is located at the same level as the HTML output directory, then
-# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
-# Content Delivery Network so you can quickly see the result without installing
-# MathJax. However, it is strongly recommended to install a local copy of
-# MathJax from http://www.mathjax.org before deployment.
-# The default value is: http://cdn.mathjax.org/mathjax/latest.
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest
-
-# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
-# extension names that should be enabled during MathJax rendering. For example
-# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_EXTENSIONS     =
-
-# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
-# of code that will be used on startup of the MathJax code. See the MathJax site
-# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
-# example see the documentation.
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_CODEFILE       =
-
-# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
-# the HTML output. The underlying search engine uses javascript and DHTML and
-# should work on any modern browser. Note that when using HTML help
-# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
-# there is already a search function so this one should typically be disabled.
-# For large projects the javascript based search engine can be slow, then
-# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
-# search using the keyboard; to jump to the search box use <access key> + S
-# (what the <access key> is depends on the OS and browser, but it is typically
-# <CTRL>, <ALT>/